class TreeStats:
    """
    Determine tree-based statistics, such as tree depths, production counts,
    etc.

    After construction the instance exposes:
      corpus -- the list of tagged sentences that was parsed
      parser -- the StanfordParser instance used
      trees  -- the flat list of parse trees returned by the parser
      stats  -- one dict of raw counts per tree (see __init__)
    """

    # Default locations of the Stanford Parser 3.6.0 jars; used when the
    # caller does not supply explicit paths.
    DEFAULT_JAR = ('/usr/local/Cellar/stanford-parser/'
                   '3.6.0/libexec/stanford-parser.jar')
    DEFAULT_MODELS_JAR = ('/usr/local/Cellar/stanford-parser/'
                          '3.6.0/libexec/stanford-parser-3.6.0-models.jar')

    def __init__(self, corpus, path_to_jar=None, path_to_models_jar=None):
        """
        Parse `corpus` with the Stanford Parser and record per-tree counts.

        corpus -- either a list of tagged sentences or a single tagged
                  sentence. A corpus whose first element is a tuple is
                  treated as one sentence and wrapped in a list. An empty
                  corpus raises IndexError (the first element is inspected).
                  Presumably each sentence is a list of (token, tag) pairs,
                  as passed to `tagged_parse_sents` -- confirm with callers.
        path_to_jar, path_to_models_jar -- optional overrides for the parser
                  jar locations; default to DEFAULT_JAR / DEFAULT_MODELS_JAR.

        Each entry appended to `self.stats` has the keys 'depth'
        (tree.height()), 'noun_phrases', 'prepositional_phrases', 'sbars'
        (subtrees labelled NP / PP / SBAR) and 'nonterminals'
        (len(tree.productions())).
        """
        def n_productions(parse_tree, production):
            """
            Returns the number of subtrees of parse_tree whose label equals
            `production`.
            """
            # Count lazily instead of materialising the matching subtrees.
            return sum(1 for _ in parse_tree.subtrees(
                filter=lambda t: t.label() == production))

        jar = self.DEFAULT_JAR if path_to_jar is None else path_to_jar
        model = (self.DEFAULT_MODELS_JAR if path_to_models_jar is None
                 else path_to_models_jar)

        # A bare sentence (sequence of tuples) becomes a one-sentence corpus.
        self.corpus = [corpus] if isinstance(corpus[0], tuple) else corpus
        self.parser = StanfordParser(path_to_jar=jar,
                                     path_to_models_jar=model)

        parsed_sents = self.parser.tagged_parse_sents(self.corpus)
        # The parser yields an iterable of trees per sentence; flatten them.
        self.trees = [t for tree in parsed_sents for t in tree]

        self.stats = [
            {
                'depth': tree.height(),
                'noun_phrases': n_productions(tree, 'NP'),
                'prepositional_phrases': n_productions(tree, 'PP'),
                'sbars': n_productions(tree, 'SBAR'),
                'nonterminals': len(tree.productions()),
            }
            for tree in self.trees
        ]

    def get_stats(self):
        """
        Combines all the per-tree statistics into corpus-level figures.

        Returns a dict with 'max_tree_depth' plus the mean of every per-tree
        count ('avg_tree_depth', 'avg_noun_phrases',
        'avg_prepositional_phrases', 'avg_sbars', 'avg_nonterminals').

        Raises ValueError when there are no per-tree statistics to combine.
        """
        n = len(self.stats)
        if n == 0:
            raise ValueError('no parse trees available to compute statistics')

        def mean(key):
            # float() forces true division: under Python 2 an int / int
            # quotient would silently floor the average.
            return sum(stat[key] for stat in self.stats) / float(n)

        return {
            'max_tree_depth': max(stat['depth'] for stat in self.stats),
            'avg_tree_depth': mean('depth'),
            'avg_noun_phrases': mean('noun_phrases'),
            'avg_prepositional_phrases': mean('prepositional_phrases'),
            'avg_sbars': mean('sbars'),
            'avg_nonterminals': mean('nonterminals'),
        }
# Number of leading entries of `answers` that this script examines.
N_ANSWERS = 70159

# Tally "(" and ")" tokens per answer into stat[j]['l'] / stat[j]['r'].
# stat[j] is only touched when a parenthesis token is actually seen.
# NOTE(review): `stat` is defined elsewhere; stat[j] must yield a dict for
# unseen j (e.g. a defaultdict(dict)) -- confirm.
for j in range(N_ANSWERS):
    answer = answers[j]
    for token, tag in answer:
        if token == "(":
            stat[j]['l'] = stat[j].get('l', 0) + 1
        elif token == ")":
            stat[j]['r'] = stat[j].get('r', 0) + 1

# Keys of `stat` whose opening and closing parenthesis counts differ.
nm = [key for key, counts in stat.items()
      if counts.get('l', 0) != counts.get('r', 0)]

# Keep only the answers with balanced parentheses. Membership is tested
# against a set built once; testing against the list `nm` made this loop
# quadratic in the number of answers.
unbalanced = set(nm)
pure_answers = [answers[i] for i in range(N_ANSWERS) if i not in unbalanced]

# Show a random sample of 50 of the retained answers.
sample = random.sample(pure_answers, 50)
print(sample)

# Parse the first 100 retained answers and print the leaves of the first
# tree produced for each one.
trees = parser.tagged_parse_sents(pure_answers[0:100])
for tree in trees:
    t = next(tree)
    # t.draw()
    print(t.leaves())