def get_leaf_transitions():
    file_name = 'penn_leaf_transition_counts.data'
    try:
        f = open(os.path.join('cache', file_name), 'rb')
        data = pickle.load(f)
        f.close()
        return data
    except (IOError, EOFError):
        from tag_utils import is_valid_tag
        cmd_utils.log("Building leaf counts from Penn Treebank corpus", 1)
        f = open(os.path.join('cache', file_name), 'wb')
        for sentence in nltk.corpus.treebank.parsed_sents():
            # POS-level subtrees: the ones whose first child is a raw token
            leaves = list(sentence.subtrees(
                lambda x: len(x) > 0 and isinstance(x[0], basestring)))
            # Strip functional suffixes (e.g. NP-SBJ -> NP) and drop invalid tags
            leaves = [n.node.split("-")[0] for n in leaves
                      if is_valid_tag(n.node)]
            leaves = ['START'] + leaves
            store_transitions(leaves)
        cmd_utils.log("Finished building leaf counts", 1)
        pickle.dump(store_transitions._counts, f)
        f.close()
        return store_transitions._counts

def parse(text):
    log("Checking for coherence in '{0}'".format(text), 2)

    family_hits = []
    family_stem_words = stemmed_words(family_words, 'family_words')
    for sentence in sentence_tokenizer.parse(text):
        tree = parsers.parse(sentence)[0]
        family_hits += [(a_tree.node, a_tree[0].lower(),
                         stemmer.stem(a_tree[0].lower()) in family_stem_words)
                        for a_tree in tree.subtrees(lambda x: x.node in noun_tags)]
    log("Family hits: {0}".format(family_hits), 4)
    family_hit_values = (len([hit for hit in family_hits if hit[2]]),
                         len(family_hits))
    log("%d/%d" % family_hit_values, 3)

    work_hits = []
    work_stem_words = stemmed_words(work_words, 'work_words')
    for sentence in sentence_tokenizer.parse(text):
        tree = parsers.parse(sentence)[0]
        work_hits += [(a_tree.node, a_tree[0].lower(),
                       stemmer.stem(a_tree[0].lower()) in work_stem_words)
                      for a_tree in tree.subtrees(lambda x: x.node in noun_tags)]
    log("Work hits: {0}".format(work_hits), 4)
    work_hit_values = (len([hit for hit in work_hits if hit[2]]),
                       len(work_hits))
    log("%d/%d" % work_hit_values, 3)

    return family_hit_values[0], work_hit_values[0], work_hit_values[1]

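# NOTE: `stemmed_words` is called above but not defined in this section. A
# minimal sketch of what it plausibly does, assuming the module-level
# `stemmer` and the cache_utils helpers shown later in this section; the
# 'stems' cache key is hypothetical.
def stemmed_words(words, cache_name):
    stems = cache_utils.cache_get(cache_name, 'stems')
    if not stems:
        # Stem the topic word list once and cache it, so repeated grading
        # runs don't re-stem the same vocabulary
        stems = set(stemmer.stem(word.lower()) for word in words)
        cache_utils.cache_set(cache_name, 'stems', stems)
    return stems
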
def boost_for_sentence_tree(tree):
    weight = 1
    first_np = list(tree.subtrees(lambda x: x.node == "NP"))[0]
    has_pro = len(list(first_np.subtrees(lambda x: x.node in pers_pro_tags))) > 0
    if has_pro:
        log("BOOST: Starts with Pers Pronouns", 2)
        weight *= start_pers_pro_weight
    # @NOTE TOGGLE POINT
    # if tree[0].node == "S":
    #     weight *= 10
    return weight

def find_commanding_verb_tree(tree, steps=0):
    log("looking for verb at root: %s" % (tree.node,), 3)
    if tree.node in verb_tags:
        return (tree, steps)
    else:
        parent_node = tree.parent()
        if not parent_node:
            return None
        else:
            for sibling in parent_node:
                if sibling.node in verb_tags:
                    return (sibling, steps + 1)
                elif sibling.node == "VP":
                    return (list(sibling.subtrees(lambda x: x.node in verb_tags))[0],
                            steps + 2)
            return find_commanding_verb_tree(parent_node, steps + 1)

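# Example usage, assuming the pre-NLTK-3 tree API (`.node`, `.parent()`)
# used throughout this module and a `verb_tags` set of Penn verb tags:
#
#     from nltk.tree import ParentedTree
#     tree = ParentedTree.parse("(S (NP (NNS dogs)) (VP (VBP run)))")
#     noun = tree[0][0]                    # the NNS subtree for "dogs"
#     find_commanding_verb_tree(noun)      # -> (the VBP subtree, 3)
#
# The walk goes up from the noun until it finds a verb sibling (or a VP to
# descend into), returning None if it reaches the root without a match.
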
def get_treebank_rules(cutoff=0, include_counts=False):
    all_rules = cache_utils.cache_get('treebank_rules', 'rules')
    if not all_rules:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank
        all_rules = dict()
        for tree in treebank.parsed_sents():
            for rule, count in lexical_rules(tree).items():
                all_rules[rule] = all_rules.get(rule, 0) + count
        cache_utils.cache_set('treebank_rules', 'rules', all_rules)

    if include_counts:
        return {k: v for (k, v) in all_rules.items() if v > cutoff}
    else:
        return set(rule for rule, count in all_rules.items() if count > cutoff)

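# NOTE: `lexical_rules` isn't defined in this section. A minimal sketch of a
# compatible implementation, assuming it counts the lexical productions
# (tag -> word rewrites) evidenced by a parse tree; the exact rule
# representation is an assumption.
def lexical_rules(tree):
    counts = dict()
    for production in tree.productions():
        # Production.is_lexical() is true when the right-hand side is a
        # terminal, i.e. a POS tag rewriting to an actual word
        if production.is_lexical():
            counts[production] = counts.get(production, 0) + 1
    return counts
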
def parse(text):
    treebank_rules = get_treebank_rules(cutoff=0)
    sentence_probs = []
    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line)
        for sentence in sentences:
            # Add a period to the end of the sentence, which sometimes
            # forces a better parse
            #if sentence[-1] not in ('.', '!', '?'):
            #    sentence += '.'
            parse_trees = parsers.parse(sentence)
            for tree in parse_trees:
                if cmd_utils.cmd_log_level() > 2:
                    print tree.pprint()
                evidenced_lexical_rules = set(lexical_rules(tree).keys())
                differences = evidenced_lexical_rules.difference(treebank_rules)
                bad_generations = len(differences)
                log("Found {0} bad generations ({1})".format(bad_generations, differences), 3)
                #bad_parse_prob = 1 if prob == 0 else 0
                #log("Scored {0} for prob {1}".format(bad_parse_prob, prob), 3)
                bad_tag_problems = num_tag_problems(tree)
                log("Found {0} X or FRAG tags".format(bad_tag_problems), 3)
                bad_sbar_problems = num_sbar_problems(tree)
                log("Found {0} bad SBAR issues".format(bad_sbar_problems), 3)
                total_problems = bad_sbar_problems + bad_tag_problems + bad_generations
                log("In '{0}'".format(sentence), 2)
                log("Found {0} sentence formation problems".format(total_problems), 1)
                sentence_probs.append(total_problems)
    return sentence_probs

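# NOTE: `num_tag_problems` and `num_sbar_problems` aren't defined in this
# section. Sketches of compatible implementations, inferred from the log
# messages above; the SBAR heuristic in particular is an assumption.
def num_tag_problems(tree):
    # X and FRAG mark spots where the parser gave up or found a fragment
    return len(list(tree.subtrees(lambda x: x.node in ("X", "FRAG"))))


def num_sbar_problems(tree):
    # Count subordinate clauses (SBAR) that contain no S-like child, i.e.
    # a subordinator with no actual clause under it
    problems = 0
    for sbar in tree.subtrees(lambda x: x.node == "SBAR"):
        if not any(child.node in ("S", "SINV", "SBARQ") for child in sbar
                   if not isinstance(child, basestring)):
            problems += 1
    return problems
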
def cache_get(cache_name, cache_key):
    if cache_name not in mem_caches:
        file_name = cache_name + '.data'
        file_path = os.path.join('cache', file_name)
        # Create the cache file on first use; reading it back then fails
        # and falls through to an empty dict
        file_mode = "rb" if os.path.isfile(file_path) else "wb"
        f = open(file_path, file_mode)
        try:
            data = pickle.load(f)
        except (IOError, EOFError):
            data = dict()
        mem_caches[cache_name] = data
        f.close()

    try:
        rs = mem_caches[cache_name][cache_key]
        log('Cache Hit: %s[%s]' % (cache_name, cache_key), 5)
        return rs
    except KeyError:
        return None

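# For reference, a minimal sketch of the matching `cache_set`, assuming it
# mirrors cache_get: update the in-memory cache, then persist the whole
# named cache back to its pickle file.
def cache_set(cache_name, cache_key, value):
    if cache_name not in mem_caches:
        mem_caches[cache_name] = dict()
    mem_caches[cache_name][cache_key] = value
    f = open(os.path.join('cache', cache_name + '.data'), 'wb')
    pickle.dump(mem_caches[cache_name], f)
    f.close()
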
def get_transition_counts():
    file_name = 'penn_transition_counts.data'
    try:
        f = open(os.path.join('cache', file_name), 'rb')
        data = pickle.load(f)
        f.close()
        return data
    except (IOError, EOFError):
        cmd_utils.log("Building counts from Penn Treebank corpus", 1)
        f = open(os.path.join('cache', file_name), 'wb')
        for sentence in nltk.corpus.treebank.parsed_sents():
            all_transitions = tree_utils.transitions_in_tree(sentence)
            for transitions in all_transitions:
                transitions = ['START'] + transitions
                if len(transitions) > 1:
                    store_transitions(transitions)
        cmd_utils.log("Finished building tag counts", 1)
        pickle.dump(store_transitions._counts, f)
        f.close()
        return store_transitions._counts

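# NOTE: `store_transitions` isn't defined in this section, but both builders
# above call it and then read `store_transitions._counts`. A minimal sketch,
# assuming it accumulates n-gram tag-transition counts on a function
# attribute (the gram sizes are an assumption; parse_sentences below queries
# the counts with gram_size=3):
def store_transitions(transitions, max_gram_size=3):
    for size in range(2, max_gram_size + 1):
        for i in range(len(transitions) - size + 1):
            key = tuple(transitions[i:i + size])
            store_transitions._counts[key] = store_transitions._counts.get(key, 0) + 1
store_transitions._counts = dict()
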
def grade_1b(text):
    import agreement_utils
    rs = agreement_utils.parse(text)
    num_agreements, num_non_agreements, num_unsure = rs
    num_agreements_tested = sum(rs)
    if num_agreements_tested == 0:
        log("No possible agreements found in text", 2)
        return 0
    else:
        log("Sub Scores: %s" % (rs,), 2)
        prob = float(num_agreements) / num_agreements_tested
        log("%d/%d -> %f" % (num_agreements, num_agreements_tested, prob), 2)
        return floor(prob * 5)

    sentences = sentence_tokenizer.parse(text)
    num_sentences = len(sentences)
    if num_sentences >= 6:
        return 5
    else:
        return max(num_sentences - 1, 1)


if __name__ == '__main__':
    import cmd_utils
    tests = cmd_utils.cmd_test()
    tests = [tests] if tests else ('1a', '1b', '1d', '2a', '2b', '3a')
    essay_index = int(cmd_utils.cmd_arg('--essay', 0)) - 1
    for test in tests:
        if essay_index >= 0:
            essay_text = "\n".join(essay_utils.essays[essay_index])
            received_grade = grade_text(essay_text, test)
            log("Expect %s score: %d" % (test, correct_essay_grade(essay_index, test)), 0)
            log("Received %s score: %d" % (test, received_grade), 0)
        else:
            print "Values for %s" % (test,)
            print "-------------"
            for i in range(0, len(essay_utils.essays)):
                essay_text = "\n".join(essay_utils.essays[i])
                received_grade = grade_text(essay_text, test)
                expected_grade = correct_essay_grade(i, test)
                diff = received_grade - expected_grade
                print " | ".join([str(s) for s in
                                  [(i + 1), expected_grade, received_grade,
                                   diff, abs(diff)]])
            print "\n\n"

def issues_in_sentence(sentence, use_cache=True):
    """'Brute force' check for a bunch of possible word ordering issues.

    Specifically, looking for the following:
     - VP coming before NP in standard sentence
     - NP coming before VP in inverted sentence
     - JJ coming after Noun in NP
     - VB before PP in VP
     - VB before NP in VP
     - VP before S in standard sentence (with embedded sentences)
     - NN before CD in NP
     - NNP before CD in NP
    """
    if use_cache:
        result = cache_get('word_order_issues', sentence)
        if result is not None:
            return result

    tree = parsers.parse(sentence)[0]
    tree_utils.simplify_tree(tree,
                             trim_adjecent_prop_nouns=True,
                             normalize_sent_roots=True,
                             normalize_plural=True,
                             normalize_case=True)

    log("Looking for order issues in: %s" % (sentence,), 1)
    if cmd_log_level() >= 4:
        print "Simplified Parse Tree"
        print tree

    problems = []
    problems += ["VP->NP in S"] * num_forbidden_orders(tree, ("S",), ('VP', 'NP'))
    problems += ["NP->VP in SINV"] * num_forbidden_orders(tree, ('SINV',), ('NP', 'VP'))
    problems += ["NN->JJ in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'JP'))
    problems += ["PP->VB in VP"] * num_forbidden_orders(tree, ('VP',), ('PP', 'VB'))
    problems += ["NP->VP in VP"] * num_forbidden_orders(tree, ('VP',), ('NP', 'VP'))
    problems += ["S->VP in S"] * num_forbidden_orders(tree, ('S',), ('S', 'VP'))
    problems += ["S->VB in VP"] * num_forbidden_orders(tree, ('VP',), ('S', 'VB'))
    # problems += ["VB->VP in VP"] * num_forbidden_orders(tree, ('VP',), ('VB', 'VP'))
    problems += ["NP->RBR in ADVP"] * num_forbidden_orders(tree, ('ADVP',), ('NP', 'RBR'))
    problems += ["NN->DT in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'DT'))
    problems += ["NNP->DT in NP"] * num_forbidden_orders(tree, ('NP',), ('NNP', 'DT'))
    problems += ["NN->CD in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'CD'))
    problems += ["NNP->CD in NP"] * num_forbidden_orders(tree, ('NP',), ('NNP', 'CD'))
    problems += ['PP->NP in S'] * num_forbidden_orders(tree, ('S',), ('PP', 'NP'))
    # Toggle?
    problems += ['NP->VP in NP'] * num_forbidden_orders(tree, ('NP',), ('NP', 'VP'))
    # Seems like it should be VB->ADVP->PP
    problems += ['VB->PP->ADVP in VP'] * num_forbidden_orders(tree, ('VP',), ('VB', 'PP', 'ADVP'))
    problems += ['VB->PP->SBAR in VP'] * num_forbidden_orders(tree, ('VP',), ('VB', 'PP', 'SBAR'))
    problems += ['NP->S in NP'] * num_forbidden_orders(tree, ('NP',), ('NP', 'S'))
    # Seems like the ADJP should be in a NP or somewhere else, not a sibling
    # of a noun phrase
    problems += ['NP->ADJP in S'] * num_forbidden_orders(tree, ('S',), ('NP', 'ADJP'))

    # Last, if there is an S with only one child, we call it a word order problem
    problems += ['Single Child S'] * len(list(tree.subtrees(
        lambda x: x.node in tree_utils.semi_tree_roots and len(x) == 1)))

    if tree[0].node not in tree_utils.semi_tree_roots and not hasattr(tree[0], '_has_error'):
        tree[0]._has_error = True
        problems += ['No S Root']

    log("Found %d order issues" % (len(problems),), 1)
    log("Issues: %s" % (problems,), 2)
    if use_cache:
        cache_set('word_order_issues', sentence, problems)
    return problems

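# NOTE: `num_forbidden_orders` isn't defined in this section. A sketch
# consistent with the calls above, assuming it counts subtrees labeled with
# one of `parent_tags` whose children contain `child_tags` in order (not
# necessarily adjacent); the non-adjacency is an assumption.
def num_forbidden_orders(tree, parent_tags, child_tags):
    hits = 0
    for subtree in tree.subtrees(lambda x: x.node in parent_tags):
        child_nodes = [child.node for child in subtree
                       if not isinstance(child, basestring)]
        pos = 0
        for node in child_nodes:
            if node == child_tags[pos]:
                pos += 1
                if pos == len(child_tags):
                    hits += 1
                    break
    return hits
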
def is_possible_sentence(tree):
    """Perform some basic filtering to remove unlikely constructs, like
    starting a sentence with 'because'"""
    leaf_trees = tree.subtrees(lambda x: x.height() == 2)
    leaf_nodes = [n.node for n in leaf_trees]

    if leaf_nodes[0] in invalid_boundary_tags:
        log("Rejecting sentence because it starts with an invalid boundary tag: %s" % (leaf_nodes[0],), 3)
        return False
    elif leaf_nodes[-1] in invalid_boundary_tags:
        log("Rejecting sentence because it ends with an invalid boundary tag: %s" % (leaf_nodes[-1],), 3)
        return False
    elif leaf_nodes[0] == "PP":
        log("Rejecting sentence because it starts with PP", 3)
        return False
    else:
        flatten_tags = []
        useful_roots = list(tree.subtrees(
            lambda x: (x.node in semi_tree_roots) and len(x) > 1))
        if len(useful_roots) == 0 or len(useful_roots[0]) < 2:
            log("Rejecting sentence because can't find a useful root", 3)
            return False
        sub_tree = useful_roots[0]
        for sub_sub_tree in sub_tree:
            flatten_tags.append(tag_utils.simplify_tag(sub_sub_tree.node))

        sen_is_inverted = tree[0].node == "SINV"
        if sen_is_inverted:
            early_set = ("VP", "VB")
            late_set = ("NP",)
        else:
            early_set = ("NP", "NN")
            late_set = ("VP", "ADJP")

        try:
            earliest_index = min([flatten_tags.index(tag)
                                  for tag in early_set if tag in flatten_tags])
            latest_index = max([flatten_tags.index(tag)
                                for tag in late_set if tag in flatten_tags])
            if earliest_index > latest_index:
                if sen_is_inverted:
                    log("Rejecting possible sentence because earliest NP like tag occurs before earliest VP like tag (%d vs %d) and sentence parses SINV" % (earliest_index, latest_index), 3)
                else:
                    log("Rejecting possible sentence because earliest VP like tag occurs before earliest NP like tag (%d vs %d)" % (earliest_index, latest_index), 3)
                return False
            else:
                return True
        except ValueError:
            log("Rejecting possible sentence because the head structure doesn't look like a valid parse", 3)
            return False

def parse_sentences(line, use_cache=True, include_prob=False):
    log("Working on: %s" % (line,), 2)

    if use_cache:
        correct_parse = cache_get("sentence_tokenizer", line)
        if correct_parse:
            log("Cache Hit: %s" % (correct_parse[0],), 4)
            log("-------------\n", 4)
            return correct_parse if include_prob else correct_parse[0]

    all_possible_sentences = _possible_sentences_in_line(line)
    all_possible_sentence_probs = []
    invalid_possible_sentences = []
    stored_probs = {}

    for possible_sentences in all_possible_sentences:
        log("Examining: %s" % (possible_sentences,), 1)
        prob_for_sentences = []
        sent_is_impossible = False
        for possible_sentence in possible_sentences:
            if use_cache:
                possible_sentence_prob = cache_get('possible_sentences', possible_sentence)
                if possible_sentence_prob is not None:
                    log("Cache Hit: %s (from %s)" % (possible_sentence, 'possible sentences'), 4)
                    prob_for_sentences.append(possible_sentence_prob)
                    continue

            if contains_any_invalid_setences(possible_sentences, invalid_possible_sentences) or sent_is_impossible:
                prob_for_sentences.append(0)
                continue
            elif possible_sentence in stored_probs:
                prob_for_sentences.append(stored_probs[possible_sentence])
                continue

            sentence_trees = parsers.parse(possible_sentence)
            if len(sentence_trees) == 0:
                log("Wasn't able to parse input %s" % (possible_sentence,), 0)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                continue
            else:
                sentence_tree = sentence_trees[0]

            if cmd_log_level() >= 4:
                print "--------"
                print "Pre Simplified Tree"
                print sentence_tree

            tree_utils.simplify_tree(sentence_tree,
                                     remove_starting_cc=possible_sentences.index(possible_sentence) == 0)

            if cmd_log_level() >= 4:
                print "--------"
                print "Post Simplified Tree"
                print sentence_tree

            sentence_transitions = tree_utils.transitions_in_tree(sentence_tree)

            if not is_possible_sentence(sentence_tree):
                log("%s" % (sentence_transitions,), 2)
                log("Invalid parse", 2)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, 0)
            else:
                log("%s" % (sentence_transitions,), 2)
                sentence_probs = []
                for transition in sentence_transitions:
                    try:
                        probs = hmm_utils.prob_of_all_transitions(transition, counts, gram_size=3)
                    except KeyError as e:
                        log("'Impossible' Tag order", 2, sep=' ** ')
                        log("%s" % (e,), 2, sep=' ** ')
                        probs = [0]
                    sentence_probs += probs
                    log("Transitions: %s" % (transition,), 3)
                    log("Probabilities: %s" % (probs,), 3)

                attempt_sentence_prob = prod(sentence_probs)
                sentence_prob_boost = boost_for_sentence_tree(sentence_tree)
                attempt_sentence_prob *= sentence_prob_boost
                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, attempt_sentence_prob)

        weighted_score = prod(prob_for_sentences) * (weight ** (len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences,), 2)
            log(weighted_score, 2)
        # Keep this list parallel to all_possible_sentences, so the index
        # lookup below stays valid
        all_possible_sentence_probs.append(weighted_score)

    max_prob = max(all_possible_sentence_probs)
    parse_for_max_prob = all_possible_sentences[all_possible_sentence_probs.index(max_prob)]
    log("All Probs: %s" % (all_possible_sentence_probs,), 2)
    log("MAX Prob: %f" % (max_prob,), 2)
    log("Parse for max prob: %s" % (parse_for_max_prob,), 2)
    log("Best Guess Num Sentences: %d" % (len(parse_for_max_prob),), 1)
    log("-------------\n\n", 1)
    if use_cache:
        cache_set("sentence_tokenizer", line, (parse_for_max_prob, max_prob))
    return (parse_for_max_prob, max_prob) if include_prob else parse_for_max_prob

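# NOTE: `prod` isn't defined in this section; presumably the usual
# product-of-a-sequence helper. A minimal sketch (the 1.0 initializer means
# an empty sequence doesn't zero out a score):
from operator import mul

def prod(numbers):
    return reduce(mul, numbers, 1.0)
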
def parse(text, use_cache=True):
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0

    lines = text.split("\n")
    for line in lines:
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:
            line_agreements, line_non_agreements, line_unsure = 0, 0, 0
            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            if sentence[-1] != ".":
                sentence += "."

            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement', sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue

            log("Looking for Sub-Verb agreement in '%s'" % (sentence,), 1)
            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [dep for dep in dependencies if dep['dep_name'] == 'nsubj']
            if len(sub_verb_deps) == 0:
                log("Couldn't find Subject-Verb dependency info", 1)
                cache_utils.cache_set('sub_verb_agreement', sentence, (0, 0, 0))
                continue

            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:
                    log("First Dep Node: %s" % (first_node,), 2)
                    log("Sec Dep Node: %s" % (sec_node,), 2)
                    try:
                        is_agreement = check_node_agreement(first_node, sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement,), 1)
                    except Exception as e:
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (e.message,), 2)
                        # No agreement in pair. Not sure how to handle.
                        # More exhaustive search?

            if use_cache:
                cache_utils.cache_set('sub_verb_agreement', sentence,
                                      (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure

    return num_agrees, num_not_agrees, num_unsure

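# NOTE: `node_in_tree` isn't defined in this section. A minimal sketch,
# assuming it maps a word from the dependency output back to the POS-level
# subtree whose leaf is that word (returning None when no leaf matches):
def node_in_tree(tree, word):
    for subtree in tree.subtrees(lambda x: x.height() == 2):
        if subtree[0].lower() == word.lower():
            return subtree
    return None
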
def parse(text):
    # Strip numbers out, since that seems to cause problems for my approach
    text = re.sub(r'\d+ ?', 'some ', text)
    sentences = sentence_tokenizer.parse(text)
    sentence_pronouns = []
    for sentence in sentences:
        log("Looking for pronouns in '{0}'".format(sentence), 2)
        pronoun_totals = [[], [], []]
        tree = parsers.parse(sentence)[0]
        pronoun_trees = tree.subtrees(lambda x: x.node in pronoun_tags)
        for pronoun_tree in pronoun_trees:
            # Total up the first, second, and third person pronouns
            for i in range(3):
                if pronoun_tree[0].lower() in pronouns[i]:
                    pronoun_totals[i].append(pronoun_tree[0])
        log("First Person '{0}'".format(pronoun_totals[0]), 3)
        log("Second Person '{0}'".format(pronoun_totals[1]), 3)
        log("Third Person '{0}'".format(pronoun_totals[2]), 3)
        sentence_pronouns.append(pronoun_totals)
    log("Pronouns found in text: %s" % (sentence_pronouns,), 2)

    # If there are 3rd person pronouns in any sentence, we have to decide
    # if they are used correctly. We do this in the following, very
    # expensive, but possibly correct manner.
    #
    # Start from the top down
    # 1. Look back 2 sentences and see if we can find a reference.
    #    IF NOT - it's an error, and do no more
    # 2. If so, replace the referenced word with "RUNNING"
    #    and search again, to see if there is a previous word it could refer
    #    to.
    #    IF NOT, it's correct. Replace the pronoun with the referenced word
    #    and continue
    # 3. Else, it's not felicitous. Give bad credit
    for i in range(len(sentences)):
        if len(sentence_pronouns[i][2]) > 0:
            pronoun_results = []
            for third_pronoun in sentence_pronouns[i][2]:
                all_sentences = sentences[max(0, i - 2):i + 1]
                norm_sentences = ". ".join([a_sen.strip(".") for a_sen in all_sentences]) + "."
                log("Looking for pronoun coherence for '{0}'".format(norm_sentences), 4)
                pronouns_refs = parsers.parse_coref(norm_sentences)
                log("Received co-references {0}".format(pronouns_refs), 5)

                found_bundle = False
                for j in range(len(pronouns_refs)):
                    if third_pronoun == pronouns_refs[j]['pronoun']:
                        found_bundle = pronouns_refs[j]
                        break

                if not found_bundle:
                    log("Found NO antecedent for {0}".format(third_pronoun), 3)
                    pronoun_results.append((third_pronoun, -1))
                else:
                    log("Found antecedent for {0}".format(third_pronoun), 3)
                    ref_index = int(found_bundle['ref_sentence']) - 1 + (i - 2)
                    sentences[ref_index] = sentences[ref_index].replace(found_bundle['ref'], 'RUNNING')
                    log("Replacing '{0}' with 'RUNNING'".format(found_bundle['ref']), 3)

                    altered_sentences = sentences[max(0, i - 2):i + 1]
                    norm_altered_sentences = ". ".join([a_sen.strip(".") for a_sen in altered_sentences]) + "."
                    log("New test sentences are '{0}'".format(norm_altered_sentences), 4)
                    altered_pronouns_refs = parsers.parse_coref(norm_altered_sentences)

                    if third_pronoun not in [a_ref['pronoun'] for a_ref in altered_pronouns_refs]:
                        log("Antecedent is unambiguous!", 3)
                        pro_index = int(found_bundle['pronoun_sentence']) - 1 + (i - 2)
                        sentences[pro_index] = sentences[pro_index].replace(found_bundle['pronoun'], found_bundle['ref'])
                        pronoun_results.append((third_pronoun, found_bundle['ref']))
                    else:
                        log("Antecedent is ambiguous", 3)
                        log("New Sentences: {0}".format(altered_pronouns_refs), 4)
                        pronoun_results.append((third_pronoun, .5))
            sentence_pronouns[i][2] = pronoun_results
    return sentence_pronouns

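# For reference, the key accesses above imply that `parsers.parse_coref`
# returns bundles shaped like the following (hypothetical values; sentence
# numbers are 1-indexed strings, per the `int(...) - 1` arithmetic above):
#
#     [{'pronoun': 'he',            # the referring pronoun
#       'pronoun_sentence': '2',    # sentence containing the pronoun
#       'ref': 'John',              # the antecedent it resolves to
#       'ref_sentence': '1'}]       # sentence containing the antecedent
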
def simplify_tree(tree,
                  remove_starting_cc=False,
                  trim_adjecent_prop_nouns=False,
                  normalize_sent_roots=False,
                  normalize_case=False,
                  normalize_plural=False,
                  collapse_redundant_sbar=True):
    """Do some transformations on a parse tree to normalize it. Currently:
     - Remove CD when they're paired with a noun
     - Strip conjunctions from the beginning of the sentence / root of the tree
     - Remove proper nouns that are next to each other
    """
    if normalize_plural:
        plural_transforms = dict(NNS="NN", NNPS="NNP")
        for a_tree in tree.subtrees(lambda x: x.node in plural_transforms):
            a_tree.node = plural_transforms[a_tree.node]

    if normalize_case:
        case_transforms = dict(VBD="VB", VBG="VB", VBN="VB", VBP="VB", VBZ="VB")
        for a_tree in tree.subtrees(lambda x: x.node in case_transforms):
            a_tree.node = case_transforms[a_tree.node]

    if normalize_sent_roots:
        for a_tree in tree.subtrees(lambda x: x.node in semi_tree_roots):
            a_tree.node = "S"

    if trim_adjecent_prop_nouns:
        np_trees = list(tree.subtrees(lambda x: x.node == "NP"))
        for np_tree in np_trees:
            num_leaves = len(np_tree)
            change = False
            if num_leaves >= 2:
                # Only remove one NNP per pass, since removal shifts indexes
                for i in range(0, num_leaves - 1):
                    if not change:
                        if np_tree[i].node == "NNP" and np_tree[i + 1].node == "NNP":
                            np_tree.remove(np_tree[i + 1])
                            change = True

    if remove_starting_cc:
        useful_roots = list(tree.subtrees(lambda x: x.node in semi_tree_roots))
        if len(useful_roots) > 0:
            useful_root = useful_roots[0]
            if useful_root[0].node == "CC":
                useful_root.remove(useful_root[0])
                log("REMOVED CC from start of sentence", 2)

    cd_trees = tree.subtrees(lambda x: x.node == "CD")
    for cd_tree in cd_trees:
        parent_node = cd_tree.parent()
        if parent_node.node == "NP":
            parent_node_children = [parent_node[i].node for i in range(0, len(parent_node))]
            if "CD" in parent_node_children and ("NN" in parent_node_children or "NNS" in parent_node_children):
                parent_node.remove(cd_tree)
                log("REMOVED only child CD node", 2)

    if collapse_redundant_sbar:
        # An SBAR whose only child is an S-like node adds nothing; splice
        # the child into the SBAR's place
        for sbar_tree in tree.subtrees(lambda x: x.node == "SBAR"):
            if len(sbar_tree) == 1 and sbar_tree[0].node in semi_tree_roots:
                sbar_child = sbar_tree[0]
                sbar_parent = sbar_tree.parent()
                index = sbar_parent.index(sbar_tree)
                sbar_parent.remove(sbar_tree)
                sbar_tree.remove(sbar_child)
                sbar_parent.insert(index, sbar_child)
                log("Collapsed SBAR", 2)

def check_node_agreement(tree_one, tree_two):
    # First determine which node is the noun node
    if tree_one.node in noun_tags and tree_two.node in noun_tags:
        best_pair = select_best_noun_verb(tree_one, tree_two)
        if best_pair:
            noun_tree, verb_tree = best_pair
        else:
            return False
    elif tree_one.node in noun_tags:
        noun_tree, verb_tree = tree_one, tree_two
    elif tree_two.node in noun_tags:
        verb_tree, noun_tree = tree_one, tree_two
    else:
        raise Exception("No noun tree in this agreement pair!")

    if noun_tree.node in singluar_noun_tags:
        noun_3rd_person = True
        noun_singular = True
    elif noun_tree.node in plural_noun_tags:
        noun_3rd_person = True
        noun_singular = False
    # In a pronoun situation, we need to disambiguate
    elif noun_tree.node == "PRP":
        noun_3rd_person = not is_pronoun_first_person(noun_tree)
        noun_singular = is_pronoun_singluar(noun_tree)
    else:
        raise Exception("Received some unrecognized noun tag: %s" % (noun_tree.node,))

    if verb_tree.node not in verb_tags:
        closest_verb_tree = find_commanding_verb_tree(verb_tree)
        if closest_verb_tree:
            verb_tree = closest_verb_tree[0]
    if verb_tree.node not in verb_tags:
        raise Exception("No verb in this agreement pair!")

    if verb_tree.node in singular_verb_tags:
        verb_singular = True
    elif verb_tree.node in plural_verb_tags:
        verb_singular = False
    else:
        verb_singular = True

    log("Noun: Looks like '%s-%s' is %s (%s)" % (
        noun_tree[0], noun_tree.node,
        'Singular' if noun_singular else 'Plural',
        "3rd" if noun_3rd_person else "1st"), 2)
    log("Verb: Looks like '%s-%s' is %s" % (
        verb_tree[0], verb_tree.node,
        'Singular' if verb_singular else 'Plural'), 2)

    noun_1st_person = not noun_3rd_person
    is_vbp = verb_tree.node == "VBP"
    is_vbz = verb_tree.node == "VBZ"
    if verb_tree.node in general_verb_tags:
        return True
    elif noun_singular and noun_1st_person and is_vbp:
        return True
    elif noun_singular and noun_3rd_person and is_vbz:
        return True
    elif not noun_singular and noun_3rd_person and is_vbp:
        return True
    else:
        log("DONT LIKE COMBO: %s" % ({"verb_tag": verb_tree.node,
                                      "noun_1st_person": noun_1st_person,
                                      "noun_singular": noun_singular},), 2)
        return False

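# NOTE: `is_pronoun_first_person` and `is_pronoun_singluar` (spelled as
# called above) aren't defined in this section. Minimal sketches, with
# hypothetical pronoun lists:
def is_pronoun_first_person(tree):
    return tree[0].lower() in ('i', 'me', 'we', 'us', 'myself', 'ourselves')


def is_pronoun_singluar(tree):
    return tree[0].lower() in ('i', 'me', 'he', 'she', 'it', 'him', 'her',
                               'myself', 'himself', 'herself', 'itself')
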
if __name__ == '__main__':
    # Simple method for testing from STDIN
    if use_stdin:
        print parse_sentences(cmd_utils.get_stdin())