def parse(sentence, use_cache=True, parser='stanford'):
    cache_key = "parse_trees_{0}".format(parser)
    valid_lines = None
    if use_cache:
        cache_attempt = cache_get(cache_key, sentence)
        if cache_attempt:
            valid_lines = cache_attempt
    if valid_lines is None:
        if parser == "stanford":
            response = parse_stanford(sentence, use_cache=use_cache)
        elif parser == "malt":
            response = parse_malt(sentence, use_cache=use_cache)
        else:
            return []
        valid_lines = [line for line in response.split("\n")
                       if len(line) > 2 and line[0] == "(" and line[-1] == ")"]
        if use_cache:
            cache_set(cache_key, sentence, valid_lines)
    # throw away the garbage we don't want from the parser's response.
    # this could probably get us in trouble since it'll hide errors etc,
    # but we got deadlines....
    trees = [ParentedTree.parse(line) for line in valid_lines]
    return trees

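# Usage sketch (illustrative only; the sentence below is made up, not from the
# original project). Each element of the result is an NLTK ParentedTree built
# from one bracketed line of the backend parser's output; an unknown `parser`
# value returns [].
#
#     trees = parse("The dog chased the cat.", parser='stanford')
#     if trees:
#         print trees[0]
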
def parse_coref(sentence, use_cache=True):
    cache_key = "coreferences"
    if use_cache:
        cache_attempt = cache_get(cache_key, sentence)
        if cache_attempt is not None:
            return cache_attempt
    import re
    if not hasattr(parse_coref, '_regex'):
        parse_coref._regex = re.compile(
            r'\((?P<pronoun_sentence>\d+),(?P<pronoun_loc>\d+),.*?'
            r'-> \((?P<ref_sentence>\d+),(?P<ref_loc>\d+).*?'
            r'"(?P<pronoun>.*?)" -> "(?P<ref>.*?)"')
    parser_dir = os.path.realpath(os.path.join("contrib", "stanford-corenlp"))
    cmd_parts = ('java', '-Xmx3g', '-cp', parser_dir + "/*:",
                 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                 '-annotators', 'tokenize,ssplit,pos,lemma,ner,parse,dcoref',
                 'dcoref.logFile', '/dev/null', '-')
    response = _exec_cmd(cmd_parts, sentence)
    matches = []
    start = False
    for line in response.split("\n"):
        if line == "NLP> Annotation pipeline timing information:":
            break
        elif line == "Coreference set:":
            start = True
        elif start:
            a_match = parse_coref._regex.search(line.strip())
            if a_match:
                matches.append(a_match.groupdict())
    if use_cache:
        cache_set(cache_key, sentence, matches)
    return matches

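# Usage sketch (illustrative; the sentence and output values below are made
# up, not taken from a real CoreNLP run). Each match is the regex's
# groupdict(), so every value is a string:
#
#     for match in parse_coref("Alice said she was tired."):
#         # e.g. {'pronoun': 'she', 'ref': 'Alice',
#         #       'pronoun_sentence': '1', 'pronoun_loc': '3',
#         #       'ref_sentence': '1', 'ref_loc': '1'}
#         print match['pronoun'], '->', match['ref']
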
def stemmed_words(words, key=False, use_cache=False):
    if use_cache:
        stemmed_wordbank = cache_get('stemmed_wordbank', 'words_{0}'.format(key))
        if stemmed_wordbank is not None:
            return stemmed_wordbank
    words = set([stemmer.stem(a_word) for a_word in build_words(words, key)])
    if use_cache:
        cache_set('stemmed_wordbank', 'words_{0}'.format(key), words)
    return words

def get_treebank_rules(cutoff=0, include_counts=False):
    all_rules = cache_utils.cache_get('treebank_rules', 'rules')
    if not all_rules:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank
        all_rules = dict()
        for tree in treebank.parsed_sents():
            for rule, count in lexical_rules(tree).items():
                all_rules[rule] = all_rules.get(rule, 0) + count
        cache_utils.cache_set('treebank_rules', 'rules', all_rules)
    if include_counts:
        return {k: v for (k, v) in all_rules.items() if v > cutoff}
    else:
        rules_set = set([rule for rule, count in all_rules.items()
                         if count > cutoff])
        return rules_set

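# Usage sketch (illustrative; the cutoff value is arbitrary). The first call
# pays the cost of walking every Penn Treebank parse; later calls hit the
# cache:
#
#     common_rules = get_treebank_rules(cutoff=5)            # set of rules seen > 5 times
#     rule_counts = get_treebank_rules(include_counts=True)  # dict of rule -> count
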
def build_words(start_words, key=None, use_cache=True, max_depth=3):
    if use_cache:
        wordbank = cache_get('wordbank', 'words_{0}'.format(key))
        if wordbank is not None:
            return wordbank
    words = set()
    for start_word in start_words:
        words.add(start_word)
        for wb_word in wordnet.synsets(start_word, wordnet.NOUN):
            for method in methods:
                new_words = getattr(wb_word, method)()
                for a_word in new_words:
                    words.add(a_word.name.split(".")[0].replace("_", "-"))
                    if max_depth > 0:
                        words.update(build_words([a_word.name], key=None,
                                                 use_cache=False,
                                                 max_depth=(max_depth - 1)))
                    # for lemma in a_word.lemmas:
                    #     words.update(build_words(lemma.name.split(".")[-1].replace("_", "-"), use_cache=False, max_depth=(max_depth-1)))
    if use_cache:
        cache_set('wordbank', 'words_{0}'.format(key), words)
    return words

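# Usage sketch (illustrative; the start words and key are made up). Assumes
# `methods` is the module-level list of WordNet synset relations that the loop
# above walks, and that results are cached under the given key:
#
#     school_words = build_words(['school', 'teacher'], key='education')
#     # -> a set of related noun forms, e.g. {'school', 'schoolhouse', ...}
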
def parse(text, use_cache=True):
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0
    lines = text.split("\n")
    for line in lines:
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:
            line_agreements, line_non_agreements, line_unsure = 0, 0, 0
            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            if sentence[-1] != ".":
                sentence += "."
            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement', sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue
            log("Looking for Sub-Verb agreement in '%s'" % (sentence,), 1)
            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [dep for dep in dependencies
                             if dep['dep_name'] == 'nsubj']
            if len(sub_verb_deps) == 0:
                log("Couldn't find Subject-Verb dependency info", 1)
                cache_utils.cache_set('sub_verb_agreement', sentence, (0, 0, 0))
                continue
            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:
                    log("First Dep Node: %s" % (first_node,), 2)
                    log("Sec Dep Node: %s" % (sec_node,), 2)
                    try:
                        is_agreement = check_node_agreement(first_node, sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement,), 1)
                    except Exception as e:
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (e.message,), 2)
                        # No agreement in pair. Not sure how to handle.
                        # More exhaustive search?
            if use_cache:
                cache_utils.cache_set('sub_verb_agreement', sentence,
                                      (line_agreements, line_non_agreements,
                                       line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure
    return num_agrees, num_not_agrees, num_unsure

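# Usage sketch for the subject-verb agreement checker above (illustrative; the
# text is made up). The return value is a 3-tuple of counts summed over every
# sentence the tokenizer finds in the text:
#
#     agrees, disagrees, unsure = parse("The dogs runs fast.\nThe dog runs fast.")
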
def issues_in_sentence(sentence, use_cache=True):
    """'Brute force' check for a bunch of possible word ordering issues.
    Specifically, looking for the following:
        - VP coming before NP in standard sentence
        - NP coming before VP in inverted sentence
        - JJ coming after Noun in NP
        - VB before PP in VP
        - VB before NP in VP
        - VP before S in standard sentence (with embedded sentences)
        - NN before CD in NP
        - NNP before CD in NP
    """
    if use_cache:
        result = cache_get('word_order_issues', sentence)
        if result is not None:
            return result
    tree = parsers.parse(sentence)[0]
    tree_utils.simplify_tree(tree,
                             trim_adjecent_prop_nouns=True,
                             normalize_sent_roots=True,
                             normalize_plural=True,
                             normalize_case=True)
    log("Looking for order issues in: %s" % (sentence,), 1)
    if cmd_log_level() >= 4:
        print "Simplified Parse Tree"
        print tree
    problems = []
    problems += ["VP->NP in S"] * num_forbidden_orders(tree, ("S",), ('VP', 'NP'))
    problems += ["NP->VP in SINV"] * num_forbidden_orders(tree, ('SINV',), ('NP', 'VP'))
    problems += ["NN->JJ in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'JJ'))
    problems += ["PP->VB in VP"] * num_forbidden_orders(tree, ('VP',), ('PP', 'VB'))
    problems += ["NP->VP in VP"] * num_forbidden_orders(tree, ('VP',), ('NP', 'VP'))
    problems += ["S->VP in S"] * num_forbidden_orders(tree, ('S',), ('S', 'VP'))
    problems += ["S->VB in VP"] * num_forbidden_orders(tree, ('VP',), ('S', 'VB'))
    # problems += ["VB->VP in VP"] * num_forbidden_orders(tree, ('VP',), ('VB', 'VP'))
    problems += ["NP->RBR in ADVP"] * num_forbidden_orders(tree, ('ADVP',), ('NP', 'RBR'))
    problems += ["NN->DT in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'DT'))
    problems += ["NNP->DT in NP"] * num_forbidden_orders(tree, ('NP',), ('NNP', 'DT'))
    problems += ["NN->CD in NP"] * num_forbidden_orders(tree, ('NP',), ('NN', 'CD'))
    problems += ["NNP->CD in NP"] * num_forbidden_orders(tree, ('NP',), ('NNP', 'CD'))
    problems += ['PP->NP in S'] * num_forbidden_orders(tree, ('S',), ('PP', 'NP'))
    # Toggle?
    problems += ['NP->VP in NP'] * num_forbidden_orders(tree, ('NP',), ('NP', 'VP'))
    # Seems like it should be VB->ADVP->PP
    problems += ['VB->PP->ADVP in VP'] * num_forbidden_orders(tree, ('VP',), ('VB', 'PP', 'ADVP'))
    problems += ['VB->PP->SBAR in VP'] * num_forbidden_orders(tree, ('VP',), ('VB', 'PP', 'SBAR'))
    problems += ['NP->S in NP'] * num_forbidden_orders(tree, ('NP',), ('NP', 'S'))
    # Seems like the ADJP should be in an NP or somewhere else, not a sibling
    # of a noun phrase
    problems += ['NP->ADJP in S'] * num_forbidden_orders(tree, ('S',), ('NP', 'ADJP'))
    # Last, if there is an S w/ only one child, we call it a word order problem...
    problems += ['Single Child S'] * len(list(tree.subtrees(
        lambda x: x in tree_utils.semi_tree_roots and len(x) == 1)))
    if tree[0].node not in tree_utils.semi_tree_roots and not hasattr(tree[0], '_has_error'):
        tree[0]._has_error = True
        problems += ['No S Root']
    log("Found %d order issues" % (len(problems),), 1)
    log("Issues: %s" % (problems,), 2)
    if use_cache:
        cache_set('word_order_issues', sentence, problems)
    return problems

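# Usage sketch (illustrative; the sentence is made up). The return value is a
# list with one label per detected ordering problem, so repeated problems show
# up multiple times:
#
#     issues = issues_in_sentence("Ran the dog home.")
#     print len(issues), issues
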
def parse_sentences(line, use_cache=True, include_prob=False):
    log("Working on: %s" % (line,), 2)
    if use_cache:
        correct_parse = cache_get("sentence_tokenizer", line)
        if correct_parse:
            log("Cache Hit: %s" % (correct_parse[0],), 4)
            log("-------------\n", 4)
            return correct_parse if include_prob else correct_parse[0]
    all_possible_sentences = _possible_sentences_in_line(line)
    all_possible_sentence_probs = []
    invalid_possible_sentences = []
    stored_probs = {}
    for possible_sentences in all_possible_sentences:
        log("Examining: %s" % (possible_sentences,), 1)
        prob_for_sentences = []
        sent_is_impossible = False
        for possible_sentence in possible_sentences:
            if use_cache:
                possible_sentence_prob = cache_get('possible_sentences', possible_sentence)
                if possible_sentence_prob is not None:
                    log("Cache Hit: %s (from %s)" % (possible_sentence, 'possible sentences'), 4)
                    prob_for_sentences.append(possible_sentence_prob)
                    continue
            if contains_any_invalid_setences(possible_sentences, invalid_possible_sentences) or sent_is_impossible:
                prob_for_sentences.append(0)
                continue
            elif possible_sentence in stored_probs:
                prob_for_sentences.append(stored_probs[possible_sentence])
                continue
            sentence_trees = parsers.parse(possible_sentence)
            if len(sentence_trees) == 0:
                log("Wasn't able to parse input %s" % (possible_sentence,), 0)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                continue
            else:
                sentence_tree = sentence_trees[0]
            if cmd_log_level() >= 4:
                print "--------"
                print "Pre Simplified Tree"
                print sentence_tree
            tree_utils.simplify_tree(sentence_tree,
                                     remove_starting_cc=possible_sentences.index(possible_sentence) == 0)
            if cmd_log_level() >= 4:
                print "--------"
                print "Post Simplified Tree"
                print sentence_tree
            sentence_transitions = tree_utils.transitions_in_tree(sentence_tree)
            if not is_possible_sentence(sentence_tree):
                log("%s" % (sentence_transitions,), 2)
                log("Invalid parse", 2)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, 0)
            else:
                log("%s" % (sentence_transitions,), 2)
                sentence_probs = []
                for transition in sentence_transitions:
                    try:
                        probs = hmm_utils.prob_of_all_transitions(transition, counts, gram_size=3)
                    except KeyError as e:
                        log("'Impossible' Tag order", 2, sep=' ** ')
                        log("%s" % (e,), 2, sep=' ** ')
                        probs = [0]
                    sentence_probs += probs
                    log("Transitions: %s" % (transition,), 3)
                    log("Probabilities: %s" % (probs,), 3)
                attempt_sentence_prob = prod(sentence_probs)
                sentence_prob_boost = boost_for_sentence_tree(sentence_tree)
                attempt_sentence_prob *= sentence_prob_boost
                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, attempt_sentence_prob)
        weighted_score = prod(prob_for_sentences) * (weight ** (len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences,), 2)
            log(weighted_score, 2)
        all_possible_sentence_probs.append(weighted_score)
    max_prob = max(all_possible_sentence_probs)
    parse_for_max_prob = all_possible_sentences[all_possible_sentence_probs.index(max_prob)]
    log("All Probs: %s" % (all_possible_sentence_probs,), 2)
    log("MAX Prob: %f" % (max_prob,), 2)
    log("Parse for max prob: %s" % (parse_for_max_prob,), 2)
    log("Best Guess Num Sentences: %d" % (len(parse_for_max_prob),), 1)
    log("-------------\n\n", 1)
    if use_cache:
        cache_set("sentence_tokenizer", line, (parse_for_max_prob, max_prob))
    return (parse_for_max_prob, max_prob) if include_prob else parse_for_max_prob


if __name__ == '__main__':
    ## Simple method for testing from STDIN
    if use_stdin:
        print parse_sentences(cmd_utils.get_stdin())
    else:
        essays = essay_utils.essays if essay_index == -1 else [essay_utils.essays[essay_index]]
        essays_in_corpus = []
        for essay in essays:
            sentences_for_essay = []