Beispiel #1
0
def parse(sentence, use_cache=True, parser='stanford'):
    """Parse `sentence` with the chosen backend and return its parse trees.

    Args:
        sentence: Text handed to the external parser.
        use_cache: When true, previously stored tree lines are reused and
            freshly computed ones are stored. The flag is also forwarded to
            the backend call.
        parser: Backend name, either "stanford" or "malt".

    Returns:
        A list with one `ParentedTree` per bracketed line of parser output.
        An unrecognised `parser` value yields an empty list.
    """
    cache_key = "parse_trees_{0}".format(parser)

    tree_lines = None
    if use_cache:
        # Any falsy cache answer (including an empty list) counts as a miss.
        tree_lines = cache_get(cache_key, sentence) or None

    if tree_lines is None:
        if parser == "stanford":
            backend = parse_stanford
        elif parser == "malt":
            backend = parse_malt
        else:
            return []
        raw_output = backend(sentence, use_cache=use_cache)

        # Throw away everything in the parser's response that is not a
        # bracketed tree line.  This can hide parser errors, but it keeps
        # the tree construction below from choking on log output.
        tree_lines = []
        for candidate in raw_output.split("\n"):
            if len(candidate) <= 2:
                continue
            if candidate.startswith("(") and candidate.endswith(")"):
                tree_lines.append(candidate)

        if use_cache:
            cache_set(cache_key, sentence, tree_lines)

    return [ParentedTree.parse(tree_line) for tree_line in tree_lines]
Beispiel #2
0
def parse_coref(sentence, use_cache=True):
    """Run the Stanford CoreNLP pipeline on `sentence` and collect coreferences.

    Args:
        sentence: Text piped to the CoreNLP process.
        use_cache: When true, a stored result for this sentence is returned
            if present, and a fresh result is stored before returning.

    Returns:
        A list of dicts, one per matching line that follows a
        "Coreference set:" header in the process output.  Each dict carries
        the regex group names `pronoun_sentence`, `pronoun_loc`,
        `ref_sentence`, `ref_loc`, `pronoun` and `ref`.
    """
    cache_key = "coreferences"
    if use_cache:
        cached = cache_get(cache_key, sentence)
        if cached is not None:
            return cached

    import re
    # The compiled pattern is memoised on the function object itself.
    if getattr(parse_coref, '_regex', None) is None:
        parse_coref._regex = re.compile(r'\((?P<pronoun_sentence>\d+),(?P<pronoun_loc>\d+),.*?-> \((?P<ref_sentence>\d+),(?P<ref_loc>\d+).*?"(?P<pronoun>.*?)" -> "(?P<ref>.*?)"')
    link_regex = parse_coref._regex

    corenlp_dir = os.path.realpath(os.path.join("contrib", "stanford-corenlp"))
    java_cmd = ('java', '-Xmx3g', '-cp', corenlp_dir + "/*:")
    corenlp_args = (
        'edu.stanford.nlp.pipeline.StanfordCoreNLP',
        '-annotators', 'tokenize,ssplit,pos,lemma,ner,parse,dcoref',
        'dcoref.logFile', '/dev/null', '-',
    )
    response = _exec_cmd(java_cmd + corenlp_args, sentence)

    matches = []
    in_coref_section = False
    for raw_line in response.split("\n"):
        # The timing footer marks the end of the useful output.
        if raw_line == "NLP> Annotation pipeline timing information:":
            break
        if raw_line == "Coreference set:":
            in_coref_section = True
            continue
        if not in_coref_section:
            continue
        found = link_regex.search(raw_line.strip())
        if found:
            matches.append(found.groupdict())

    if use_cache:
        cache_set(cache_key, sentence, matches)
    return matches
def stemmed_words(words, key=False, use_cache=False):
    """Return the set of stems for the word bank built from `words`.

    Args:
        words: Start words forwarded to `build_words`.
        key: Identifier used both for the 'stemmed_wordbank' cache entry
            and as the key passed on to `build_words`.
        use_cache: When true, read from and write to the
            'stemmed_wordbank' cache.

    Returns:
        A set containing `stemmer.stem(word)` for every word that
        `build_words(words, key)` produces.
    """
    cache_id = 'words_{0}'.format(key) if use_cache else None

    if use_cache:
        cached = cache_get('stemmed_wordbank', cache_id)
        if cached is not None:
            return cached

    stems = set()
    for a_word in build_words(words, key):
        stems.add(stemmer.stem(a_word))

    if use_cache:
        cache_set('stemmed_wordbank', cache_id, stems)
    return stems
Beispiel #4
0
def stemmed_words(words, key=False, use_cache=False):
    """Stem every word of the word bank expanded from `words`.

    Args:
        words: Start words handed to `build_words`.
        key: Identifier for the 'stemmed_wordbank' cache entry; it is also
            passed through to `build_words`.
        use_cache: Enables reading and writing the 'stemmed_wordbank' cache.

    Returns:
        The set of stems produced by the module-level `stemmer`.
    """
    if not use_cache:
        return set(stemmer.stem(entry) for entry in build_words(words, key))

    cache_id = 'words_{0}'.format(key)
    previously_stemmed = cache_get('stemmed_wordbank', cache_id)
    if previously_stemmed is not None:
        return previously_stemmed

    stems = set(stemmer.stem(entry) for entry in build_words(words, key))
    cache_set('stemmed_wordbank', cache_id, stems)
    return stems
def get_treebank_rules(cutoff=0, include_counts=False):
    """Return the lexical rules seen more than `cutoff` times in the treebank.

    Rule counts are read from the 'treebank_rules' cache.  When the cache
    yields nothing (or an empty value) the counts are rebuilt by summing
    `lexical_rules(tree)` over every parsed sentence of NLTK's treebank
    corpus, and the result is written back to the cache.

    Args:
        cutoff: Only rules whose count is strictly greater than this value
            are returned.
        include_counts: When true return a dict mapping rule to count,
            otherwise return just the set of rules.
    """
    rule_counts = cache_utils.cache_get('treebank_rules', 'rules')

    if not rule_counts:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank

        rule_counts = {}
        for sent_tree in treebank.parsed_sents():
            for rule, seen in lexical_rules(sent_tree).items():
                rule_counts.setdefault(rule, 0)
                rule_counts[rule] += seen

        cache_utils.cache_set('treebank_rules', 'rules', rule_counts)

    frequent = [(rule, seen) for rule, seen in rule_counts.items()
                if seen > cutoff]
    if include_counts:
        return dict(frequent)
    return set(rule for rule, _ in frequent)
def get_treebank_rules(cutoff=0, include_counts=False):
    """Fetch treebank lexical rules that occur more than `cutoff` times.

    The rule-to-count mapping comes from the 'treebank_rules' cache; if that
    lookup is empty the mapping is regenerated from NLTK's treebank corpus
    via `lexical_rules` and stored in the cache again.

    Args:
        cutoff: Exclusive lower bound on a rule's count.
        include_counts: Selects the return shape -- a dict of rule to count
            when true, a set of rules when false.
    """
    rule_counts = cache_utils.cache_get('treebank_rules', 'rules')

    if not rule_counts:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank

        rule_counts = dict()
        for sent_tree in treebank.parsed_sents():
            per_tree = lexical_rules(sent_tree)
            for rule in per_tree:
                rule_counts[rule] = rule_counts.get(rule, 0) + per_tree[rule]

        cache_utils.cache_set('treebank_rules', 'rules', rule_counts)

    if not include_counts:
        return set(rule for rule in rule_counts if rule_counts[rule] > cutoff)

    kept = {}
    for rule, seen in rule_counts.items():
        if seen > cutoff:
            kept[rule] = seen
    return kept
def build_words(start_words, key=None, use_cache=True, max_depth=3):
    """Expand `start_words` into a set of related WordNet words.

    Every start word is kept as-is.  For each noun synset of a start word,
    every relation named in the module-level `methods` is called on the
    synset; the first dotted component of each related synset's name (with
    underscores replaced by hyphens) is added.  While `max_depth` is
    positive, each related synset's name is expanded recursively with one
    level less of depth.

    Args:
        start_words: Iterable of words to expand.
        key: Identifier of this word list in the 'wordbank' cache.  Calls
            without a key (None or False) are never cached, because
            unrelated word lists would otherwise share one cache entry.
        use_cache: When true and a key is given, reuse / store the result
            in the 'wordbank' cache.
        max_depth: Remaining levels of recursive expansion.

    Returns:
        A set of words.
    """
    cache_id = None
    if use_cache and key is not None and key is not False:
        cache_id = 'words_{0}'.format(key)
        wordbank = cache_get('wordbank', cache_id)
        if wordbank is not None:
            return wordbank

    words = set()
    for start_word in start_words:
        words.add(start_word)
        for wb_word in wordnet.synsets(start_word, wordnet.NOUN):
            for method in methods:
                for a_word in getattr(wb_word, method)():
                    words.add(a_word.name.split(".")[0].replace("_", "-"))
                    if max_depth > 0:
                        # Recursive calls are never cached: they run
                        # without a key of their own.
                        words.update(build_words([a_word.name], key=None,
                                                 use_cache=False,
                                                 max_depth=max_depth - 1))

    if cache_id is not None:
        # Store the word set under the same id that the lookup above reads.
        # The previous code stored the id string itself under 'words', so
        # the cache could never produce a hit.
        cache_set('wordbank', cache_id, words)
    return words
Beispiel #8
0
def build_words(start_words, key=None, use_cache=True, max_depth=3):
    """Grow `start_words` into a set of related words using WordNet.

    Each start word is included unchanged.  For every noun synset of a
    start word, each relation listed in the module-level `methods` is
    invoked on the synset, and the leading dotted component of every
    related synset's name is added with underscores turned into hyphens.
    As long as `max_depth` is positive the related synset names are
    themselves expanded, one depth level lower.

    Args:
        start_words: Iterable of words to expand.
        key: Name of this word list inside the 'wordbank' cache.  A missing
            key (None or False) disables caching for the call, since all
            unkeyed word lists would otherwise collide on one entry.
        use_cache: Enables the 'wordbank' cache for keyed calls.
        max_depth: How many further levels of expansion are allowed.

    Returns:
        A set of words.
    """
    has_key = key is not None and key is not False
    cache_id = 'words_{0}'.format(key) if (use_cache and has_key) else None

    if cache_id is not None:
        wordbank = cache_get('wordbank', cache_id)
        if wordbank is not None:
            return wordbank

    words = set()
    for start_word in start_words:
        words.add(start_word)
        for wb_word in wordnet.synsets(start_word, wordnet.NOUN):
            for method in methods:
                for a_word in getattr(wb_word, method)():
                    words.add(a_word.name.split(".")[0].replace("_", "-"))
                    if max_depth > 0:
                        # Nested expansions carry no key and bypass the
                        # cache entirely.
                        words.update(
                            build_words([a_word.name],
                                        key=None,
                                        use_cache=False,
                                        max_depth=(max_depth - 1)))

    if cache_id is not None:
        # Write the result under the id that cache_get reads.  Previously
        # the id string was stored under the fixed name 'words', which left
        # the real entry permanently empty.
        cache_set('wordbank', cache_id, words)
    return words
Beispiel #9
0
def parse(text, use_cache=True):
    """Count subject-verb agreement outcomes over every sentence in `text`.

    `text` is split on newlines and each line is split into sentences by
    `sentence_tokenizer.parse`.  For every 'nsubj' dependency of a sentence
    whose two words can both be located in the sentence's first parse tree,
    `check_node_agreement` decides whether the pair agrees.

    Args:
        text: Input text, possibly spanning several lines.
        use_cache: When true, per-sentence results are read from and
            written to the 'sub_verb_agreement' cache.  When false the
            cache is neither read nor written.

    Returns:
        A tuple `(num_agrees, num_not_agrees, num_unsure)`; a pair counts
        as "unsure" when checking it raised an exception.
    """
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0

    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:

            line_agreements, line_non_agreements, line_unsure = 0, 0, 0

            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            # endswith() also copes with an empty sentence, where indexing
            # the last character would raise IndexError.
            if not sentence.endswith("."):
                sentence += "."

            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement',
                                                 sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue

            log("Looking for Sub-Verb agreement in '%s'" % (sentence, ), 1)

            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [
                dep for dep in dependencies if dep['dep_name'] == 'nsubj'
            ]

            if not sub_verb_deps:
                log("Couldn't find Subject-Verb dependency info", 1)
                # Respect use_cache here too; this write used to happen
                # unconditionally.
                if use_cache:
                    cache_utils.cache_set('sub_verb_agreement', sentence,
                                          (0, 0, 0))
                continue

            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:

                    log("First Dep Node: %s" % (first_node, ), 2)
                    log("Sec Dep Node: %s" % (sec_node, ), 2)

                    try:
                        is_agreement = check_node_agreement(
                            first_node, sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement, ), 1)
                    except Exception as e:
                        # Deliberately broad: any failure while comparing
                        # the pair is recorded as "unsure".  Format the
                        # exception itself rather than the deprecated
                        # `.message` attribute, which not every exception
                        # provides.
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (e, ), 2)

            if use_cache:
                cache_utils.cache_set(
                    'sub_verb_agreement', sentence,
                    (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure

    return num_agrees, num_not_agrees, num_unsure
Beispiel #10
0
def issues_in_sentence(sentence, use_cache=True):
    """'Brute force' check for a bunch of possible word ordering issues.

    The first parse tree of `sentence` is simplified and then scanned for
    sibling orderings that are treated as errors.  Each label below is
    reported once per occurrence found by `num_forbidden_orders`:
        - VP followed by NP in S; NP followed by VP in SINV
        - NN followed by JJ in NP
        - PP followed by VB, NP followed by VP, S followed by VB in VP
        - S followed by VP in S; PP followed by NP in S
        - NP followed by RBR in ADVP
        - NN / NNP followed by DT or CD in NP
        - NP followed by VP or by S in NP
        - VB, PP, then ADVP or SBAR in VP
        - NP followed by ADJP in S
    In addition, 'Single Child S' and 'No S Root' may be reported.

    Args:
        sentence: The sentence to inspect.
        use_cache: When true, results are read from and written to the
            'word_order_issues' cache.

    Returns:
        A list of problem labels (strings), possibly with repeats.
    """
    if use_cache:
        result = cache_get('word_order_issues', sentence)
        if result is not None:
            return result

    tree = parsers.parse(sentence)[0]
    tree_utils.simplify_tree(tree, trim_adjecent_prop_nouns=True,
                             normalize_sent_roots=True,
                             normalize_plural=True,
                             normalize_case=True)

    log("Looking for order issues in: %s" % (sentence,), 1)
    if cmd_log_level() >= 4:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("Simplified Parse Tree")
        print(tree)

    # (label, parent tags, forbidden child order), checked in this order.
    forbidden_orders = (
        ("VP->NP in S", ("S",), ('VP', 'NP')),
        ("NP->VP in SINV", ('SINV',), ('NP', 'VP')),
        # The adjective tag is 'JJ'; the rule previously looked for 'JP'.
        ("NN->JJ in NP", ('NP',), ('NN', 'JJ')),

        ("PP->VB in VP", ('VP',), ('PP', 'VB')),
        ("NP->VP in VP", ('VP',), ('NP', 'VP')),

        ("S->VP in S", ('S',), ('S', 'VP')),

        ("S->VB in VP", ('VP',), ('S', 'VB')),
        # ("VB->VP in VP", ('VP',), ('VB', 'VP')),  # disabled in the original

        ("NP->RBR in ADVP", ('ADVP',), ('NP', 'RBR')),
        ("NN->DT in NP", ('NP',), ('NN', 'DT')),
        ("NNP->DT in NP", ('NP',), ('NNP', 'DT')),
        ("NN->CD in NP", ('NP',), ('NN', 'CD')),
        ("NNP->CD in NP", ('NP',), ('NNP', 'CD')),

        ('PP->NP in S', ('S',), ('PP', 'NP')),

        # Toggle?
        ('NP->VP in NP', ('NP',), ('NP', 'VP')),

        # Seems like it should be VB->ADVP->PP
        ('VB->PP->ADVP in VP', ('VP',), ('VB', 'PP', 'ADVP')),
        ('VB->PP->SBAR in VP', ('VP',), ('VB', 'PP', 'SBAR')),

        ('NP->S in NP', ('NP',), ('NP', 'S')),

        # Seems like the ADJP should be in a NP or somewhere else, not a
        # sibling of a noun phrase
        ('NP->ADJP in S', ('S',), ('NP', 'ADJP')),
    )

    problems = []
    for label, parent_tags, child_order in forbidden_orders:
        problems += [label] * num_forbidden_orders(tree, parent_tags,
                                                   child_order)

    # Last, if there is an S w/ only one child, we call it a word order
    # problem...
    # NOTE(review): this filter tests the subtree object itself against
    # `semi_tree_roots`, while the root check below tests `.node`; the
    # filter may therefore never match -- confirm before changing it.
    problems += ['Single Child S'] * len(list(tree.subtrees(lambda x: x in tree_utils.semi_tree_roots and len(x) == 1)))

    # `_has_error` is set on the root child so this is reported only once
    # per tree object.
    if tree[0].node not in tree_utils.semi_tree_roots and not hasattr(tree[0], '_has_error'):
        tree[0]._has_error = True
        problems += ['No S Root']

    log("Found %d order issues" % (len(problems),), 1)
    # Interpolate the message; a stray comma used to pass the tuple as the
    # log level instead.
    log("Issues: %s" % (problems,), 2)

    if use_cache:
        cache_set('word_order_issues', sentence, problems)

    return problems
def parse_sentences(line, use_cache=True, include_prob=False):
    """Score every candidate way of splitting `line` into sentences.

    A stored result in the "sentence_tokenizer" cache is returned directly:
    the whole cached value when `include_prob` is true, otherwise only its
    first element.  On a miss, each candidate split produced by
    `_possible_sentences_in_line` gets one probability per sentence, and
    those are combined into a weighted score per split.

    NOTE(review): the visible body ends after the scoring loop -- there is
    no return statement and no write to the "sentence_tokenizer" cache.  The
    rest of the function is presumably outside this excerpt; confirm.
    `counts` and `weight` are not defined in this function and are assumed
    to be module-level names.
    """

    log("Working on: %s" % (line,), 2)

    if use_cache:
        correct_parse = cache_get("sentence_tokenizer", line)
        if correct_parse:
            log("Cache Hit: %s" % (correct_parse[0],), 4)
            log("-------------\n", 4)
            return correct_parse if include_prob else correct_parse[0]

    all_possible_sentences = _possible_sentences_in_line(line)
    # One weighted score per candidate split, in the same order.
    all_possible_sentence_probs = []
    # Sentences that failed to parse or were rejected as impossible.
    invalid_possible_sentences = []
    # Probabilities already computed during this call, keyed by sentence.
    stored_probs = {}

    for possible_sentences in all_possible_sentences:

        log("Examining: %s" % (possible_sentences,), 1)
        prob_for_sentences = []
        # Once one sentence of this split is rejected, the remaining
        # uncached sentences of the split are scored 0 without parsing.
        sent_is_impossible = False

        for possible_sentence in possible_sentences:

            # A cached probability wins over every other check below.
            if use_cache:
                possible_sentence_prob = cache_get('possible_sentences', possible_sentence)
                if possible_sentence_prob is not None:
                    log("Cache Hit: %s (from %s)" % (possible_sentence, 'possible sentences'), 4)
                    prob_for_sentences.append(possible_sentence_prob)
                    continue

            if contains_any_invalid_setences(possible_sentences, invalid_possible_sentences) or sent_is_impossible:
                prob_for_sentences.append(0)
                continue
            elif possible_sentence in stored_probs:
                prob_for_sentences.append(stored_probs[possible_sentence])
                continue

            sentence_trees = parsers.parse(possible_sentence)
            if len(sentence_trees) == 0:
                log("Wasn't able to parse input %s" % (possible_sentence,), 0)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                continue
            else:
                # Only the first tree returned by the parser is used.
                sentence_tree = sentence_trees[0]

            if cmd_log_level() >= 4:
                print "--------"
                print "Pre Simplified Tree"
                print sentence_tree

            # remove_starting_cc is true only when this sentence's first
            # occurrence is at position 0 of the split.
            tree_utils.simplify_tree(sentence_tree,
                                     remove_starting_cc=possible_sentences.index(possible_sentence) == 0)

            if cmd_log_level() >= 4:
                print "--------"
                print "Post Simplified Tree"
                print sentence_tree

            sentence_transitions = tree_utils.transitions_in_tree(sentence_tree)

            if not is_possible_sentence(sentence_tree):
                log("%s" % (sentence_transitions,), 2)
                log("Invalid parse", 2)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, 0)
            else:
                log("%s" % (sentence_transitions,), 2)
                sentence_probs = []
                for transition in sentence_transitions:
                    try:
                        probs = hmm_utils.prob_of_all_transitions(transition, counts, gram_size=3)
                    except KeyError, e:
                        # A KeyError from the lookup is treated as a
                        # zero-probability transition.
                        log("'Imposible' Tag order", 2, sep=' ** ')
                        log("%s" % (e,), 2, sep=' ** ')
                        probs = [0]
                    sentence_probs += probs
                    log("Transitions: %s" % (transition,), 3)
                    log("Probabilities: %s" % (probs,), 3)

                # Sentence probability = product of all transition
                # probabilities, scaled by a tree-dependent boost.
                attempt_sentence_prob = prod(sentence_probs)

                sentence_prob_boost = boost_for_sentence_tree(sentence_tree)
                attempt_sentence_prob *= sentence_prob_boost

                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, attempt_sentence_prob)
        # Score of the split: product of its sentence probabilities times
        # `weight` raised to (number of sentences - 1).
        weighted_score = prod(prob_for_sentences) * (weight ** (len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences,), 2)
            log(weighted_score, 2)

        all_possible_sentence_probs.append(weighted_score)
Beispiel #12
0
def parse(text, use_cache=True):
    """Tally subject-verb agreement results for all sentences of `text`.

    The text is processed line by line; `sentence_tokenizer.parse` yields
    the sentences of each line.  Within a sentence, every 'nsubj'
    dependency whose two words are both found in the first parse tree is
    checked with `check_node_agreement`.

    Args:
        text: Input text; newlines separate the lines that are tokenized.
        use_cache: When true, per-sentence tallies are looked up in and
            saved to the 'sub_verb_agreement' cache.  When false that cache
            is left untouched.

    Returns:
        `(num_agrees, num_not_agrees, num_unsure)`, where "unsure" counts
        the pairs whose check raised an exception.
    """
    num_agrees = 0
    num_not_agrees = 0
    num_unsure = 0

    for line in text.split("\n"):
        sentences = sentence_tokenizer.parse(line, use_cache=use_cache)
        for sentence in sentences:

            line_agreements, line_non_agreements, line_unsure = 0, 0, 0

            # Possession seems to be tricky for the parser, so we fudge
            # a little here
            sentence = sentence.replace("'s", '')
            # Unlike sentence[-1], endswith() is safe on an empty string.
            if not sentence.endswith("."):
                sentence += "."

            if use_cache:
                cache_rs = cache_utils.cache_get('sub_verb_agreement', sentence)
                if cache_rs:
                    line_agreements, line_non_agreements, line_unsure = cache_rs
                    num_agrees += line_agreements
                    num_not_agrees += line_non_agreements
                    num_unsure += line_unsure
                    continue

            log("Looking for Sub-Verb agreement in '%s'" % (sentence,), 1)

            tree = parsers.parse(sentence)[0]
            dependencies = parsers.dependences(sentence)
            sub_verb_deps = [dep for dep in dependencies if dep['dep_name'] == 'nsubj']

            if not sub_verb_deps:
                log("Couldn't find Subject-Verb dependency info", 1)
                # This write used to ignore use_cache; honour the flag.
                if use_cache:
                    cache_utils.cache_set('sub_verb_agreement', sentence, (0, 0, 0))
                continue

            for sub_verb in sub_verb_deps:
                first_node = node_in_tree(tree, sub_verb['first_word'])
                sec_node = node_in_tree(tree, sub_verb['second_word'])
                if first_node and sec_node:

                    log("First Dep Node: %s" % (first_node,), 2)
                    log("Sec Dep Node: %s" % (sec_node,), 2)

                    try:
                        is_agreement = check_node_agreement(first_node, sec_node)
                        if is_agreement:
                            line_agreements += 1
                        else:
                            line_non_agreements += 1
                        log("Agreement in sentence? %s" % (is_agreement,), 1)
                    except Exception as e:
                        # Broad on purpose: any failure during the check is
                        # counted as "unsure".  The exception is formatted
                        # directly because `.message` is deprecated and not
                        # present on every exception.
                        line_unsure += 1
                        log("Error looking for agreement? %s" % (e,), 2)

            if use_cache:
                cache_utils.cache_set('sub_verb_agreement', sentence, (line_agreements, line_non_agreements, line_unsure))
            num_agrees += line_agreements
            num_not_agrees += line_non_agreements
            num_unsure += line_unsure

    return num_agrees, num_not_agrees, num_unsure
Beispiel #13
0
def parse_sentences(line, use_cache=True, include_prob=False):

    log("Working on: %s" % (line, ), 2)

    if use_cache:
        correct_parse = cache_get("sentence_tokenizer", line)
        if correct_parse:
            log("Cache Hit: %s" % (correct_parse[0], ), 4)
            log("-------------\n", 4)
            return correct_parse if include_prob else correct_parse[0]

    all_possible_sentences = _possible_sentences_in_line(line)
    all_possible_sentence_probs = []
    invalid_possible_sentences = []
    stored_probs = {}

    for possible_sentences in all_possible_sentences:

        log("Examining: %s" % (possible_sentences, ), 1)
        prob_for_sentences = []
        sent_is_impossible = False

        for possible_sentence in possible_sentences:

            if use_cache:
                possible_sentence_prob = cache_get('possible_sentences',
                                                   possible_sentence)
                if possible_sentence_prob is not None:
                    log(
                        "Cache Hit: %s (from %s)" %
                        (possible_sentence, 'possible sentences'), 4)
                    prob_for_sentences.append(possible_sentence_prob)
                    continue

            if contains_any_invalid_setences(
                    possible_sentences,
                    invalid_possible_sentences) or sent_is_impossible:
                prob_for_sentences.append(0)
                continue
            elif possible_sentence in stored_probs:
                prob_for_sentences.append(stored_probs[possible_sentence])
                continue

            sentence_trees = parsers.parse(possible_sentence)
            if len(sentence_trees) == 0:
                log("Wasn't able to parse input %s" % (possible_sentence, ), 0)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                continue
            else:
                sentence_tree = sentence_trees[0]

            if cmd_log_level() >= 4:
                print "--------"
                print "Pre Simplified Tree"
                print sentence_tree

            tree_utils.simplify_tree(
                sentence_tree,
                remove_starting_cc=possible_sentences.index(
                    possible_sentence) == 0)

            if cmd_log_level() >= 4:
                print "--------"
                print "Post Simplified Tree"
                print sentence_tree

            sentence_transitions = tree_utils.transitions_in_tree(
                sentence_tree)

            if not is_possible_sentence(sentence_tree):
                log("%s" % (sentence_transitions, ), 2)
                log("Invalid parse", 2)
                prob_for_sentences.append(0)
                invalid_possible_sentences.append(possible_sentence)
                sent_is_impossible = True
                if use_cache:
                    cache_set('possible_sentences', possible_sentence, 0)
            else:
                log("%s" % (sentence_transitions, ), 2)
                sentence_probs = []
                for transition in sentence_transitions:
                    try:
                        probs = hmm_utils.prob_of_all_transitions(transition,
                                                                  counts,
                                                                  gram_size=3)
                    except KeyError, e:
                        log("'Imposible' Tag order", 2, sep=' ** ')
                        log("%s" % (e, ), 2, sep=' ** ')
                        probs = [0]
                    sentence_probs += probs
                    log("Transitions: %s" % (transition, ), 3)
                    log("Probabilities: %s" % (probs, ), 3)

                attempt_sentence_prob = prod(sentence_probs)

                sentence_prob_boost = boost_for_sentence_tree(sentence_tree)
                attempt_sentence_prob *= sentence_prob_boost

                prob_for_sentences.append(attempt_sentence_prob)
                stored_probs[possible_sentence] = attempt_sentence_prob
                if use_cache:
                    cache_set('possible_sentences', possible_sentence,
                              attempt_sentence_prob)
        weighted_score = prod(prob_for_sentences) * (weight**(
            len(possible_sentences) - 1))
        if weighted_score > 0:
            log("Valid Parse: %s" % (possible_sentences, ), 2)
            log(weighted_score, 2)

        all_possible_sentence_probs.append(weighted_score)