Example #1
def get_events_and_text(sent):
    """
    sent is a spacy parsed sentence (parsed through the default English spacy pipeline)
    Extract the events and the text of the events from a line of COPA
    """
    text = sent.text
    sorels = ['nsubj', 'dobj', 'iobj']
    outputs = []
    pp = PredPatt.from_sentence(text)
    events = pp.events
    for event in events:
        position = event.position
        args = event.arguments
        event_rels = {}
        for a in args:
            head = a.root
            govrel = head.gov_rel
            event_rels[govrel] = head
        lemma = sent[position].lemma_
        # Label the event with the first core relation found (default: nsubj).
        for rel in sorels:
            if rel in event_rels:
                e1 = lemma + '->' + rel
                break
        else:
            e1 = lemma + '->nsubj'
        e1_text = predpatt2text(event)

        outputs.append({'e1': e1, 'e1_text': e1_text})
    return outputs
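A hypothetical driver for the function above; it assumes the default English spacy pipeline and a predpatt2text helper (not shown here):

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed English pipeline name
doc = nlp('The man knocked on the door.')
for sent in doc.sents:
    print(get_events_and_text(sent))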
Example #2
def extract_triples(input_remaining, params):
    opts = PredPattOpts(
        resolve_relcl=True,  # relative clauses
        resolve_appos=True,  # appositional modifiers
        resolve_amod=True,  # adjectival modifiers
        resolve_conj=True,  # conjunction
        resolve_poss=True,  # possessives
        ud=dep_v1.VERSION,  # the version of UD
    )
    triples = {}
    remaining = {}
    for idx in input_remaining:
        for line in input_remaining[idx]:
            if line.strip():
                try:
                    pp = PredPatt.from_sentence(line,
                                                opts=opts,
                                                cacheable=False)
                    extractions = get_predpatt_triples(pp, line)
                    if extractions:
                        triples.setdefault(idx, []).extend(extractions)
                except KeyError:
                    pass
        if idx not in triples:
            remaining[idx] = input_remaining[idx]
            triples[idx] = []
    return triples, remaining
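A minimal, self-contained sketch of the options used above; the dep_v1 import path is an assumption based on the PredPatt source layout, and get_predpatt_triples is a project-local helper not shown here:

from predpatt import PredPatt, PredPattOpts
from predpatt.util.ud import dep_v1  # assumed location of dep_v1

opts = PredPattOpts(resolve_relcl=True, resolve_conj=True, ud=dep_v1.VERSION)
pp = PredPatt.from_sentence('The senator , a Democrat , spoke and left .',
                            opts=opts, cacheable=False)
for pred in pp.instances:
    print(pred.phrase(), [arg.phrase() for arg in pred.arguments])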
Example #3
def predpatt_visualize(s):
    sid = '{:x}'.format(zlib.adler32(s.encode()))
    pp = PredPatt.from_sentence(s)
    for i, e in enumerate(pp.events):
        tree = pp_dot_tree(e)
        tree.add_node(pydot.Node('label', label=s, shape='plaintext'))
        tree.add_edge(pydot.Edge('label', e.root.__repr__(), style='invis'))
        try:
            tree.write_png('tree_{}_{}.png'.format(sid, i))
        except AssertionError:
            # pydot's own error messages are unhelpful, so just log the sentence
            print('AssertionError for: {}'.format(s))
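pp_dot_tree is a project-local helper (not shown); assuming it returns a pydot graph for an event, the function is driven with a single call:

# Hypothetical usage; writes tree_<adler32>_<event-index>.png files
# into the working directory, one per extracted event.
predpatt_visualize('Chris loves silly dogs and clever cats .')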
Example #4
def get_vector(sentence):
    global DEPENDENCIES, verbs_classes, class_dict
    sent = PredPatt.from_sentence(sentence)
    #print sent.pprint()
    return_vector = numpy.zeros(len(DEPENDENCIES), dtype='float64')
    classes_vector = numpy.zeros(4, dtype='float64')
    google_vector = numpy.zeros(300, dtype='float64')
    for predicate in sent.events:
        #print "Predicate: ", predicate
        #print "Predicate Root Text: ", predicate.root.text
        lemmatised_word = lemmatizer.lemmatize(predicate.root.text.lower())
        for mclass in verbs_classes.keys():
            if lemmatised_word.upper() in verbs_classes[mclass]:
                classes_vector[class_dict[mclass]] += 1
        google_vector += get_word_vector(predicate.root.text)
        for argument in sent.argument_extract(predicate):
            #print "Argument: ", argument
            google_vector += get_word_vector(argument.root.text)
            for rule in argument.rules:
                #print "Rule: ", rule
                try:
                    rule_name = rule.edge
                except AttributeError:  # rule has no edge attribute
                    continue
                #print "Rule Name: ", rule_name
                try:
                    return_vector[DEPENDENCIES[rule_name.rel]] += 1
                except KeyError:  # relation not in DEPENDENCIES
                    pass
    #print "Google Vector: ", len(google_vector)
    #print "Classes Vector: ", len(classes_vector)
    #print "Return Vector: ", len(return_vector)
    ans = numpy.append(google_vector,
                       numpy.append(return_vector, classes_vector))
    if numpy.all(ans == 0): return None
    return ans
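A hypothetical call; DEPENDENCIES, verbs_classes, class_dict, lemmatizer, and get_word_vector must all be defined in the enclosing module. Per the numpy.append calls above, the result concatenates the 300-dim word-vector sum, the dependency counts, and the four verb-class counts:

vec = get_vector('The committee approved the proposal .')
if vec is not None:
    print(vec.shape)  # (300 + len(DEPENDENCIES) + 4,)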
Example #5
def foo(docs_path):
    """ Count normalized subject/predicate/object and claim frequencies
        over PredPatt extractions from the documents in docs_path.
    """

    print('checking file length')
    with open(docs_path) as f:
        num_lines = sum(1 for _ in f)

    print('starting')
    with open(docs_path) as f:
        # arg_num_dict = {}
        pred_num_dict = {}
        subj_num_dict = {}
        obj_num_dict = {}
        claim_num_dict = {}
        pp_total_time = 0
        timeouts = 0
        bad_patterns = 0
        for idx, line in enumerate(f):
            aid, adjacent, in_doc, text = line.split('\u241E')
            t1 = datetime.datetime.now()
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(60)
            try:
                pp = PredPatt.from_sentence(text, cacheable=False)
            except Exception as msg:
                signal.alarm(0)
                timeouts += 1
                continue
            signal.alarm(0)
            t2 = datetime.datetime.now()
            d = t2 - t1
            pp_total_time += d.total_seconds()
            for pred, patt in pp.event_dict.items():
                # TODO: rework with following dependency trees
                #       and evaluating relevance of nodes with
                #       regards to cited doc
                if not patt.has_subj() or not patt.has_obj():
                    bad_patterns += 1
                    continue
                pred_norm = normalize(pred.text)
                if pred_norm not in pred_num_dict:
                    pred_num_dict[pred_norm] = 0
                pred_num_dict[pred_norm] += 1
                subj = normalize(patt.subj().phrase())
                obj = normalize(patt.obj().phrase())
                if subj not in subj_num_dict:
                    subj_num_dict[subj] = 0
                subj_num_dict[subj] += 1
                if obj not in obj_num_dict:
                    obj_num_dict[obj] = 0
                obj_num_dict[obj] += 1
                claim = '{} {} {}'.format(subj, pred_norm, obj)
                if claim not in claim_num_dict:
                    claim_num_dict[claim] = 0
                claim_num_dict[claim] += 1
            #     for arg in patt.arguments:
            #         arg_norm = normalize(arg.phrase())
            #         if arg_norm not in arg_num_dict:
            #             arg_num_dict[arg_norm] = 0
            #         arg_num_dict[arg_norm] += 1
            print('- - - - {}/{} lines - - - -'.format(idx + 1, num_lines))
            pp_avg_time = pp_total_time / (idx + 1)
            print('# timeouts {}'.format(timeouts))
            print('# bad_patterns {}'.format(bad_patterns))
            print('avg time per context: {:.2f}s'.format(pp_avg_time))
            # sorted_arg = sorted(arg_num_dict.items(),
            #                     key=operator.itemgetter(1),
            #                     reverse=True)
            sorted_pred = sorted(pred_num_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
            sorted_subj = sorted(subj_num_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
            sorted_obj = sorted(obj_num_dict.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
            sorted_claim = sorted(claim_num_dict.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
            print('- - top 10 subjects - -')
            for subj, num in sorted_subj[:10]:
                print('{}: {}'.format(num, subj[:30]))
            print('- - top 10 predicates - -')
            for pred, num in sorted_pred[:10]:
                print('{}: {}'.format(num, pred[:30]))
            print('- - top 10 objects - -')
            for obj, num in sorted_obj[:10]:
                print('{}: {}'.format(num, obj[:30]))
            print('- - top 10 claims - -')
            for claim, num in sorted_claim[:10]:
                print('{}: {}'.format(num, claim[:100]))
            # print('- - top 10 args - -')
            # for arg, num in sorted_arg[:10]:
            #     print('{}: {}'.format(num, arg[:30]))
            # if idx%100 == 0:
            #     with open('arg_num_dict.json', 'w') as f:
            #         f.write(json.dumps(arg_num_dict))
            #     with open('pred_num_dict.json', 'w') as f:
            #         f.write(json.dumps(pred_num_dict))
        # sorted_arg = sorted(arg_num_dict.items(),
        #                     key=operator.itemgetter(1),
        #                     reverse=True)
        sorted_pred = sorted(pred_num_dict.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        sorted_subj = sorted(subj_num_dict.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        sorted_obj = sorted(obj_num_dict.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        sorted_claim = sorted(claim_num_dict.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
        print('- - top 100 subjects - -')
        for subj, num in sorted_subj[:100]:
            print('{}: {}'.format(num, subj[:30]))
        print('- - top 100 predicates - -')
        for pred, num in sorted_pred[:100]:
            print('{}: {}'.format(num, pred[:30]))
        print('- - top 100 objects - -')
        for obj, num in sorted_obj[:100]:
            print('{}: {}'.format(num, obj[:30]))
        print('- - top 100 claims - -')
        for claim, num in sorted_claim[:100]:
            print('{}: {}'.format(num, claim[:100]))
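The loop above arms a 60-second SIGALRM around each parse, but signal_handler itself is not part of the snippet; a minimal sketch of the handler this pattern implies:

import signal

def signal_handler(signum, frame):
    # Turn the alarm into an exception so the try/except around
    # PredPatt.from_sentence treats a slow parse as a timeout.
    raise Exception('PredPatt parse timed out')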
Example #6
"""
Example of programmatic PredPatt usage.
"""

# Run PredPatt on sentence
from predpatt import PredPatt
sentence = 'Chris loves silly dogs and clever cats .'
P = PredPatt.from_sentence(sentence)

# Pretty-print output
print(P.pprint(track_rule=True, color=True))

print('______________________________________________________________________________')

# A deeper look into PredPatt's internal representations.
#
# Each extraction is kept in a list called instances. Below we loop through
# each instance and print its arguments.
for x in P.instances:
    print()
    print(x, x.phrase())
    for a in x.arguments:
        print(' ', a, a.phrase())

        # Uncomment to list the rules which fired on this proposition,
        # along with an explanation.
        #for r in a.rules:
        #    print('    %s: %s' % (r, r.explain()))

print('______________________________________________________________________________')
print()
Example #7
from predpatt import PredPatt

pp = PredPatt.from_sentence(
    'At the Pentagon briefing today, General Stanley McChrystal said that it looked a lot like terrorism.'
)
#print(pp.pprint())
# print(" ".join([token.text for token in pp.tokens]))
# print(pp.events)
# print(pp.event_dict)

for event in pp.events:
    print(event)
    for argument in event.arguments:
        print(argument)
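The same loop can also print the phrases; root.text and phrase() follow the usage in the other examples here, so treat this as a sketch rather than the canonical API:

for event in pp.events:
    print(event.root.text, [arg.phrase() for arg in event.arguments])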
def build_sentence_representation(s):
    """ Build representation of a sentence by analyzing predpatt output.

        Returns a weighted list of lists of terms.
    """

    s = merge_citation_token_lists(s)
    s = remove_qutation_marks(s)
    lemmatizer = WordNetLemmatizer()
    raw_lists = []
    rep_lists = []
    rep_lists_alt = []  # to be consistent with double annotating for 3 and 3.1
    try:
        pp = PredPatt.from_sentence(s, cacheable=False)  # for speed tests
    except Exception as e:
        print('= = = PredPatt exception = = =')
        print('input:\n{}'.format(s))
        print('exception:\n{}'.format(e))
        return rep_lists, rep_lists_alt
    if len(pp.events) == 0:
        return rep_lists, rep_lists_alt
    if CIT_BASED:
        for e in pp.events:
            depth, rep = build_tree_representation(e)
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, r) for r in rep]
            if len(rep) > 0:
                raw_lists.append([depth, rep])
        weight = 1
        for rl in sorted(raw_lists, key=itemgetter(0)):
            rep_lists.append([weight, rl[1]])
            weight *= .5
        if len(rep_lists) == 0:
            fallback = build_noun_representation(pp.events[0],
                                                 global_root=True)
            if INCLUDE_PREDICATE:
                pred = get_predicate(pp.events[0].root)
                fallback = ['{}:{}'.format(pred, f) for f in fallback]
            if len(fallback) > 0:
                rep_lists = [[.25, fallback]]
    else:
        # make a PPv3 and a PPv3.1 representation
        # - - - 3.1 - - -
        reps = []
        for e in pp.events:
            rep = build_noun_representation(e)  # 3.1
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, f) for f in rep]
            reps.extend(rep)
        if len(reps) > 0:
            rep_lists = [[1, reps]]
        # - - - 3 - - -
        reps_alt = []
        for e in pp.events:
            rep = build_noun_representation(e, global_root=True)  # 3
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, f) for f in rep]
            reps_alt.extend(rep)
        if len(reps_alt) > 0:
            rep_lists_alt = [[1, reps_alt]]

    rep_lists = normalize_rep_lists(rep_lists, lemmatizer)
    rep_lists_alt = normalize_rep_lists(rep_lists_alt, lemmatizer)
    return rep_lists, rep_lists_alt
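A hypothetical driver; CIT_BASED, INCLUDE_PREDICATE, and the helper functions referenced above must be defined in the enclosing module:

rep, rep_alt = build_sentence_representation(
    'Deep parsing improves citation recommendation .')
print(rep)      # weighted lists of terms, as described in the docstring
print(rep_alt)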