Example #1
def parse(text):
    """
    Primary function to run syntaxnet and PredPatt over input sentences.
    """
    parse_tree, trace = annotate_text(text)
    conll_parsed = parse_to_conll(parse_tree)

    conll_pp = [ud_parse for sent_id, ud_parse in load_conllu(conll_parsed)][0]

    #PredPatt options. Modify as needed.
    resolve_relcl = True  # relative clauses
    resolve_appos = True  # appositional modifiers
    resolve_amod = True   # adjectival modifiers
    resolve_conj = True   # conjunction
    resolve_poss = True   # possessives
    ud = dep_v2.VERSION   # the version of UD
    opts = PredPattOpts(resolve_relcl=resolve_relcl,
                        resolve_appos=resolve_appos,
                        resolve_amod=resolve_amod,
                        resolve_conj=resolve_conj,
                        resolve_poss=resolve_poss,
                        ud=ud)
    ppatt = PredPatt(conll_pp, opts=opts)

    # NOTE:
    # This returns the pretty-printed string from PredPatt. This is done
    # largely as a placeholder for JSON compatibility within the REST API.
    return {'predpatt': ppatt.pprint(), 'conll': conll_parsed, 'original': text}
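
A minimal caller sketch for the function above, assuming the SyntaxNet helpers it uses (annotate_text, parse_to_conll) and the PredPatt imports are available in the module; the dictionary keys mirror the return statement.

# Illustrative input sentence.
result = parse("The quick brown fox jumped over the lazy dog .")
print(result['predpatt'])   # pretty-printed PredPatt extractions
print(result['conll'])      # CoNLL-formatted parse
print(result['original'])   # the input text, echoed back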
Example #2
def test(data):
    from predpatt import PredPatt, load_conllu

    def fail(g, t):
        if len(g) != len(t):
            return True
        return any(i not in t for i in g)

    no_color = lambda x, _: x
    count, failed = 0, 0
    ret = ""
    for sent_id, ud_parse in load_conllu(data):
        count += 1
        pp = PredPatt(ud_parse)
        sent = ' '.join(t.text for t in pp.tokens)
        linearized_pp = linearize(pp)
        gold_preds = [
            predicate.format(C=no_color, track_rule=False)
            for predicate in pp.instances if likely_to_be_pred(predicate)
        ]
        test_preds = pprint_preds(
            construct_pred_from_flat(linearized_pp.split()))
        if fail(gold_preds, test_preds):
            failed += 1
            ret += (
                "Sent: %s\nLinearized PredPatt:\n\t%s\nGold:\n%s\nYours:\n%s\n\n"
                % (sent, linearized_pp, "\n".join(gold_preds),
                   "\n".join(test_preds)))
    print(ret)
    print("You have test %d instances, and %d failed the test." %
          (count, failed))
Example #3
def extract_predpatt_text(row, eid_num: int):
    '''
    Given a row of a pandas dataframe of TB data
    and eid_num (1 or 2),
    output the PredPatt predicate text
    (adds copula fillers in the text).
    '''
    tokenid = getattr(row, f'eid{eid_num}_token_id')
    conllu_string = getattr(row, f'eid{eid_num}_sent_conllu')
    parsed_tb = [PredPatt(ud_parse, opts=options) for sent_id, ud_parse in load_conllu(conllu_string)]
    pred_objects = parsed_tb[0].instances
    
    curr_text = getattr(row, f'eid{eid_num}_text')
    
    pred_match = False
    #print(f"{(row['docid'], row['eventInstanceID'], row['relatedToEventInstance'])}")
    if pred_objects:
        for pred in pred_objects:
            if int(pred.root.position) == int(tokenid):
                pred_match = True
                pred_object = pred
                break
        
        if pred_match:
            pred_text, _, _, _ = predicate_info(pred_object)
            return pred_text
        else:
            return curr_text

    else:
        return getattr(row, f'eid{eid_num}_text')
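
A sketch of how this row-wise helper might be applied over the TB dataframe the docstring mentions; the dataframe name and derived column names are illustrative, and the module-level `options` and `predicate_info` that the function relies on are assumed to be defined elsewhere.

# Hypothetical column-wise application over a TimeBank dataframe `df`
# whose columns follow the f-strings above.
df['eid1_pred_text'] = df.apply(lambda row: extract_predpatt_text(row, 1), axis=1)
df['eid2_pred_text'] = df.apply(lambda row: extract_predpatt_text(row, 2), axis=1)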
Example #4
def get_events_and_text(sent):
    """
    sent is a spacy parsed sentence (parsed through the default English spacy pipeline)
    Extract the events and the text of the events from a line of COPA
    """
    text = sent.text
    sorels = ['nsubj', 'dobj', 'iobj']
    outputs = []
    pp = PredPatt.from_sentence(text)
    events = pp.events
    for event in events:
        position = event.position
        args = event.arguments
        event_rels = {}
        for a in args:
            head = a.root
            govrel = head.gov_rel
            event_rels[govrel] = head
        lemma = sent[position].lemma_
        if 'nsubj' in event_rels:
            e1 = lemma + '->nsubj'
            e1_text = predpatt2text(event)
        elif 'dobj' in event_rels:
            e1 = lemma + '->dobj'
            e1_text = predpatt2text(event)
        elif 'iobj' in event_rels:
            e1 = lemma + '->iobj'
            e1_text = predpatt2text(event)
        else:
            e1 = lemma + '->nsubj'
            e1_text = predpatt2text(event)

        outputs.append({'e1': e1, 'e1_text': e1_text})
    return outputs
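
A usage sketch matching the docstring, assuming a default English spaCy model is installed (the model name below is an assumption) and that the module-level predpatt2text helper and PredPatt's own parser backend are available.

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed default English pipeline
doc = nlp('The man broke the window because he was angry .')  # illustrative line
for sent in doc.sents:
    for event in get_events_and_text(sent):
        print(event['e1'], '|', event['e1_text'])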
Example #5
def extract_triples(input_remaining, params):
    opts = PredPattOpts(
        resolve_relcl=True,  # relative clauses
        resolve_appos=True,  # appositional modifiers
        resolve_amod=True,  # adjectival modifiers
        resolve_conj=True,  # conjunction
        resolve_poss=True,  # possessives
        ud=dep_v1.VERSION,  # the version of UD
    )
    triples = {}
    remaining = {}
    for idx in input_remaining:
        for line in input_remaining[idx]:
            if line.strip():
                try:
                    pp = PredPatt.from_sentence(line,
                                                opts=opts,
                                                cacheable=False)
                    extractions = get_predpatt_triples(pp, line)
                    if extractions:
                        triples.setdefault(idx, []).extend(extractions)
                except KeyError:
                    pass
        if idx not in triples:
            remaining[idx] = input_remaining[idx]
            triples[idx] = []
    return triples, remaining
Example #6
    def from_conll(cls,
                   corpus: Union[str, TextIO],
                   name: str = 'ewt',
                   options: Optional[PredPattOpts] = None) -> 'PredPattCorpus':
        """Load a CoNLL dependency corpus and apply predpatt

        Parameters
        ----------
        corpus
            (path to) a .conllu file
        name
            the name of the corpus; used in constructing treeids
        options
            options for predpatt extraction
        """

        options = DEFAULT_PREDPATT_OPTIONS if options is None else options

        corp_is_str = isinstance(corpus, str)

        if corp_is_str and splitext(basename(corpus))[1] == '.conllu':
            with open(corpus) as infile:
                data = infile.read()

        elif corp_is_str:
            data = corpus

        else:
            data = corpus.read()

        # load the CoNLL dependency parses as graphs
        ud_corp = {name+'-'+str(i+1): [line.split()
                                       for line in block.split('\n')
                                       if len(line) > 0
                                       if line[0] != '#']
                   for i, block in enumerate(data.split('\n\n'))}
        ud_corp = CoNLLDependencyTreeCorpus(ud_corp)

        # extract the predpatt for those dependency parses
        try:
            predpatt = {name+'-'+sid.split('_')[1]: PredPatt(ud_parse,
                                                             opts=options)
                        for sid, ud_parse in load_conllu(data)}

        except ValueError:
            errmsg = 'PredPatt was unable to parse the CoNLL you provided.' +\
                     ' This is likely due to using a version of UD that is' +\
                     ' incompatible with PredPatt. Use of version 1.2 is' +\
                     ' suggested.'

            raise ValueError(errmsg)
            
        return cls({n: (pp, ud_corp[n])
                    for n, pp in predpatt.items()})
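
A usage sketch for the classmethod above; the file path is illustrative, and per the error message the corpus should use a PredPatt-compatible UD version (v1.2 is suggested).

# Build a PredPattCorpus from a UD v1.2 treebank file; treeids take the
# form '<name>-<n>' per the comprehension above. (Path is illustrative.)
corpus = PredPattCorpus.from_conll('en-ud-dev.conllu', name='ewt-dev')

# A raw CoNLL-U string or an open file handle is accepted as well.
with open('en-ud-dev.conllu') as infile:
    corpus_from_handle = PredPattCorpus.from_conll(infile, name='ewt-dev')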
Example #7
def generate_predicates(
    abstract_text: str,
    pred_patt_opts=None
) -> Iterable[Tuple[str, str, str]]:
  "Requires that pred_util:nlp and pred_util:stopwords be initialized"
  nlp = dpg.get("pred_util:nlp")
  parser = Spacy2ConllParser(nlp=nlp)
  stopwords = dpg.get("pred_util:stopwords")

  doc = nlp(abstract_text)
  for sent in doc.sents:
    # if the sentence is very long
    if len(sent) >= 20:
      word_count = defaultdict(int)
      for tok in sent:
        word_count[str(tok)] += 1
      # if one word dominates the long sentence
      if max(word_count.values()) >= len(sent)*0.2:
        continue  # we likely generated the same word over-and-over
    conllu = "".join(list(parser.parse(input_str=str(sent))))
    for _, pred_patt_parse in load_conllu(conllu):
      predicates = PredPatt(
        pred_patt_parse,
        opts=pred_patt_opts
      ).instances
      for predicate in predicates:
        # We only care about 2-entity predicates
        if len(predicate.arguments) == 2:
          a_ents, b_ents = [
              # Get the set of entities
              filter(
                # Not in the stopword list
                lambda x: x not in stopwords,
                [str(e).strip() for e in nlp(args.phrase()).ents]
              )
              # For each argument
              for args in predicate.arguments
          ]
          # Slight cleaning needed to better match the predicate phrase
          # Note, that PredPatt predicates use ?a and ?b placeholders
          predicate_stmt = (
              re.match(
                r".*\?a(.*)\?b.*", # get text between placeholders
                predicate.phrase()
              )
              .group(1) # get the group matched between the placeholders
              .strip()
          )
          if len(predicate_stmt) > 0:
            # We're going to iterate all predicates
            for a, b in product(a_ents, b_ents):
              if a != b:
                yield (a, predicate_stmt, b)
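
A minimal caller sketch, assuming the dpg registry has already been primed with "pred_util:nlp" (a spaCy pipeline) and "pred_util:stopwords" as the docstring requires; the abstract text is illustrative.

# Illustrative abstract text.
abstract = ("Aspirin inhibits platelet aggregation , "
            "and this effect reduces the risk of stroke .")
for subj, relation, obj in generate_predicates(abstract):
    print(subj, '|', relation, '|', obj)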
Example #8
def setup_graph():
    ud = DependencyGraphBuilder.from_conll(listtree, 'tree1')

    pp = PredPatt(next(load_conllu(rawtree))[1],
                  opts=PredPattOpts(resolve_relcl=True,
                                    borrow_arg_for_relcl=True,
                                    resolve_conj=False,
                                    cut=True))

    graph = PredPattGraphBuilder.from_predpatt(pp, ud, 'tree1')

    return pp, graph
Example #9
def predpatt_visualize(s):
    sid = '{:x}'.format(zlib.adler32(s.encode()))
    pp = PredPatt.from_sentence(s)
    for i, e in enumerate(pp.events):
        tree = pp_dot_tree(e)
        tree.add_node(pydot.Node('label', label=s, shape='plaintext'))
        tree.add_edge(pydot.Edge('label', e.root.__repr__(), style='invis'))
        try:
            tree.write_png('tree_{}_{}.png'.format(sid, i))
        except AssertionError:
            print('AssertionError for: {}'.format(s))
            pass  # pydot errors are useless
Example #10
def extract_predpatt(path='../../data/corpora/ud/UD_English-EWT-r1.2/'):
    '''
        Extract PredPatt objects from CONLLU files
    '''

    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}

    for file in os.listdir(path):
        if file.endswith('.conllu'):
            with open(path + file, 'r') as infile:
                for sent_id, ud_parse in load_conllu(infile.read()):
                    patt[file + " " + sent_id] = \
                                                PredPatt(ud_parse, opts=options)

    return patt
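
A usage sketch for the function above; each value is a PredPatt object keyed by '<file> <sent_id>', and the directory defaults to the EWT r1.2 path shown in the signature.

# Uses the default EWT r1.2 directory from the signature above.
patt = extract_predpatt()
for key in sorted(patt)[:3]:
    print(key)
    print(patt[key].pprint())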
Example #11
    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        processed = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            print(f"=== Error occurred: {self._error.message}")
            self._error = ProcessingError()
            return None
        else:
            conll_example = [ud_parse for sent_id, ud_parse in load_conllu(processed)][
                0
            ]
            ppatt = PredPatt(conll_example, opts=self._opts)
            result = []
            for predicate in ppatt.instances:
                structure = {
                    "predicate": predicate.tokens,
                    "arguments": [x.tokens for x in predicate.arguments],
                }
                result.append(structure)

            return result
Example #12
def get_vector(sentence):
    global DEPENDENCIES, verbs_classes, class_index
    sent = PredPatt.from_sentence(sentence)
    #print sent.pprint()
    return_vector = numpy.zeros(len(DEPENDENCIES), dtype='float64')
    classes_vector = numpy.zeros(4, dtype='float64')
    google_vector = numpy.zeros(300, dtype='float64')
    for predicate in sent.events:
        #print "Predicate: ", predicate
        #print "Predicate Root Text: ", predicate.root.text
        lemmatised_word = lemmatizer.lemmatize(predicate.root.text.lower())
        for mclass in verbs_classes.keys():
            if lemmatised_word.upper() in verbs_classes[mclass]:
                classes_vector[class_dict[mclass]] += 1
        google_vector += get_word_vector(predicate.root.text)
        for argument in sent.argument_extract(predicate):
            #print "Argument: ", argument
            google_vector += get_word_vector(argument.root.text)
            for rule in argument.rules:
                #print "Rule: ", rule
                try:
                    rule_name = rule.edge
                except:
                    continue
                #print "Rule Name: ", rule_name
                try:
                    return_vector[DEPENDENCIES[rule_name.rel]] += 1
                except:
                    pass
    #print "Google Vector: ", len(google_vector)
    #print "Classes Vector: ", len(classes_vector)
    #print "Return Vector: ", len(return_vector)
    ans = numpy.append(google_vector,
                       numpy.append(return_vector, classes_vector))
    if numpy.all(ans == 0): return None
    return ans
Example #13
                feats = line.split('\t')
                features[feats[0]] = [feats[1].split(), feats[2].split()]

        # Load the predpatt objects for creating features
        files = ['/Downloads/UD_English-r1.2/en-ud-train.conllu',
                 '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
                 '/Downloads/UD_English-r1.2/en-ud-test.conllu']

        options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)  # Resolve relative clause
        patt = {}

        for file in files:
            path = home + file
            with open(path, 'r') as infile:
                for sent_id, ud_parse in load_conllu(infile.read()):
                    patt[file[27:] + " " + sent_id] = PredPatt(ud_parse, opts=options)

        data['Structure'] = data['Sentence.ID'].map(lambda x: (patt[x], features[x]))

        # Split the datasets into train, dev, test
        data_test = data[data['Split'] == 'test'].reset_index(drop=True)
        data_dev = data[data['Split'] == 'dev'].reset_index(drop=True)
        data = data[data['Split'] == 'train'].reset_index(drop=True)

#         Ridit scoring annotations and confidence ratings
#         for attr in attributes:
#             resp = attr_map[attr]
#             resp_conf = attr_conf[attr]
#             data[resp_conf + ".norm"] = data.groupby('Annotator.ID')[resp_conf].transform(ridit)
#             data_dev[resp_conf + ".norm"] = data_dev.groupby('Annotator.ID')[resp_conf].transform(ridit)
#             data_test[resp_conf + ".norm"] = data_test.groupby('Annotator.ID')[resp_conf].transform(ridit)
Example #14
parser = Parser.get_instance()
bad_sentence = {}
with io.open(args.out_fn, 'w', encoding='utf-8') as fout:
    with io.open(args.in_fn, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            row = line.split(args.d1)
            sentence = row[args.sentence_col][1:].rstrip()
            begin_mention = int(row[args.begin_mention_col])
            end_mention = int(row[args.end_mention_col])
            mention = sentence[begin_mention:end_mention]
            try:
                if sentence in bad_sentence:
                    raise Exception('bad sentence')
                parse = parser(sentence, tokenized=False)
                P = PredPatt(parse)
                predicates = [I.root.text
                              for I
                              in P.instances
                              if any((mention in e.phrase())
                                     for e
                                     in I.arguments)]
            except:
                bad_sentence[sentence]=1
                predicates = []
                pass
            s = ' ||| '.join([line, ';'.join(predicates)])
            fout.write(s)
            fout.write(u'\n')
# After everything print the bad_sentences
for s in bad_sentence:
    print(s)
Example #15
def foo(docs_path):
    """ - foo
    """

    print('checking file length')
    num_lines = sum(1 for line in open(docs_path))

    print('starting')
    with open(docs_path) as f:
        # arg_num_dict = {}
        pred_num_dict = {}
        subj_num_dict = {}
        obj_num_dict = {}
        claim_num_dict = {}
        pp_total_time = 0
        timeouts = 0
        bad_patterns = 0
        for idx, line in enumerate(f):
            aid, adjacent, in_doc, text = line.split('\u241E')
            t1 = datetime.datetime.now()
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(60)
            try:
                pp = PredPatt.from_sentence(text, cacheable=False)
            except Exception as msg:
                signal.alarm(0)
                timeouts += 1
                continue
            signal.alarm(0)
            t2 = datetime.datetime.now()
            d = t2 - t1
            pp_total_time += d.total_seconds()
            for pred, patt in pp.event_dict.items():
                # TODO: rework with following dependency trees
                #       and evaluating relevance of nodes with
                #       regards to cited doc
                if not patt.has_subj() or not patt.has_obj():
                    bad_patterns += 1
                    continue
                pred_norm = normalize(pred.text)
                if pred_norm not in pred_num_dict:
                    pred_num_dict[pred_norm] = 0
                pred_num_dict[pred_norm] += 1
                subj = normalize(patt.subj().phrase())
                obj = normalize(patt.obj().phrase())
                if subj not in subj_num_dict:
                    subj_num_dict[subj] = 0
                subj_num_dict[subj] += 1
                if obj not in obj_num_dict:
                    obj_num_dict[obj] = 0
                obj_num_dict[obj] += 1
                claim = '{} {} {}'.format(subj, pred_norm, obj)
                if claim not in claim_num_dict:
                    claim_num_dict[claim] = 0
                claim_num_dict[claim] += 1
            #     for arg in patt.arguments:
            #         arg_norm = normalize(arg.phrase())
            #         if arg_norm not in arg_num_dict:
            #             arg_num_dict[arg_norm] = 0
            #         arg_num_dict[arg_norm] += 1
            print('- - - - {}/{} lines - - - -'.format(idx, num_lines))
            pp_avg_time = pp_total_time / (idx + 1)
            print('# timeouts {}'.format(timeouts))
            print('# bad_patterns {}'.format(bad_patterns))
            print('avg time per context: {:.2f}s'.format(pp_avg_time))
            # sorted_arg = sorted(arg_num_dict.items(),
            #                     key=operator.itemgetter(1),
            #                     reverse=True)
            sorted_pred = sorted(pred_num_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
            sorted_subj = sorted(subj_num_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
            sorted_obj = sorted(obj_num_dict.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
            sorted_claim = sorted(claim_num_dict.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
            print('- - top 10 subjects - -')
            for subj, num in sorted_subj[:10]:
                print('{}: {}'.format(num, subj[:30]))
            print('- - top 10 predicates - -')
            for pred, num in sorted_pred[:10]:
                print('{}: {}'.format(num, pred[:30]))
            print('- - top 10 objects - -')
            for obj, num in sorted_obj[:10]:
                print('{}: {}'.format(num, obj[:30]))
            print('- - top 10 claims - -')
            for claim, num in sorted_claim[:10]:
                print('{}: {}'.format(num, claim[:100]))
            # print('- - top 10 args - -')
            # for arg, num in sorted_arg[:10]:
            #     print('{}: {}'.format(num, arg[:30]))
            # if idx%100 == 0:
            #     with open('arg_num_dict.json', 'w') as f:
            #         f.write(json.dumps(arg_num_dict))
            #     with open('pred_num_dict.json', 'w') as f:
            #         f.write(json.dumps(pred_num_dict))
        # sorted_arg = sorted(arg_num_dict.items(),
        #                     key=operator.itemgetter(1),
        #                     reverse=True)
        sorted_pred = sorted(pred_num_dict.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        sorted_subj = sorted(subj_num_dict.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        sorted_obj = sorted(obj_num_dict.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        sorted_claim = sorted(claim_num_dict.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
        print('- - top 100 subjects - -')
        for subj, num in sorted_subj[:100]:
            print('{}: {}'.format(num, subj[:30]))
        print('- - top 100 predicates - -')
        for pred, num in sorted_pred[:100]:
            print('{}: {}'.format(num, pred[:30]))
        print('- - top 100 objects - -')
        for obj, num in sorted_obj[:100]:
            print('{}: {}'.format(num, obj[:30]))
        print('- - top 100 claims - -')
        for claim, num in sorted_claim[:100]:
            print('{}: {}'.format(num, claim[:100]))
Example #16
def test():
    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('--filename', default='doc/DOCTEST.md')
    args = p.parse_args()

    sentences = re.findall(
        r'^> (.*)\n([\w\W]*?)(?=^>|<END>)',
        codecs.open(args.filename, encoding='utf-8').read() + '\n<END>',
        re.MULTILINE)

    # TODO: Use PredPatt.from_string instead of duplicating code here.
    parser = Parser.get_instance()

    passed = 0
    failed = 0
    blank = 0
    for s, chunk in sentences:
        s = s.strip()
        if not s:
            continue

        # use cached parse listed in doctest chunk.
        parse_chunk = re.findall(r'<!--parse=([\w\W]+?)-->', chunk)
        if parse_chunk:
            from predpatt.UDParse import DepTriple, UDParse
            [parse_chunk] = parse_chunk
            triples = [
                DepTriple(r, int(b), int(a)) for r, a, b in re.findall(
                    r'(\S+)\(\S+?/(\d+), \S+?/(\d+)\)', parse_chunk)
            ]
            tokens = s.split()
            [tags_chunk] = re.findall(r'<!--tags=([\w\W]+?)-->', chunk)
            tags = re.findall(r'\S+/(\S+)', tags_chunk)
            parse = UDParse(tokens, tags, triples)

        else:
            parse = parser(s)

        P = PredPatt(parse, ppattopts)
        relations = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x for x in zip(parse.tokens, parse.tags))
        parse = parse.pprint(K=4)

        relations = relations.replace('\t', '    ')
        relations = '\n'.join(line[4:].rstrip()
                              for line in relations.split('\n'))

        expected = []
        chunk = chunk.replace('\t', '    ')
        for line in chunk.split('\n'):
            if line.startswith('    '):
                line = line[4:].rstrip()
                expected.append(line)

        expected = '\n'.join(expected)

        if not expected.strip():
            blank += 1

        #got = '%s\n%s\n%s' % (tags, parse, relations)
        got = relations.strip() or '<empty>'
        got = re.sub(r'\s*\[.*\]', '', got)

        if expected.strip() == got.strip():
            #print colored('pass', 'green')
            passed += 1
        else:
            print()
            print(colored('> ' + s, 'yellow'))
            print(colored('fail', 'red'))
            print('expected:')
            for line in expected.split('\n'):
                print('   ', colored(line, 'blue'))
            print('got:')
            for line in got.split('\n'):
                print('   ', line)
            print()
            print(colored(tags, 'magenta'))
            print()
            print(colored(parse, 'magenta'))
            failed += 1

    msg = '[doctest] %.f%% (%s/%s) passed' % (passed * 100.0 /
                                              (passed + failed), passed,
                                              passed + failed)
    if failed == 0:
        print(msg)
    else:
        print()
        print(msg)
        print()
        if blank:
            print('blank:', blank)
Example #17
def main():
    # Data Locations
    parser = argparse.ArgumentParser(
        description='Recast UDS-Time duration to NLI format.')
    parser.add_argument('--udstime',
                        type=str,
                        default='time_eng_ud_v1.2_2015_10_30.tsv',
                        help='UDS-Time tsv dataset file location.')

    parser.add_argument(
        '--split',
        type=str,
        default='',
        help='If specified (train, dev, test), only that split is recast')

    parser.add_argument('--out_train',
                        type=str,
                        default='train/',
                        help='recast train data folder location')

    parser.add_argument('--out_dev',
                        type=str,
                        default='dev/',
                        help='recast dev data folder location')

    parser.add_argument('--out_test',
                        type=str,
                        default='test/',
                        help='recast test data folder location')

    args = parser.parse_args()

    # ### Import UDS Time
    uds_time = pd.read_csv(args.udstime, sep="\t")
    ewt = doc_utils.Corpus(uds_time=uds_time)
    df = ewt.process_data

    #######################################################
    ## Add features to UDS-time dataframe
    #######################################################

    df['Pred1.UPOS'] = df.apply(
        lambda row: get_predicate_pos(row, ewt, event=1), axis=1)
    df['Pred2.UPOS'] = df.apply(
        lambda row: get_predicate_pos(row, ewt, event=2), axis=1)

    ## Extract Predicate Full Text
    predicate_dict = {}
    for ud_data_path in ud_data:
        covered_set = set()
        fname = ud_data_path.split("/")[-1]
        data_name = fname.split(".")[0].split("-")[-1]

        #print(f"Start processing: {data_name}")
        with open(ud_data_path) as infile:
            data = infile.read()
            parsed = [(PredPatt(ud_parse, opts=options), sent_id)
                      for sent_id, ud_parse in load_conllu(data)]

        for pred_object, sentid in parsed:
            sentnum = sentid.split("_")[-1]
            sentenceid = fname + " " + sentnum
            for predicate_object in pred_object.instances:
                #print(f"sentenceid: {sentenceid}, pred: {predicate_object}")
                pred_text, _, pred_root_token, _ = predicate_info(
                    predicate_object)
                predicate_dict[sentenceid + "_" +
                               str(pred_root_token)] = pred_text
                #print(f"error at sentid :{sentenceid}")

        print(f"Finished creating predicate dictionary for : {data_name}\n")

    df['Pred1.Text.Full'] = df['Event1.ID'].map(lambda x: predicate_dict[x])
    df['Pred2.Text.Full'] = df['Event2.ID'].map(lambda x: predicate_dict[x])

    #######################################################
    ## Recast Data
    #######################################################

    pairid = -1  # count total pair ids
    # Count event-pairs skipped due to ambiguous text for highlighting predicate.
    skipcount = 0

    if args.split:
        splits = [args.split]
    else:
        splits = ['train', 'dev', 'test']

    for split in splits:
        data = []
        metadata = []

        curr_df = df[df['Split'] == split]
        print(f"Creating NLI instances for Data split: {split}")
        event_pair_ids = list(curr_df.groupby(['Event.Pair.ID']).groups.keys())

        pbar = tqdm(total=len(event_pair_ids))

        for idx, event_pair_id in enumerate(event_pair_ids):
            ## Predicate 1

            recasted_data, recasted_metadata, pairid, skipcount = create_duration_NLI(
                event_pair_id,
                df,
                ewt,
                pairid=pairid,
                skipcount=skipcount,
                event=1,
                sliding_window=1)
            if recasted_data:
                data += recasted_data
                metadata += recasted_metadata
            ## Predicate 2
            recasted_data, recasted_metadata, pairid, skipcount = create_duration_NLI(
                event_pair_id,
                df,
                ewt,
                pairid=pairid,
                skipcount=skipcount,
                event=2,
                sliding_window=1)
            if recasted_data:
                data += recasted_data
                metadata += recasted_metadata

            # if pairid%10000==0:
            # 	print(f"Total pair-ids processed so far: {pairid}, skipped so far: {skipcount}")
            pbar.update(1)

        out_folder = {
            'train': args.out_train,
            'dev': args.out_dev,
            'test': args.out_test
        }

        print(
            f"Total pair-ids processed so far: {pairid}, skipped so far: {skipcount}"
        )

        with open(out_folder[split] + "recast_temporal-duration_data.json",
                  'w') as out_data:
            json.dump(data, out_data, indent=4)

        with open(out_folder[split] + "recast_temporal-duration_metadata.json",
                  'w') as out_metadata:
            json.dump(metadata, out_metadata, indent=4)

    print(f"Total pair-ids: {pairid}")
    print(f'Total events skipped: {skipcount}')
Example #18
def hand_engineering(prot, batch_size, data, data_dev):
    '''
        Hand engineered feature extraction. Supports the following - UD,
        Verbnet classids, Wordnet supersenses, concreteness ratings, LCS
        eventivity scores
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB',
        'N': 'NOUN',
        'A': 'ADJ',
        'ADV': 'ADV',
        'PREP': 'ADP',
        'NUM': 'NUM',
        'INTJ': 'INTJ',
        'ART': 'DET',
        'C': 'CCONJ',
        'SCON': 'SCONJ',
        'PRON': 'PRON',
        'IDIO': 'X',
        'AVP': 'ADV'
    }
    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features
    files = [
        '/Downloads/UD_English-r1.2/en-ud-train.conllu',
        '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
        '/Downloads/UD_English-r1.2/en-ud-test.conllu'
    ]
    home = expanduser("~")
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}

    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse,
                                                                opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(lambda x:
                                                      (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()

    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = list(
        feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()]

    # Concreteness
    f = open(home + '/Desktop/protocols/data/concrete.pkl', 'rb')
    concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']
    f.close()

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # Wordnet supersenses(lexicographer names)
    supersenses = list(
        set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # Framenet
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' +
                      framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # Verbnet classids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    dict_feats = {}
    for f in verbnet_classids + lexical_feats + supersenses + frame_names + lcs_feats + all_ud_feature_cols + conc_cols:
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame) for sent, token, lemma in
        zip(raw_x, data['Root.Token'].tolist(), data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x, data_dev['Root.Token'].tolist(
        ), data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop(they're always zero)
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x
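
A hypothetical call for the argument protocol; `data` and `data_dev` are assumed to be the train and dev dataframes prepared earlier in this pipeline (with the Split.Sentence.ID, Root.Token and Lemma columns the function reads), and the hard-coded feature and UD file paths above must exist locally.

# `data` and `data_dev` are assumed dataframes from the preceding pipeline steps.
train_x, dev_x = hand_engineering(prot='arg', batch_size=128,
                                  data=data, data_dev=data_dev)
print(len(train_x), 'train batches,', len(dev_x), 'dev batches')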
Example #19
"""
Example of programmatic PredPatt usage.
"""

# Run PredPatt on sentence
from predpatt import PredPatt
sentence = 'Chris loves silly dogs and clever cats .'
P = PredPatt.from_sentence(sentence)

# Pretty-print output
print(P.pprint(track_rule=True, color=True))

print('______________________________________________________________________________')

# A deeper look into PredPatt's internal representations.
#
# Each extraction is kept in a list called instances. Below we will loop through
# each instance and print its arguments.
for x in P.instances:
    print()
    print(x, x.phrase())
    for a in x.arguments:
        print(' ', a, a.phrase())

        # Uncomment to list the rules which fired on this proposition, along
        # with an explanation.
        #for r in a.rules:
        #    print('    %s: %s' % (r, r.explain()))

print('______________________________________________________________________________')
print()
Example #20
    '/UD_English-r1.2/en-ud-dev.conllu', '/UD_English-r1.2/en-ud-test.conllu'
]
home = expanduser("~/Downloads/")
parsed = {'train': [], 'devte': []}
out_data = []

options = PredPattOpts(resolve_relcl=True,
                       borrow_arg_for_relcl=True,
                       resolve_conj=False,
                       cut=True)  # Resolve relative clause

path = home + '/UD_English-r1.2/en-ud-train.conllu'
with open(path, 'r') as infile:
    data = infile.read()
    parsed['train'] += [('en-ud-train.conllu' + " " + sent_id,
                         PredPatt(ud_parse, opts=options))
                        for sent_id, ud_parse in load_conllu(data)]

for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data = infile.read()
        parsed['devte'] += [(file[17:] + " " + sent_id,
                             PredPatt(ud_parse, opts=options))
                            for sent_id, ud_parse in load_conllu(data)]

c = {'train': 0, 'devte': 0}
d = {'train': 0, 'dev': 0, 'test': 0}
ign = {'train': 0, 'devte': 0}
prons_incl = [
    "you", "they", "yourself", "themselves", "them", "themself", "theirself",
Example #21
from predpatt import PredPatt

pp = PredPatt.from_sentence(
    'At the Pentagon briefing today, General Stanley McChrystal said that it looked a lot like terrorism.'
)
#print(pp.pprint())
# print(" ".join([token.text for token in pp.tokens]))
# print(pp.events)
# print(pp.event_dict)
# print(pp.events)

for event in pp.events:
    print(event)
    for argument in event.arguments:
        print(argument)
Example #22
def main():

    patterns = ''
    sentence = 'The quick brown fox jumped over the lazy dog .'
    tags = ''
    parse = ''
    if request.GET.get('sentence', '').strip():
        sentence = request.GET.get('sentence', '').strip()

    pp_opts = PredPattOpts()
    for k, v in sorted(PredPattOpts().__dict__.items()):
        v = int(float(request.GET.get(
            k, v)))  # all options are true/false for now.
        setattr(pp_opts, k, v)

    if sentence:

        #for sent in sent_detector.tokenize('"John saw Mary", said Jason. Larry met Sally for dinner.'):
        #    print tokenize(sent)

        original_sentence = sentence
        parse = parser(sentence, tokenized=False)

        P = PredPatt(parse, opts=pp_opts)
        patterns = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x for x in zip(parse.tokens, parse.tags))
        parse = parse.pprint(K=3)

        # remove predpatt's bracketed comments
        patterns = re.sub(r'\s*\[.*?\]', '', patterns)
        patterns = dedent(patterns)

    opts = []
    for k, v in sorted(pp_opts.__dict__.items()):
        # Create a hidden textbox with the false value because the values of
        # "unchecked" boxes don't get posted with form.
        opts.append('<input type="hidden" value="0" name="%s">' % (k, ))
        opts.append('<input type="checkbox" name="%s" value="1" %s> %s<br/>' %
                    (k, 'checked' if v else '', k))

    options = '\n'.join(opts)

    return template("""
<html>
<head>


<!-- JQuery -->
<script src="//code.jquery.com/jquery-2.1.4.min.js"></script>
<!-- Bootstrap -->
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"/>
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap-theme.min.css"/>
<script src="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
<!-- Chosen Dropdown Library -->
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.css"/>
<script src="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.jquery.min.js"></script>

<style>
html {
     overflow: -moz-scrollbars-vertical;
     overflow: scroll;
}
</style>
</head>
<body>
<div style="width: 800px; padding: 10px; margin-left: auto; margin-right: auto;">
<h1>PredPatt</h1>
<strong>Sentence</strong>
<pre>{{sentence}}</pre>

<strong>Propositions</strong>
<div id="propositions">
<pre>
{{patterns}}
</pre>

<div>
<button class="btn" data-toggle="collapse" data-target="#parse" style="margin-bottom: 10px;">Toggle Parse</button>
<div id="parse" class="collapse">
<strong>Tags</strong>
<pre>
{{tags}}
</pre>
<strong>Parse</strong>
<pre>
{{parse}}
</pre>
</div>
</div>
<strong>Input</strong>
<form action="/" method="GET">
<textarea type="text" name="sentence" style="height:50px; width: 100%;"
placeholder="e.g., The quick brown fox jumped over the lazy dog."
class="form-control"
autofocus>{{original_sentence}}</textarea>
<div style="padding: 10px;"><strong>Options</strong><br/>""" + options + """
</div>
<br/>
<input type="submit" name="save" value="submit">
</form>
</div>
</body>
</html>
    """,
                    sentence=sentence,
                    original_sentence=original_sentence,
                    patterns=patterns,
                    tags=tags,
                    parse=parse,
                    options=options)

id = 1
files = ['/UD_English-r1.2/en-ud-dev.conllu',
         '/UD_English-r1.2/en-ud-test.conllu']
home = expanduser("~/Downloads/")
parsed = {'train': [], 'devte': []}
out_data = []

# Resolve relative clause
options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)

path = home + '/UD_English-r1.2/en-ud-train.conllu'
with open(path, 'r') as infile:
    data = infile.read()
    parsed['train'] += [('en-ud-train.conllu' + " " + sent_id, PredPatt(ud_parse, opts=options)) for sent_id, ud_parse in load_conllu(data)]

for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data = infile.read()
        parsed['devte'] += [(file[17:] + " " + sent_id, PredPatt(ud_parse, opts=options)) for sent_id, ud_parse in load_conllu(data)]
# random.shuffle(parsed['train'])
c = {'train': 0, 'devte': 0}
d = {'train': 0, 'dev': 0, 'test': 0}
copp = {'train': 0, 'devte': 0}
auxverb = {'train': 0, 'devte': 0}
ign = {'train': 0, 'devte': 0}
adj = {'train': 0, 'devte': 0}

for write_file in ['pred_train_data.csv', 'pred_devte_data.csv']:
Example #24
            decomp_lines_json_chunk = [
                x for x in decomp_lines_json if x['doc-id'] == doc_id
            ]  #get the lines associated with this chunk
            line_idx = 0  #Where we are in the decomp json file

            valid_instance = True
            for sent_id, parse in conll_iter:
                sent_id = int(sent_id.split('_')[1])

                if line_idx >= len(decomp_lines_json_chunk):
                    break

                if decomp_lines_json_chunk[line_idx][
                        'sent-id'] == sent_id:  #check if there is a matching decomp extraction for this conll line
                    json_line = decomp_lines_json_chunk[line_idx]
                    ppat = PredPatt(parse)
                    pred_heads = json_line['predicate-head-idxs']
                    pred_args = json_line['pred-args']
                    assert len(pred_heads) <= len(pred_args)
                    event_text = []
                    event_args = []
                    for idx, head in enumerate(pred_heads):
                        head_args = [x for x in pred_args if x[0] == head]
                        assert len(head_args) > 0
                        head_arg_id = head_args[0][1]
                        if head < len(ppat.tokens) and ppat.tokens[
                                head] in ppat.event_dict.keys(
                                ) and head_arg_id < len(ppat.tokens):
                            predicate = ppat.event_dict[ppat.tokens[head]]
                            pred_text = predpatt2text(predicate)
                            event_text.append(pred_text)
def build_sentence_representation(s):
    """ Build representation of a sentence by analyzing predpatt output.

        Returns a weighted list of lists of terms.
    """

    s = merge_citation_token_lists(s)
    s = remove_qutation_marks(s)
    lemmatizer = WordNetLemmatizer()
    raw_lists = []
    rep_lists = []
    rep_lists_alt = []  # to be consistent with double annotating for 3 and 3.1
    try:
        pp = PredPatt.from_sentence(s, cacheable=False)  # for speed tests
    except Exception as e:
        print('= = = PredPatt exception = = =')
        print('input:\n{}'.format(s))
        print('exception:\n{}'.format(e))
        return rep_lists, rep_lists_alt
    if len(pp.events) == 0:
        return rep_lists, rep_lists_alt
    if CIT_BASED:
        for e in pp.events:
            depth, rep = build_tree_representation(e)
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, r) for r in rep]
            if len(rep) > 0:
                raw_lists.append([depth, rep])
        weight = 1
        for rl in sorted(raw_lists, key=itemgetter(0)):
            rep_lists.append([weight, rl[1]])
            weight *= .5
        if len(rep_lists) == 0:
            fallback = build_noun_representation(pp.events[0],
                                                 global_root=True)
            if INCLUDE_PREDICATE:
                pred = get_predicate(pp.events[0].root)
                fallback = ['{}:{}'.format(pred, f) for f in fallback]
            if len(fallback) > 0:
                rep_lists = [[.25, fallback]]
    else:
        # make a PPv3 and a PPv3.1 representation
        # - - - 3.1 - - -
        reps = []
        for e in pp.events:
            rep = build_noun_representation(e)  # 3.1
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, f) for f in rep]
            reps.extend(rep)
        if len(reps) > 0:
            rep_lists = [[1, reps]]
        # - - - 3 - - -
        reps_alt = []
        for e in pp.events:
            rep = build_noun_representation(e, global_root=True)  # 3
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, f) for f in rep]
            reps_alt.extend(rep)
        if len(reps_alt) > 0:
            rep_lists_alt = [[1, reps_alt]]

    rep_lists = normalize_rep_lists(rep_lists, lemmatizer)
    rep_lists_alt = normalize_rep_lists(rep_lists_alt, lemmatizer)
    return rep_lists, rep_lists_alt
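
A usage sketch for the function above; the input sentence is illustrative, and CIT_BASED, INCLUDE_PREDICATE and the helper functions it calls are module-level assumptions of this script.

# Illustrative sentence; flags and helpers are assumed to be configured elsewhere.
rep_lists, rep_lists_alt = build_sentence_representation(
    'The authors extend earlier work on citation recommendation .')
print(rep_lists)       # weighted lists of lemmatized term lists
print(rep_lists_alt)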