def parse(text):
    """
    Primary function to run syntaxnet and PredPatt over input sentences.
    """
    parse_tree, trace = annotate_text(text)
    conll_parsed = parse_to_conll(parse_tree)
    conll_pp = [ud_parse for sent_id, ud_parse in load_conllu(conll_parsed)][0]

    # PredPatt options. Modify as needed.
    resolve_relcl = True   # relative clauses
    resolve_appos = True   # appositional modifiers
    resolve_amod = True    # adjectival modifiers
    resolve_conj = True    # conjunction
    resolve_poss = True    # possessives
    ud = dep_v2.VERSION    # the version of UD
    opts = PredPattOpts(resolve_relcl=resolve_relcl,
                        resolve_appos=resolve_appos,
                        resolve_amod=resolve_amod,
                        resolve_conj=resolve_conj,
                        resolve_poss=resolve_poss,
                        ud=ud)
    ppatt = PredPatt(conll_pp, opts=opts)

    # NOTE: This returns the pretty-print formatted string from PredPatt,
    # largely as a placeholder for JSON compatibility within the REST API.
    return {'predpatt': ppatt.pprint(), 'conll': conll_parsed, 'original': text}
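# A hypothetical call shape for parse() above, for illustration only. It assumes
# annotate_text and parse_to_conll are available in the same module (they are not
# defined here); the output keys come directly from the return statement.
result = parse("The quick brown fox jumped over the lazy dog.")
print(result['predpatt'])   # pretty-printed PredPatt extractions
print(result['conll'])      # the CoNLL parse the extractions were made from
print(result['original'])   # the raw input text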
def test(data):
    from predpatt import PredPatt, load_conllu

    def fail(g, t):
        if len(g) != len(t):
            return True
        else:
            for i in g:
                if i not in t:
                    return True

    no_color = lambda x, _: x
    count, failed = 0, 0
    ret = ""
    for sent_id, ud_parse in load_conllu(data):
        count += 1
        pp = PredPatt(ud_parse)
        sent = ' '.join(t.text for t in pp.tokens)
        linearized_pp = linearize(pp)
        gold_preds = [predicate.format(C=no_color, track_rule=False)
                      for predicate in pp.instances
                      if likely_to_be_pred(predicate)]
        test_preds = pprint_preds(construct_pred_from_flat(linearized_pp.split()))
        if fail(gold_preds, test_preds):
            failed += 1
            ret += ("Sent: %s\nLinearized PredPatt:\n\t%s\nGold:\n%s\nYours:\n%s\n\n"
                    % (sent, linearized_pp,
                       "\n".join(gold_preds), "\n".join(test_preds)))
    print(ret)
    print("You have tested %d instances, and %d failed the test." % (count, failed))
def extract_predpatt_text(row, eid_num: int):
    '''
    Given a row of a pandas dataframe of TB data and eid_num (1 or 2),
    output the PredPatt predicate text (adds copula fillers in the text).
    '''
    tokenid = getattr(row, f'eid{eid_num}_token_id')
    conllu_string = getattr(row, f'eid{eid_num}_sent_conllu')
    parsed_tb = [PredPatt(ud_parse, opts=options)
                 for sent_id, ud_parse in load_conllu(conllu_string)]
    pred_objects = parsed_tb[0].instances
    curr_text = getattr(row, f'eid{eid_num}_text')
    pred_match = False
    if pred_objects:
        for pred in pred_objects:
            if int(pred.root.position) == int(tokenid):
                pred_match = True
                pred_object = pred
                break
            else:
                pred_match = False
        if pred_match:
            pred_text, _, _, _ = predicate_info(pred_object)
            return pred_text
        else:
            return curr_text
    else:
        return getattr(row, f'eid{eid_num}_text')
def get_events_and_text(sent):
    """
    sent is a spaCy-parsed sentence (parsed through the default English spaCy
    pipeline).

    Extract the events and the text of the events from a line of COPA.
    """
    text = sent.text
    sorels = ['nsubj', 'dobj', 'iobj']
    outputs = []
    pp = PredPatt.from_sentence(text)
    events = pp.events
    for event in events:
        position = event.position
        args = event.arguments
        event_rels = {}
        for a in args:
            head = a.root
            govrel = head.gov_rel
            event_rels[govrel] = head
        lemma = sent[position].lemma_
        if 'nsubj' in event_rels:
            e1 = lemma + '->nsubj'
            e1_text = predpatt2text(event)
        elif 'dobj' in event_rels:
            e1 = lemma + '->dobj'
            e1_text = predpatt2text(event)
        elif 'iobj' in event_rels:
            e1 = lemma + '->iobj'
            e1_text = predpatt2text(event)
        else:
            e1 = lemma + '->nsubj'
            e1_text = predpatt2text(event)
        outputs.append({'e1': e1, 'e1_text': e1_text})
    return outputs
def extract_triples(input_remaining, params):
    opts = PredPattOpts(
        resolve_relcl=True,   # relative clauses
        resolve_appos=True,   # appositional modifiers
        resolve_amod=True,    # adjectival modifiers
        resolve_conj=True,    # conjunction
        resolve_poss=True,    # possessives
        ud=dep_v1.VERSION,    # the version of UD
    )
    triples = {}
    remaining = {}
    for idx in input_remaining:
        for line in input_remaining[idx]:
            if line.strip():
                try:
                    pp = PredPatt.from_sentence(line, opts=opts, cacheable=False)
                    extractions = get_predpatt_triples(pp, line)
                    if extractions:
                        triples.setdefault(idx, []).extend(extractions)
                except KeyError:
                    pass
        if idx not in triples:
            remaining[idx] = input_remaining[idx]
            triples[idx] = []
    return triples, remaining
def from_conll(cls,
               corpus: Union[str, TextIO],
               name: str = 'ewt',
               options: Optional[PredPattOpts] = None) -> 'PredPattCorpus':
    """Load a CoNLL dependency corpus and apply predpatt

    Parameters
    ----------
    corpus
        (path to) a .conllu file
    name
        the name of the corpus; used in constructing treeids
    options
        options for predpatt extraction
    """
    options = DEFAULT_PREDPATT_OPTIONS if options is None else options

    corp_is_str = isinstance(corpus, str)

    if corp_is_str and splitext(basename(corpus))[1] == '.conllu':
        with open(corpus) as infile:
            data = infile.read()
    elif corp_is_str:
        data = corpus
    else:
        data = corpus.read()

    # load the CoNLL dependency parses as graphs
    ud_corp = {name+'-'+str(i+1): [line.split()
                                   for line in block.split('\n')
                                   if len(line) > 0
                                   if line[0] != '#']
               for i, block in enumerate(data.split('\n\n'))}
    ud_corp = CoNLLDependencyTreeCorpus(ud_corp)

    # extract the predpatt for those dependency parses
    try:
        predpatt = {name+'-'+sid.split('_')[1]: PredPatt(ud_parse, opts=options)
                    for sid, ud_parse in load_conllu(data)}
    except ValueError:
        errmsg = 'PredPatt was unable to parse the CoNLL you provided.' +\
                 ' This is likely due to using a version of UD that is' +\
                 ' incompatible with PredPatt. Use of version 1.2 is' +\
                 ' suggested.'
        raise ValueError(errmsg)

    return cls({n: (pp, ud_corp[n]) for n, pp in predpatt.items()})
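# A minimal sketch of the two input shapes the loader above accepts (a .conllu
# path or a raw CoNLL string). It assumes this classmethod is bound to a
# PredPattCorpus-like class and that 'en-ud-dev.conllu' is a local UD v1.2 file;
# both are assumptions made for illustration only.
corpus_from_path = PredPattCorpus.from_conll('en-ud-dev.conllu', name='ewt')

with open('en-ud-dev.conllu') as f:
    corpus_from_string = PredPattCorpus.from_conll(f.read(), name='ewt')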
def generate_predicates(abstract_text: str,
                        pred_patt_opts=None) -> Iterable[Tuple[str, str, str]]:
    "Requires that pred_util:nlp and pred_util:stopwords be initialized"
    nlp = dpg.get("pred_util:nlp")
    parser = Spacy2ConllParser(nlp=nlp)
    stopwords = dpg.get("pred_util:stopwords")

    doc = nlp(abstract_text)
    for sent in doc.sents:
        # if the sentence is very long
        if len(sent) >= 20:
            word_count = defaultdict(int)
            for tok in sent:
                word_count[str(tok)] += 1
            # if one word dominates the long sentence,
            # we likely generated the same word over and over
            if max(word_count.values()) >= len(sent) * 0.2:
                continue
        conllu = "".join(list(parser.parse(input_str=str(sent))))
        for _, pred_patt_parse in load_conllu(conllu):
            predicates = PredPatt(pred_patt_parse, opts=pred_patt_opts).instances
            for predicate in predicates:
                # We only care about 2-entity predicates
                if len(predicate.arguments) == 2:
                    a_ents, b_ents = [
                        # Get the set of entities
                        filter(
                            # Not in the stopword list
                            lambda x: x not in stopwords,
                            [str(e).strip() for e in nlp(args.phrase()).ents]
                        )
                        # For each argument
                        for args in predicate.arguments
                    ]
                    # Slight cleaning needed to better match the predicate phrase.
                    # Note that PredPatt predicates use ?a and ?b placeholders.
                    predicate_stmt = (
                        re.match(
                            r".*\?a(.*)\?b.*",  # get text between placeholders
                            predicate.phrase()
                        )
                        .group(1)  # the group matched between the placeholders
                        .strip()
                    )
                    if len(predicate_stmt) > 0:
                        # We're going to iterate over all entity pairs
                        for a, b in product(a_ents, b_ents):
                            if a != b:
                                yield (a, predicate_stmt, b)
def setup_graph():
    ud = DependencyGraphBuilder.from_conll(listtree, 'tree1')
    pp = PredPatt(next(load_conllu(rawtree))[1],
                  opts=PredPattOpts(resolve_relcl=True,
                                    borrow_arg_for_relcl=True,
                                    resolve_conj=False,
                                    cut=True))
    graph = PredPattGraphBuilder.from_predpatt(pp, ud, 'tree1')
    return pp, graph
def predpatt_visualize(s):
    sid = '{:x}'.format(zlib.adler32(s.encode()))
    pp = PredPatt.from_sentence(s)
    for i, e in enumerate(pp.events):
        tree = pp_dot_tree(e)
        tree.add_node(pydot.Node('label', label=s, shape='plaintext'))
        tree.add_edge(pydot.Edge('label', e.root.__repr__(), style='invis'))
        try:
            tree.write_png('tree_{}_{}.png'.format(sid, i))
        except AssertionError:
            # pydot errors are useless
            print('AssertionError for: {}'.format(s))
def extract_predpatt(path='../../data/corpora/ud/UD_English-EWT-r1.2/'):
    '''
    Extract PredPatt objects from CoNLL-U files.
    '''
    # Resolve relative clauses
    options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True,
                           resolve_conj=False, cut=True)
    patt = {}
    for file in os.listdir(path):
        if file.endswith('.conllu'):
            with open(path + file, 'r') as infile:
                for sent_id, ud_parse in load_conllu(infile.read()):
                    patt[file + " " + sent_id] = PredPatt(ud_parse, opts=options)
    return patt
def extract(self, sentence: str) -> List[Dict[str, Any]]:
    processed = self.pipeline.process(sentence, self._error)
    if self._error.occurred():
        print(f"=== Error occurred: {self._error.message}")
        self._error = ProcessingError()
        return None
    else:
        conll_example = [ud_parse for sent_id, ud_parse in load_conllu(processed)][0]
        ppatt = PredPatt(conll_example, opts=self._opts)
        result = []
        for predicate in ppatt.instances:
            structure = {
                "predicate": predicate.tokens,
                "arguments": [x.tokens for x in predicate.arguments],
            }
            result.append(structure)
        return result
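# A hedged sketch of a consumer for extract() above. 'extractor' is a
# hypothetical instance of the owning class; the .text attribute on tokens is
# assumed, consistent with its use elsewhere in this collection.
structures = extractor.extract("Chris loves silly dogs and clever cats .")
if structures is not None:
    for s in structures:
        pred_text = " ".join(t.text for t in s["predicate"])
        arg_texts = [" ".join(t.text for t in arg) for arg in s["arguments"]]
        print(pred_text, arg_texts)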
def get_vector(sentence):
    global DEPENDENCIES, verbs_classes, class_index
    sent = PredPatt.from_sentence(sentence)
    return_vector = numpy.zeros(len(DEPENDENCIES), dtype='float64')
    classes_vector = numpy.zeros(4, dtype='float64')
    google_vector = numpy.zeros(300, dtype='float64')
    for predicate in sent.events:
        lemmatised_word = lemmatizer.lemmatize(predicate.root.text.lower())
        for mclass in verbs_classes.keys():
            if lemmatised_word.upper() in verbs_classes[mclass]:
                classes_vector[class_dict[mclass]] += 1
        google_vector += get_word_vector(predicate.root.text)
        for argument in sent.argument_extract(predicate):
            google_vector += get_word_vector(argument.root.text)
            for rule in argument.rules:
                try:
                    rule_name = rule.edge
                except:
                    continue
                try:
                    return_vector[DEPENDENCIES[rule_name.rel]] += 1
                except:
                    pass
    ans = numpy.append(google_vector, numpy.append(return_vector, classes_vector))
    if numpy.all(ans == 0):
        return None
    return ans
        feats = line.split('\t')
        features[feats[0]] = [feats[1].split(), feats[2].split()]

# Load the predpatt objects for creating features
files = ['/Downloads/UD_English-r1.2/en-ud-train.conllu',
         '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
         '/Downloads/UD_English-r1.2/en-ud-test.conllu']
# Resolve relative clauses
options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True,
                       resolve_conj=False, cut=True)
patt = {}
for file in files:
    path = home + file
    with open(path, 'r') as infile:
        for sent_id, ud_parse in load_conllu(infile.read()):
            patt[file[27:] + " " + sent_id] = PredPatt(ud_parse, opts=options)

data['Structure'] = data['Sentence.ID'].map(lambda x: (patt[x], features[x]))

# Split the dataset into train, dev, test
data_test = data[data['Split'] == 'test'].reset_index(drop=True)
data_dev = data[data['Split'] == 'dev'].reset_index(drop=True)
data = data[data['Split'] == 'train'].reset_index(drop=True)

# Ridit-score annotations and confidence ratings
# for attr in attributes:
#     resp = attr_map[attr]
#     resp_conf = attr_conf[attr]
#     data[resp_conf + ".norm"] = data.groupby('Annotator.ID')[resp_conf].transform(ridit)
#     data_dev[resp_conf + ".norm"] = data_dev.groupby('Annotator.ID')[resp_conf].transform(ridit)
#     data_test[resp_conf + ".norm"] = data_test.groupby('Annotator.ID')[resp_conf].transform(ridit)
parser = Parser.get_instance()
bad_sentence = {}
with io.open(args.out_fn, 'w', encoding='utf-8') as fout:
    with io.open(args.in_fn, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            row = line.split(args.d1)
            sentence = row[args.sentence_col][1:].rstrip()
            begin_mention = int(row[args.begin_mention_col])
            end_mention = int(row[args.end_mention_col])
            mention = sentence[begin_mention:end_mention]
            try:
                if sentence in bad_sentence:
                    raise Exception('bad sentence')
                parse = parser(sentence, tokenized=False)
                P = PredPatt(parse)
                predicates = [I.root.text for I in P.instances
                              if any((mention in e.phrase()) for e in I.arguments)]
            except:
                bad_sentence[sentence] = 1
                predicates = []
            s = ' ||| '.join([line, ';'.join(predicates)])
            fout.write(s)
            fout.write(u'\n')

# After everything, print the bad sentences
for s in bad_sentence:
def foo(docs_path):
    """
    - foo
    """
    print('checking file length')
    num_lines = sum(1 for line in open(docs_path))
    print('starting')
    with open(docs_path) as f:
        pred_num_dict = {}
        subj_num_dict = {}
        obj_num_dict = {}
        claim_num_dict = {}
        pp_total_time = 0
        timeouts = 0
        bad_patterns = 0
        for idx, line in enumerate(f):
            aid, adjacent, in_doc, text = line.split('\u241E')
            t1 = datetime.datetime.now()
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(60)
            try:
                pp = PredPatt.from_sentence(text, cacheable=False)
            except Exception as msg:
                signal.alarm(0)
                timeouts += 1
                continue
            signal.alarm(0)
            t2 = datetime.datetime.now()
            d = t2 - t1
            pp_total_time += d.total_seconds()
            for pred, patt in pp.event_dict.items():
                # TODO: rework with following dependency trees and evaluating
                # relevance of nodes with regard to the cited doc
                if not patt.has_subj() or not patt.has_obj():
                    bad_patterns += 1
                    continue
                pred_norm = normalize(pred.text)
                if pred_norm not in pred_num_dict:
                    pred_num_dict[pred_norm] = 0
                pred_num_dict[pred_norm] += 1
                subj = normalize(patt.subj().phrase())
                obj = normalize(patt.obj().phrase())
                if subj not in subj_num_dict:
                    subj_num_dict[subj] = 0
                subj_num_dict[subj] += 1
                if obj not in obj_num_dict:
                    obj_num_dict[obj] = 0
                obj_num_dict[obj] += 1
                claim = '{} {} {}'.format(subj, pred_norm, obj)
                if claim not in claim_num_dict:
                    claim_num_dict[claim] = 0
                claim_num_dict[claim] += 1
            print('- - - - {}/{} lines - - - -'.format(idx, num_lines))
            pp_avg_time = pp_total_time / (idx + 1)
            print('# timeouts {}'.format(timeouts))
            print('# bad_patterns {}'.format(bad_patterns))
            print('avg time per context: {:.2f}s'.format(pp_avg_time))
            sorted_pred = sorted(pred_num_dict.items(),
                                 key=operator.itemgetter(1), reverse=True)
            sorted_subj = sorted(subj_num_dict.items(),
                                 key=operator.itemgetter(1), reverse=True)
            sorted_obj = sorted(obj_num_dict.items(),
                                key=operator.itemgetter(1), reverse=True)
            sorted_claim = sorted(claim_num_dict.items(),
                                  key=operator.itemgetter(1), reverse=True)
            print('- - top 10 subjects - -')
            for subj, num in sorted_subj[:10]:
                print('{}: {}'.format(num, subj[:30]))
            print('- - top 10 predicates - -')
            for pred, num in sorted_pred[:10]:
                print('{}: {}'.format(num, pred[:30]))
            print('- - top 10 objects - -')
            for obj, num in sorted_obj[:10]:
                print('{}: {}'.format(num, obj[:30]))
            print('- - top 10 claims - -')
            for claim, num in sorted_claim[:10]:
                print('{}: {}'.format(num, claim[:100]))
        sorted_pred = sorted(pred_num_dict.items(),
                             key=operator.itemgetter(1), reverse=True)
        sorted_subj = sorted(subj_num_dict.items(),
                             key=operator.itemgetter(1), reverse=True)
        sorted_obj = sorted(obj_num_dict.items(),
                            key=operator.itemgetter(1), reverse=True)
        sorted_claim = sorted(claim_num_dict.items(),
                              key=operator.itemgetter(1), reverse=True)
        print('- - top 100 subjects - -')
        for subj, num in sorted_subj[:100]:
            print('{}: {}'.format(num, subj[:30]))
        print('- - top 100 predicates - -')
        for pred, num in sorted_pred[:100]:
            print('{}: {}'.format(num, pred[:30]))
        print('- - top 100 objects - -')
        for obj, num in sorted_obj[:100]:
            print('{}: {}'.format(num, obj[:30]))
        print('- - top 100 claims - -')
        for claim, num in sorted_claim[:100]:
            print('{}: {}'.format(num, claim[:100]))
def test():
    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('--filename', default='doc/DOCTEST.md')
    args = p.parse_args()

    sentences = re.findall('^> (.*)\n([\w\W]*?)(?=^>|<END>)',
                           codecs.open(args.filename, encoding='utf-8').read() + '\n<END>',
                           re.MULTILINE)

    # TODO: Use PredPatt.from_string instead of duplicating code here.
    parser = Parser.get_instance()

    passed = 0
    failed = 0
    blank = 0
    for s, chunk in sentences:
        s = s.strip()
        if not s:
            continue
        # use cached parse listed in doctest chunk.
        parse_chunk = re.findall('<\!--parse=([\w\W]+?)-->', chunk)
        if parse_chunk:
            from predpatt.UDParse import DepTriple, UDParse
            [parse_chunk] = parse_chunk
            triples = [DepTriple(r, int(b), int(a))
                       for r, a, b in re.findall('(\S+)\(\S+?/(\d+), \S+?/(\d+)\)',
                                                 parse_chunk)]
            tokens = s.split()
            [tags_chunk] = re.findall('<\!--tags=([\w\W]+?)-->', chunk)
            tags = re.findall('\S+/(\S+)', tags_chunk)
            parse = UDParse(tokens, tags, triples)
        else:
            parse = parser(s)

        P = PredPatt(parse, ppattopts)
        relations = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x for x in zip(parse.tokens, parse.tags))
        parse = parse.pprint(K=4)

        relations = relations.replace('\t', ' ')
        relations = '\n'.join(line[4:].rstrip() for line in relations.split('\n'))

        expected = []
        chunk = chunk.replace('\t', ' ')
        for line in chunk.split('\n'):
            if line.startswith(' '):
                line = line[4:].rstrip()
                expected.append(line)
        expected = '\n'.join(expected)

        if not expected.strip():
            blank += 1

        #got = '%s\n%s\n%s' % (tags, parse, relations)
        got = relations.strip() or '<empty>'
        got = re.sub(r'\s*\[.*\]', '', got)

        if expected.strip() == got.strip():
            passed += 1
        else:
            print()
            print(colored('> ' + s, 'yellow'))
            print(colored('fail', 'red'))
            print('expected:')
            for line in expected.split('\n'):
                print(' ', colored(line, 'blue'))
            print('got:')
            for line in got.split('\n'):
                print(' ', line)
            print()
            print(colored(tags, 'magenta'))
            print()
            print(colored(parse, 'magenta'))
            failed += 1

    msg = '[doctest] %.f%% (%s/%s) passed' % (passed * 100.0 / (passed + failed),
                                              passed, passed + failed)
    if failed == 0:
        print(msg)
    else:
        print()
        print(msg)
        print()
    if blank:
        print('blank:', blank)
def main():
    # Data locations
    parser = argparse.ArgumentParser(
        description='Recast UDS-Time duration to NLI format.')
    parser.add_argument('--udstime', type=str,
                        default='time_eng_ud_v1.2_2015_10_30.tsv',
                        help='UDS-Time tsv dataset file location.')
    parser.add_argument('--split', type=str, default='',
                        help='If specified (train, dev, test), only that split is recasted.')
    parser.add_argument('--out_train', type=str, default='train/',
                        help='Recasted train data folder location.')
    parser.add_argument('--out_dev', type=str, default='dev/',
                        help='Recasted dev data folder location.')
    parser.add_argument('--out_test', type=str, default='test/',
                        help='Recasted test data folder location.')
    args = parser.parse_args()

    # Import UDS-Time
    uds_time = pd.read_csv(args.udstime, sep="\t")
    ewt = doc_utils.Corpus(uds_time=uds_time)
    df = ewt.process_data

    #######################################################
    # Add features to the UDS-Time dataframe
    #######################################################
    df['Pred1.UPOS'] = df.apply(
        lambda row: get_predicate_pos(row, ewt, event=1), axis=1)
    df['Pred2.UPOS'] = df.apply(
        lambda row: get_predicate_pos(row, ewt, event=2), axis=1)

    # Extract predicate full text
    predicate_dict = {}
    for ud_data_path in ud_data:
        covered_set = set()
        fname = ud_data_path.split("/")[-1]
        data_name = fname.split(".")[0].split("-")[-1]
        with open(ud_data_path) as infile:
            data = infile.read()
            parsed = [(PredPatt(ud_parse, opts=options), sent_id)
                      for sent_id, ud_parse in load_conllu(data)]
        for pred_object, sentid in parsed:
            sentnum = sentid.split("_")[-1]
            sentenceid = fname + " " + sentnum
            for predicate_object in pred_object.instances:
                pred_text, _, pred_root_token, _ = predicate_info(predicate_object)
                predicate_dict[sentenceid + "_" + str(pred_root_token)] = pred_text
        print(f"Finished creating predicate dictionary for: {data_name}\n")

    df['Pred1.Text.Full'] = df['Event1.ID'].map(lambda x: predicate_dict[x])
    df['Pred2.Text.Full'] = df['Event2.ID'].map(lambda x: predicate_dict[x])

    #######################################################
    # Recast data
    #######################################################
    pairid = -1     # count total pair ids
    skipcount = 0   # count event pairs skipped due to ambiguous text for highlighting the predicate

    if args.split:
        splits = [args.split]
    else:
        splits = ['train', 'dev', 'test']

    for split in splits:
        data = []
        metadata = []
        curr_df = df[df['Split'] == split]
        print(f"Creating NLI instances for data split: {split}")
        event_pair_ids = list(curr_df.groupby(['Event.Pair.ID']).groups.keys())
        pbar = tqdm(total=len(event_pair_ids))
        for idx, event_pair_id in enumerate(event_pair_ids):
            # Predicate 1
            recasted_data, recasted_metadata, pairid, skipcount = create_duration_NLI(
                event_pair_id, df, ewt, pairid=pairid, skipcount=skipcount,
                event=1, sliding_window=1)
            if recasted_data:
                data += recasted_data
                metadata += recasted_metadata
            # Predicate 2
            recasted_data, recasted_metadata, pairid, skipcount = create_duration_NLI(
                event_pair_id, df, ewt, pairid=pairid, skipcount=skipcount,
                event=2, sliding_window=1)
            if recasted_data:
                data += recasted_data
                metadata += recasted_metadata
            pbar.update(1)

        out_folder = {'train': args.out_train,
                      'dev': args.out_dev,
                      'test': args.out_test}
        print(f"Total pair-ids processed so far: {pairid}, skipped so far: {skipcount}")

        with open(out_folder[split] + "recast_temporal-duration_data.json", 'w') as out_data:
            json.dump(data, out_data, indent=4)
        with open(out_folder[split] + "recast_temporal-duration_metadata.json", 'w') as out_metadata:
            json.dump(metadata, out_metadata, indent=4)

    print(f"Total pair-ids: {pairid}")
    print(f"Total events skipped: {skipcount}")
def hand_engineering(prot, batch_size, data, data_dev):
    '''
    Hand-engineered feature extraction. Supports the following:
    UD, VerbNet class ids, WordNet supersenses, concreteness ratings,
    LCS eventivity scores.
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB', 'N': 'NOUN', 'A': 'ADJ', 'ADV': 'ADV', 'PREP': 'ADP',
        'NUM': 'NUM', 'INTJ': 'INTJ', 'ART': 'DET', 'C': 'CCONJ',
        'SCON': 'SCONJ', 'PRON': 'PRON', 'IDIO': 'X', 'AVP': 'ADV'
    }

    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features
    files = ['/Downloads/UD_English-r1.2/en-ud-train.conllu',
             '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
             '/Downloads/UD_English-r1.2/en-ud-test.conllu']
    home = expanduser("~")
    # Resolve relative clauses
    options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True,
                           resolve_conj=False, cut=True)
    patt = {}
    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse, opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(lambda x: (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()

    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = (list(feature_cols.keys()) +
                           [(a + "_dep") for a in feature_cols.keys()])

    # Concreteness
    f = open(home + '/Desktop/protocols/data/concrete.pkl', 'rb')
    concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']
    f.close()

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # WordNet supersenses (lexicographer names)
    supersenses = list(set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # FrameNet
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' + framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # VerbNet class ids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    dict_feats = {}
    for f in (verbnet_classids + lexical_feats + supersenses + frame_names +
              lcs_feats + all_ud_feature_cols + conc_cols):
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent, token=token, lemma=lemma,
                      dict_feats=dict_feats.copy(), prot=prot,
                      concreteness=concreteness, lcs=lcs, l2f=lem2frame)
        for sent, token, lemma in zip(raw_x, data['Root.Token'].tolist(),
                                      data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent, token=token, lemma=lemma,
                      dict_feats=dict_feats.copy(), prot=prot,
                      concreteness=concreteness, lcs=lcs, l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x, data_dev['Root.Token'].tolist(),
                                      data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop (they're always zero)
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x
""" Example of programmatic PredPatt usage. """ # Run PredPatt on sentence from predpatt import PredPatt sentence = 'Chris loves silly dogs and clever cats .' P = PredPatt.from_sentence(sentence) # Pretty-print output print P.pprint(track_rule=True, color=True) print '______________________________________________________________________________' # A deeper look into PredPatt's internal representations. # # Each extraction is kept in a list called instances. Below we will loop through # each instance and print it's arguments. for x in P.instances: print print x, x.phrase() for a in x.arguments: print ' ', a, a.phrase() # Uncomment to list rules which fired on this proposition. Along with # an explanation. #for r in a.rules: # print ' %s: %s' % (r, r.explain()) print '______________________________________________________________________________' print
         '/UD_English-r1.2/en-ud-dev.conllu',
         '/UD_English-r1.2/en-ud-test.conllu']
home = expanduser("~/Downloads/")
parsed = {'train': [], 'devte': []}
out_data = []

# Resolve relative clauses
options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True,
                       resolve_conj=False, cut=True)

path = home + '/UD_English-r1.2/en-ud-train.conllu'
with open(path, 'r') as infile:
    data = infile.read()
    parsed['train'] += [('en-ud-train.conllu' + " " + sent_id,
                         PredPatt(ud_parse, opts=options))
                        for sent_id, ud_parse in load_conllu(data)]

for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data = infile.read()
        parsed['devte'] += [(file[17:] + " " + sent_id,
                             PredPatt(ud_parse, opts=options))
                            for sent_id, ud_parse in load_conllu(data)]

c = {'train': 0, 'devte': 0}
d = {'train': 0, 'dev': 0, 'test': 0}
ign = {'train': 0, 'devte': 0}
prons_incl = ["you", "they", "yourself", "themselves", "them", "themself",
              "theirself",
from predpatt import PredPatt

pp = PredPatt.from_sentence(
    'At the Pentagon briefing today, General Stanley McChrystal said that it looked a lot like terrorism.'
)

#print(pp.pprint())
#print(" ".join([token.text for token in pp.tokens]))
#print(pp.events)
#print(pp.event_dict)

for event in pp.events:
    print(event)
    for argument in event.arguments:
        print(argument)
def main():
    patterns = ''
    sentence = 'The quick brown fox jumped over the lazy dog .'
    tags = ''
    parse = ''
    if request.GET.get('sentence', '').strip():
        sentence = request.GET.get('sentence', '').strip()

    pp_opts = PredPattOpts()
    for k, v in sorted(PredPattOpts().__dict__.iteritems()):
        # all options are true/false for now.
        v = int(float(request.GET.get(k, v)))
        setattr(pp_opts, k, v)

    if sentence:
        original_sentence = sentence
        parse = parser(sentence, tokenized=False)
        P = PredPatt(parse, opts=pp_opts)
        patterns = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x for x in zip(parse.tokens, parse.tags))
        parse = parse.pprint(K=3)

        # remove predpatt's bracketed comments
        patterns = re.sub(r'\s*\[.*?\]', '', patterns)
        patterns = dedent(patterns)

    opts = []
    for k, v in sorted(pp_opts.__dict__.iteritems()):
        # Create a hidden textbox with the false value because the values of
        # "unchecked" boxes don't get posted with the form.
        opts.append('<input type="hidden" value="0" name="%s">' % (k,))
        opts.append('<input type="checkbox" name="%s" value="1" %s> %s<br/>'
                    % (k, 'checked' if v else '', k))
    options = '\n'.join(opts)

    return template("""
<html>
<head>
  <!-- JQuery -->
  <script src="//code.jquery.com/jquery-2.1.4.min.js"></script>
  <!-- Bootstrap -->
  <link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"/>
  <link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap-theme.min.css"/>
  <script src="//maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
  <!-- Chosen Dropdown Library -->
  <link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.css"/>
  <script src="//cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.jquery.min.js"></script>
  <style>
    html { overflow: -moz-scrollbars-vertical; overflow: scroll; }
  </style>
</head>
<body>
  <div style="width: 800px; padding: 10px; margin-left: auto; margin-right: auto;">
    <h1>PredPatt</h1>
    <strong>Sentence</strong>
    <pre>{{sentence}}</pre>
    <strong>Propositions</strong>
    <div id="propositions">
      <pre>
{{patterns}}
      </pre>
      <div>
        <button class="btn" data-toggle="collapse" data-target="#parse" style="margin-bottom: 10px;">Toggle Parse</button>
        <div id="parse" class="collapse">
          <strong>Tags</strong>
          <pre>
{{tags}}
          </pre>
          <strong>Parse</strong>
          <pre>
{{parse}}
          </pre>
        </div>
      </div>
      <strong>Input</strong>
      <form action="/" method="GET">
        <textarea type="text" name="sentence" style="height:50px; width: 100%;" placeholder="e.g., The quick brown fox jumped over the lazy dog." class="form-control" autofocus>{{original_sentence}}</textarea>
        <div style="padding: 10px;"><strong>Options</strong><br/>""" + options + """
        </div>
        <br/>
        <input type="submit" name="save" value="submit">
      </form>
    </div>
  </div>
</body>
</html>
""", sentence=sentence, original_sentence=original_sentence, patterns=patterns,
        tags=tags, parse=parse, options=options)
id = 1
files = ['/UD_English-r1.2/en-ud-dev.conllu',
         '/UD_English-r1.2/en-ud-test.conllu']
home = expanduser("~/Downloads/")
parsed = {'train': [], 'devte': []}
out_data = []

# Resolve relative clauses
options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True,
                       resolve_conj=False, cut=True)

path = home + '/UD_English-r1.2/en-ud-train.conllu'
with open(path, 'r') as infile:
    data = infile.read()
    parsed['train'] += [('en-ud-train.conllu' + " " + sent_id,
                         PredPatt(ud_parse, opts=options))
                        for sent_id, ud_parse in load_conllu(data)]

for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data = infile.read()
        parsed['devte'] += [(file[17:] + " " + sent_id,
                             PredPatt(ud_parse, opts=options))
                            for sent_id, ud_parse in load_conllu(data)]

# random.shuffle(parsed['train'])
c = {'train': 0, 'devte': 0}
d = {'train': 0, 'dev': 0, 'test': 0}
copp = {'train': 0, 'devte': 0}
auxverb = {'train': 0, 'devte': 0}
ign = {'train': 0, 'devte': 0}
adj = {'train': 0, 'devte': 0}

for write_file in ['pred_train_data.csv', 'pred_devte_data.csv']:
# get the lines associated with this chunk
decomp_lines_json_chunk = [x for x in decomp_lines_json
                           if x['doc-id'] == doc_id]
line_idx = 0  # where we are in the decomp json file
valid_instance = True
for sent_id, parse in conll_iter:
    sent_id = int(sent_id.split('_')[1])
    if line_idx >= len(decomp_lines_json_chunk):
        break
    # check if there is a matching decomp extraction for this conll line
    if decomp_lines_json_chunk[line_idx]['sent-id'] == sent_id:
        json_line = decomp_lines_json_chunk[line_idx]
        ppat = PredPatt(parse)
        pred_heads = json_line['predicate-head-idxs']
        pred_args = json_line['pred-args']
        assert len(pred_heads) <= len(pred_args)
        event_text = []
        event_args = []
        for idx, head in enumerate(pred_heads):
            head_args = [x for x in pred_args if x[0] == head]
            assert len(head_args) > 0
            head_arg_id = head_args[0][1]
            if (head < len(ppat.tokens)
                    and ppat.tokens[head] in ppat.event_dict.keys()
                    and head_arg_id < len(ppat.tokens)):
                predicate = ppat.event_dict[ppat.tokens[head]]
                pred_text = predpatt2text(predicate)
                event_text.append(pred_text)
def build_sentence_representation(s):
    """
    Build representation of a sentence by analyzing PredPatt output.

    Returns a weighted list of lists of terms.
    """
    s = merge_citation_token_lists(s)
    s = remove_qutation_marks(s)
    lemmatizer = WordNetLemmatizer()
    raw_lists = []
    rep_lists = []
    rep_lists_alt = []  # to be consistent with double annotating for 3 and 3.1
    try:
        pp = PredPatt.from_sentence(s, cacheable=False)  # for speed tests
    except Exception as e:
        print('= = = PredPatt exception = = =')
        print('input:\n{}'.format(s))
        print('exception:\n{}'.format(e))
        return rep_lists, rep_lists_alt
    if len(pp.events) == 0:
        return rep_lists, rep_lists_alt
    if CIT_BASED:
        for e in pp.events:
            depth, rep = build_tree_representation(e)
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, r) for r in rep]
            if len(rep) > 0:
                raw_lists.append([depth, rep])
        weight = 1
        for rl in sorted(raw_lists, key=itemgetter(0)):
            rep_lists.append([weight, rl[1]])
            weight *= .5
        if len(rep_lists) == 0:
            fallback = build_noun_representation(pp.events[0], global_root=True)
            if INCLUDE_PREDICATE:
                pred = get_predicate(pp.events[0].root)
                fallback = ['{}:{}'.format(pred, f) for f in fallback]
            if len(fallback) > 0:
                rep_lists = [[.25, fallback]]
    else:
        # make a PPv3 and a PPv3.1 representation
        # - - - 3.1 - - -
        reps = []
        for e in pp.events:
            rep = build_noun_representation(e)  # 3.1
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, f) for f in rep]
            reps.extend(rep)
        if len(reps) > 0:
            rep_lists = [[1, reps]]
        # - - - 3 - - -
        reps_alt = []
        for e in pp.events:
            rep = build_noun_representation(e, global_root=True)  # 3
            if INCLUDE_PREDICATE:
                pred = get_predicate(e.root)
                rep = ['{}:{}'.format(pred, f) for f in rep]
            reps_alt.extend(rep)
        if len(reps) > 0:
            rep_lists_alt = [[1, reps_alt]]
    rep_lists = normalize_rep_lists(rep_lists, lemmatizer)
    rep_lists_alt = normalize_rep_lists(rep_lists_alt, lemmatizer)
    return rep_lists, rep_lists_alt
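# An illustration of the return shape implied by build_sentence_representation
# above (weights paired with term lists, with 'pred:term' entries when
# INCLUDE_PREDICATE is set). The concrete terms below are invented, and the
# final values also pass through normalize_rep_lists, so this is only a sketch.
example_rep_lists = [
    [1.0, ['show:result', 'show:accuracy']],  # highest-weighted term list
    [0.5, ['compare:baseline']],              # in the CIT_BASED branch, each further list is weighted by 0.5
]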