def parse(text):
    """Primary function to run syntaxnet and PredPatt over input sentences."""
    parse_tree, trace = annotate_text(text)
    conll_parsed = parse_to_conll(parse_tree)
    conll_pp = [ud_parse for sent_id, ud_parse in load_conllu(conll_parsed)][0]

    # PredPatt options. Modify as needed.
    resolve_relcl = True  # relative clauses
    resolve_appos = True  # appositional modifiers
    resolve_amod = True   # adjectival modifiers
    resolve_conj = True   # conjunction
    resolve_poss = True   # possessives
    ud = dep_v2.VERSION   # the version of UD

    opts = PredPattOpts(resolve_relcl=resolve_relcl,
                        resolve_appos=resolve_appos,
                        resolve_amod=resolve_amod,
                        resolve_conj=resolve_conj,
                        resolve_poss=resolve_poss,
                        ud=ud)
    ppatt = PredPatt(conll_pp, opts=opts)
    predicate_deps, arg_deps = get_ud_fragments(ppatt)

    # NOTE: This returns the pretty-print formatted string from PredPatt. This is
    # done largely as a placeholder for JSON compatibility within the REST API.
    return {'predpatt': {'predicate_deps': predicate_deps, 'arg_deps': arg_deps},
            'conll': conll_parsed,
            'original': text}
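# Hedged usage sketch for parse() above, assuming this module's annotate_text,
# parse_to_conll, and get_ud_fragments helpers are importable and the syntaxnet
# backend is running; the sample sentence is illustrative only.
if __name__ == '__main__':
    result = parse("The dog chased the cat .")
    print(result['original'])                    # raw input text
    print(result['conll'])                       # CoNLL string from the parser
    print(result['predpatt']['predicate_deps'])  # predicate dependency fragments
    print(result['predpatt']['arg_deps'])        # argument dependency fragments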
def extract_predpatt_text(row, eid_num: int):
    '''
    Given a row of a pandas dataframe of TB data and eid_num (1 or 2), output
    the PredPatt predicate text (adds copula fillers in the text).
    '''
    tokenid = getattr(row, f'eid{eid_num}_token_id')
    conllu_string = getattr(row, f'eid{eid_num}_sent_conllu')
    parsed_tb = [PredPatt(ud_parse, opts=options)
                 for sent_id, ud_parse in load_conllu(conllu_string)]
    pred_objects = parsed_tb[0].instances
    curr_text = getattr(row, f'eid{eid_num}_text')
    pred_match = False
    #print(f"{(row['docid'], row['eventInstanceID'], row['relatedToEventInstance'])}")
    if pred_objects:
        for pred in pred_objects:
            if int(pred.root.position) == int(tokenid):
                pred_match = True
                pred_object = pred
                break
            else:
                pred_match = False
        if pred_match:
            pred_text, _, _, _ = predicate_info(pred_object)
            return pred_text
        else:
            return curr_text
    else:
        return getattr(row, f'eid{eid_num}_text')
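# Hedged usage sketch for extract_predpatt_text() above, assuming `tb_df` is a
# pandas DataFrame with eid{1,2}_token_id, eid{1,2}_sent_conllu, and
# eid{1,2}_text columns and that a module-level `options` (PredPattOpts) is
# defined, as the function body requires; names here are illustrative only.
tb_df['eid1_pred_text'] = [extract_predpatt_text(row, 1)
                           for row in tb_df.itertuples()]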
def test(data):
    from predpatt import PredPatt, load_conllu

    def fail(g, t):
        if len(g) != len(t):
            return True
        for i in g:
            if i not in t:
                return True
        return False

    no_color = lambda x, _: x
    count, failed = 0, 0
    ret = ""
    for sent_id, ud_parse in load_conllu(data):
        count += 1
        pp = PredPatt(ud_parse)
        sent = ' '.join(t.text for t in pp.tokens)
        linearized_pp = linearize(pp)
        gold_preds = [predicate.format(C=no_color, track_rule=False)
                      for predicate in pp.instances
                      if likely_to_be_pred(predicate)]
        test_preds = pprint_preds(
            construct_pred_from_flat(linearized_pp.split()))
        if fail(gold_preds, test_preds):
            failed += 1
            ret += ("Sent: %s\nLinearized PredPatt:\n\t%s\nGold:\n%s\nYours:\n%s\n\n"
                    % (sent, linearized_pp,
                       "\n".join(gold_preds), "\n".join(test_preds)))
    print(ret)
    print("Tested %d instances; %d failed the test." % (count, failed))
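# Hedged usage sketch for test() above, assuming linearize, likely_to_be_pred,
# pprint_preds, and construct_pred_from_flat are defined in this module; the
# argument is the raw text of a CoNLL-U file (the path is illustrative only).
with open('en-ud-dev.conllu') as f:
    test(f.read())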
def from_conll(cls,
               corpus: Union[str, TextIO],
               name: str = 'ewt',
               options: Optional[PredPattOpts] = None) -> 'PredPattCorpus':
    """Load a CoNLL dependency corpus and apply predpatt

    Parameters
    ----------
    corpus
        (path to) a .conllu file
    name
        the name of the corpus; used in constructing treeids
    options
        options for predpatt extraction
    """
    options = DEFAULT_PREDPATT_OPTIONS if options is None else options

    corp_is_str = isinstance(corpus, str)

    if corp_is_str and splitext(basename(corpus))[1] == '.conllu':
        with open(corpus) as infile:
            data = infile.read()
    elif corp_is_str:
        data = corpus
    else:
        data = corpus.read()

    # load the CoNLL dependency parses as graphs
    ud_corp = {name+'-'+str(i+1): [line.split()
                                   for line in block.split('\n')
                                   if len(line) > 0
                                   if line[0] != '#']
               for i, block in enumerate(data.split('\n\n'))}
    ud_corp = CoNLLDependencyTreeCorpus(ud_corp)

    # extract the predpatt for those dependency parses
    try:
        predpatt = {name+'-'+sid.split('_')[1]: PredPatt(ud_parse, opts=options)
                    for sid, ud_parse in load_conllu(data)}
    except ValueError:
        errmsg = ('PredPatt was unable to parse the CoNLL you provided. '
                  'This is likely due to using a version of UD that is '
                  'incompatible with PredPatt. Use of version 1.2 is '
                  'suggested.')
        raise ValueError(errmsg)

    return cls({n: (pp, ud_corp[n]) for n, pp in predpatt.items()})
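# Hedged usage sketch for from_conll() above, assuming it is exposed as a
# classmethod on PredPattCorpus and that 'en-ud-dev.conllu' is a UD v1.2
# CoNLL-U file on disk. Per the constructor call above, the resulting corpus
# maps treeids such as 'ewt-1' to (PredPatt, dependency graph) pairs.
corpus = PredPattCorpus.from_conll('en-ud-dev.conllu', name='ewt')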
def setup_graph():
    ud = DependencyGraphBuilder.from_conll(listtree, 'tree1')

    pp = PredPatt(next(load_conllu(rawtree))[1],
                  opts=PredPattOpts(resolve_relcl=True,
                                    borrow_arg_for_relcl=True,
                                    resolve_conj=False,
                                    cut=True))

    graph = PredPattGraphBuilder.from_predpatt(pp, ud, 'tree1')

    return pp, graph
def generate_predicates(
        abstract_text: str,
        pred_patt_opts=None
) -> Iterable[Tuple[str, str, str]]:
    "Requires that pred_util:nlp and pred_util:stopwords be initialized"
    nlp = dpg.get("pred_util:nlp")
    parser = Spacy2ConllParser(nlp=nlp)
    stopwords = dpg.get("pred_util:stopwords")

    doc = nlp(abstract_text)
    for sent in doc.sents:
        # if the sentence is very long
        if len(sent) >= 20:
            word_count = defaultdict(int)
            for tok in sent:
                word_count[str(tok)] += 1
            # if one word dominates the long sentence
            if max(word_count.values()) >= len(sent) * 0.2:
                continue  # we likely generated the same word over and over
        conllu = "".join(list(parser.parse(input_str=str(sent))))
        for _, pred_patt_parse in load_conllu(conllu):
            predicates = PredPatt(
                pred_patt_parse,
                opts=pred_patt_opts
            ).instances
            for predicate in predicates:
                # We only care about 2-entity predicates
                if len(predicate.arguments) == 2:
                    a_ents, b_ents = [
                        # Get the set of entities
                        filter(
                            # Not in the stopword list
                            lambda x: x not in stopwords,
                            [str(e).strip() for e in nlp(args.phrase()).ents]
                        )
                        # For each argument
                        for args in predicate.arguments
                    ]
                    # Slight cleaning needed to better match the predicate phrase.
                    # Note that PredPatt predicates use ?a and ?b placeholders.
                    predicate_stmt = (
                        re.match(
                            r".*\?a(.*)\?b.*",  # get text between placeholders
                            predicate.phrase()
                        )
                        .group(1)  # the group matched between the placeholders
                        .strip()
                    )
                    if len(predicate_stmt) > 0:
                        # iterate over all entity pairs for this predicate
                        for a, b in product(a_ents, b_ents):
                            if a != b:
                                yield (a, predicate_stmt, b)
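# Hedged usage sketch for generate_predicates() above, assuming dpg has been
# initialized with "pred_util:nlp" (a spaCy model) and "pred_util:stopwords"
# beforehand, as the docstring requires; the abstract text is illustrative only.
for subj, rel, obj in generate_predicates("Aspirin inhibits platelet aggregation."):
    print(subj, "--", rel, "-->", obj)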
def extract_predpatt(path='../../data/corpora/ud/UD_English-EWT-r1.2/'):
    '''
    Extract PredPatt objects from CONLLU files
    '''
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}

    for file in os.listdir(path):
        if file.endswith('.conllu'):
            with open(path + file, 'r') as infile:
                for sent_id, ud_parse in load_conllu(infile.read()):
                    patt[file + " " + sent_id] = PredPatt(ud_parse, opts=options)

    return patt
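# Hedged usage sketch for extract_predpatt() above, assuming the UD English EWT
# r1.2 .conllu files exist at the default path; keys combine the filename with
# the sentence id returned by load_conllu.
patt = extract_predpatt()
for key, pp in list(patt.items())[:3]:
    print(key)
    for pred in pp.instances:
        print("   ", pred.phrase())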
def extract(self, sentence: str) -> List[Dict[str, Any]]:
    processed = self.pipeline.process(sentence, self._error)

    if self._error.occurred():
        print(f"=== Error occurred: {self._error.message}")
        self._error = ProcessingError()
        return None
    else:
        conll_example = [ud_parse
                         for sent_id, ud_parse in load_conllu(processed)][0]
        ppatt = PredPatt(conll_example, opts=self._opts)

        result = []
        for predicate in ppatt.instances:
            structure = {
                "predicate": predicate.tokens,
                "arguments": [x.tokens for x in predicate.arguments],
            }
            result.append(structure)

        return result
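# Hedged usage sketch for extract() above, assuming the enclosing class (called
# PredPattExtractor here purely for illustration) wires up self.pipeline (a
# processor that emits CoNLL-U), self._error, and self._opts in its constructor.
extractor = PredPattExtractor()
structures = extractor.extract("The committee approved the new budget .")
if structures is not None:
    for s in structures:
        print([t.text for t in s["predicate"]],
              [[t.text for t in arg] for arg in s["arguments"]])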
def main():
    # Data Locations
    parser = argparse.ArgumentParser(
        description='Recast UDS-Time duration to NLI format.')
    parser.add_argument('--udstime',
                        type=str,
                        default='time_eng_ud_v1.2_2015_10_30.tsv',
                        help='UDS-Time tsv dataset file location.')
    parser.add_argument(
        '--split',
        type=str,
        default='',
        help='If specified (train, dev, test), only that split is recasted')
    parser.add_argument('--out_train',
                        type=str,
                        default='train/',
                        help='recasted train data folder location')
    parser.add_argument('--out_dev',
                        type=str,
                        default='dev/',
                        help='recasted dev data folder location')
    parser.add_argument('--out_test',
                        type=str,
                        default='test/',
                        help='recasted test data folder location')
    args = parser.parse_args()

    # ### Import UDS Time
    uds_time = pd.read_csv(args.udstime, sep="\t")
    ewt = doc_utils.Corpus(uds_time=uds_time)
    df = ewt.process_data

    #######################################################
    ## Add features to UDS-time dataframe
    #######################################################
    df['Pred1.UPOS'] = df.apply(
        lambda row: get_predicate_pos(row, ewt, event=1), axis=1)
    df['Pred2.UPOS'] = df.apply(
        lambda row: get_predicate_pos(row, ewt, event=2), axis=1)

    ## Extract Predicate Full Text
    predicate_dict = {}
    for ud_data_path in ud_data:
        covered_set = set()
        fname = ud_data_path.split("/")[-1]
        data_name = fname.split(".")[0].split("-")[-1]
        #print(f"Start processing: {data_name}")
        with open(ud_data_path) as infile:
            data = infile.read()
        parsed = [(PredPatt(ud_parse, opts=options), sent_id)
                  for sent_id, ud_parse in load_conllu(data)]
        for pred_object, sentid in parsed:
            sentnum = sentid.split("_")[-1]
            sentenceid = fname + " " + sentnum
            for predicate_object in pred_object.instances:
                #print(f"sentenceid: {sentenceid}, pred: {predicate_object}")
                pred_text, _, pred_root_token, _ = predicate_info(
                    predicate_object)
                predicate_dict[sentenceid + "_" + str(pred_root_token)] = pred_text
            #print(f"error at sentid: {sentenceid}")
        print(f"Finished creating predicate dictionary for: {data_name}\n")

    df['Pred1.Text.Full'] = df['Event1.ID'].map(lambda x: predicate_dict[x])
    df['Pred2.Text.Full'] = df['Event2.ID'].map(lambda x: predicate_dict[x])

    #######################################################
    ## Recast Data
    #######################################################
    pairid = -1  # count total pair ids
    # Count event-pairs skipped due to ambiguous text for highlighting predicate.
    skipcount = 0

    if args.split:
        splits = [args.split]
    else:
        splits = ['train', 'dev', 'test']

    for split in splits:
        data = []
        metadata = []
        curr_df = df[df['Split'] == split]
        print(f"Creating NLI instances for Data split: {split}")
        event_pair_ids = list(curr_df.groupby(['Event.Pair.ID']).groups.keys())
        pbar = tqdm(total=len(event_pair_ids))
        for idx, event_pair_id in enumerate(event_pair_ids):
            ## Predicate 1
            recasted_data, recasted_metadata, pairid, skipcount = create_duration_NLI(
                event_pair_id,
                df,
                ewt,
                pairid=pairid,
                skipcount=skipcount,
                event=1,
                sliding_window=1)
            if recasted_data:
                data += recasted_data
                metadata += recasted_metadata
            ## Predicate 2
            recasted_data, recasted_metadata, pairid, skipcount = create_duration_NLI(
                event_pair_id,
                df,
                ewt,
                pairid=pairid,
                skipcount=skipcount,
                event=2,
                sliding_window=1)
            if recasted_data:
                data += recasted_data
                metadata += recasted_metadata
            # if pairid % 10000 == 0:
            #     print(f"Total pair-ids processed so far: {pairid}, skipped so far: {skipcount}")
            pbar.update(1)

        out_folder = {
            'train': args.out_train,
            'dev': args.out_dev,
            'test': args.out_test
        }
        print(
            f"Total pair-ids processed so far: {pairid}, skipped so far: {skipcount}"
        )
        with open(out_folder[split] + "recast_temporal-duration_data.json",
                  'w') as out_data:
            json.dump(data, out_data, indent=4)
        with open(out_folder[split] + "recast_temporal-duration_metadata.json",
                  'w') as out_metadata:
            json.dump(metadata, out_metadata, indent=4)

    print(f"Total pair-ids: {pairid}")
    print(f'Total events skipped: {skipcount}')
]
home = expanduser("~/Downloads/")
parsed = {'train': [], 'devte': []}
out_data = []
options = PredPattOpts(resolve_relcl=True,
                       borrow_arg_for_relcl=True,
                       resolve_conj=False,
                       cut=True)  # Resolve relative clause

path = home + '/UD_English-r1.2/en-ud-train.conllu'
with open(path, 'r') as infile:
    data = infile.read()
parsed['train'] += [('en-ud-train.conllu' + " " + sent_id,
                     PredPatt(ud_parse, opts=options))
                    for sent_id, ud_parse in load_conllu(data)]

for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data = infile.read()
    parsed['devte'] += [(file[17:] + " " + sent_id,
                         PredPatt(ud_parse, opts=options))
                        for sent_id, ud_parse in load_conllu(data)]

c = {'train': 0, 'devte': 0}
d = {'train': 0, 'dev': 0, 'test': 0}
ign = {'train': 0, 'devte': 0}
prons_incl = [
    "you", "they", "yourself", "themselves", "them", "themself",
    "theirself", "theirselves"
    for line in f.readlines():
        feats = line.split('\t')
        features[feats[0]] = [feats[1].split(), feats[2].split()]

# Load the predpatt objects for creating features
files = ['/Downloads/UD_English-r1.2/en-ud-train.conllu',
         '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
         '/Downloads/UD_English-r1.2/en-ud-test.conllu']
options = PredPattOpts(resolve_relcl=True,
                       borrow_arg_for_relcl=True,
                       resolve_conj=False,
                       cut=True)  # Resolve relative clause
patt = {}
for file in files:
    path = home + file
    with open(path, 'r') as infile:
        for sent_id, ud_parse in load_conllu(infile.read()):
            patt[file[27:] + " " + sent_id] = PredPatt(ud_parse, opts=options)

data['Structure'] = data['Sentence.ID'].map(lambda x: (patt[x], features[x]))

# Split the datasets into train, dev, test
data_test = data[data['Split'] == 'test'].reset_index(drop=True)
data_dev = data[data['Split'] == 'dev'].reset_index(drop=True)
data = data[data['Split'] == 'train'].reset_index(drop=True)

# Ridit scoring annotations and confidence ratings
# for attr in attributes:
#     resp = attr_map[attr]
#     resp_conf = attr_conf[attr]
#     data[resp_conf + ".norm"] = data.groupby('Annotator.ID')[resp_conf].transform(ridit)
#     data_dev[resp_conf + ".norm"] = data_dev.groupby('Annotator.ID')[resp_conf].transform(ridit)
print(
    "Processing {} of Genre: {}, Progress {}/{} ({} %), Num Skipped: {}"
    .format(currbook, currgenre, num_processed, num_books,
            num_processed / (num_books * 1.0), num_skipped))
decomp_lines_json = [json.loads(x) for x in decomp_lines]
book_conll_files = [
    fi for fi in os.listdir(conlldir)
    if parse_conll_filename(fi)[1][0] == currbook
]
for conllfi in book_conll_files:  # For each chunk in the book
    conllfile = os.path.join(conlldir, conllfi)
    genre, book, doc_id = parse_conll_filename(conllfile)
    conll_iter = load_conllu(conllfile)
    # get the lines associated with this chunk
    decomp_lines_json_chunk = [
        x for x in decomp_lines_json if x['doc-id'] == doc_id
    ]
    line_idx = 0  # where we are in the decomp json file
    valid_instance = True
    for sent_id, parse in conll_iter:
        sent_id = int(sent_id.split('_')[1])
        if line_idx >= len(decomp_lines_json_chunk):
            break
        # check if there is a matching decomp extraction for this conll line
        if decomp_lines_json_chunk[line_idx]['sent-id'] == sent_id:
            json_line = decomp_lines_json_chunk[line_idx]
id = 1
files = ['/UD_English-r1.2/en-ud-dev.conllu',
         '/UD_English-r1.2/en-ud-test.conllu']
home = expanduser("~/Downloads/")
parsed = {'train': [], 'devte': []}
out_data = []
# Resolve relative clause
options = PredPattOpts(resolve_relcl=True,
                       borrow_arg_for_relcl=True,
                       resolve_conj=False,
                       cut=True)

path = home + '/UD_English-r1.2/en-ud-train.conllu'
with open(path, 'r') as infile:
    data = infile.read()
parsed['train'] += [('en-ud-train.conllu' + " " + sent_id,
                     PredPatt(ud_parse, opts=options))
                    for sent_id, ud_parse in load_conllu(data)]

for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data = infile.read()
    parsed['devte'] += [(file[17:] + " " + sent_id,
                         PredPatt(ud_parse, opts=options))
                        for sent_id, ud_parse in load_conllu(data)]

# random.shuffle(parsed['train'])
c = {'train': 0, 'devte': 0}
d = {'train': 0, 'dev': 0, 'test': 0}
copp = {'train': 0, 'devte': 0}
auxverb = {'train': 0, 'devte': 0}
ign = {'train': 0, 'devte': 0}
adj = {'train': 0, 'devte': 0}

for write_file in ['pred_train_data.csv', 'pred_devte_data.csv']:
def hand_engineering(prot, batch_size, data, data_dev):
    '''
    Hand-engineered feature extraction. Supports the following:
    UD, Verbnet classids, Wordnet supersenses, concreteness ratings,
    LCS eventivity scores
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB',
        'N': 'NOUN',
        'A': 'ADJ',
        'ADV': 'ADV',
        'PREP': 'ADP',
        'NUM': 'NUM',
        'INTJ': 'INTJ',
        'ART': 'DET',
        'C': 'CCONJ',
        'SCON': 'SCONJ',
        'PRON': 'PRON',
        'IDIO': 'X',
        'AVP': 'ADV'
    }

    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features
    files = [
        '/Downloads/UD_English-r1.2/en-ud-train.conllu',
        '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
        '/Downloads/UD_English-r1.2/en-ud-test.conllu'
    ]
    home = expanduser("~")
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}
    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse,
                                                                opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()

    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = list(
        feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()]

    # Concreteness
    f = open(home + '/Desktop/protocols/data/concrete.pkl', 'rb')
    concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']
    f.close()

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # Wordnet supersenses (lexicographer names)
    supersenses = list(
        set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # Framenet
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.'
                      + framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # Verbnet classids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    dict_feats = {}
    for f in (verbnet_classids + lexical_feats + supersenses + frame_names +
              lcs_feats + all_ud_feature_cols + conc_cols):
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_x, data['Root.Token'].tolist(),
                                      data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x,
                                      data_dev['Root.Token'].tolist(),
                                      data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop (they're always zero)
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x