def match(self, tree):
    try:
        if tree.label() != 'ROOT':
            raise IndexError
        if tree[0].label() != 'SBARQ':
            raise IndexError
        if tree[0][0][0].label() != 'WRB':
            raise IndexError
        if tree[0][0][0][0].lower() != 'when':
            raise IndexError
        if tree[0][1].label() != 'SQ':
            raise IndexError
        if tree[0][1][0].label() != 'VBD':
            raise IndexError
        if tree[0][1][1].label() != 'NP':
            raise IndexError
        if tree[0][1][2].label() != 'VP':
            raise IndexError
        part = Pattern.Part()
        part.object = ParentedTree.fromstring(str(tree[0][1][1]))
        part.property = ParentedTree.fromstring(str(Tree('VP', [
            Tree.fromstring(str(tree[0][0][0])),
            Tree.fromstring(str(tree[0][1][0])),
            Tree.fromstring(str(tree[0][1][2]))
        ])))
        return [part]
    except IndexError:
        return []
def walker(self, parent):
    if parent.label() == 'IN' and parent.leaves() == ["of"]:
        pos = parent.parent().treeposition()
        part = Pattern.Part()
        part.object = ParentedTree.fromstring(str(parent.right_sibling()))
        part.property = ParentedTree.fromstring(str(self.get_query_tree()))
        part.property[pos[:-1]].remove(part.property[pos])
        self._parts.append(part)
    for child in parent:
        if isinstance(child, ParentedTree):
            self.walker(child)
def run(self, args):
    input_text = args["input"]["text"][0]
    input_parse = args["input"]["parse"][0]
    output_parse = args["output"]["parse"][0]
    doc_list = args["doc_list"]
    tuples = self.get_io_files([input_text, input_parse, output_parse], doc_list)
    for files in tuples:
        indexed_parses = []
        in_text_file, in_parse_file, out_parse_file = files
        text = self.read_file(in_text_file)
        parses = self.read_file(in_parse_file)
        parses = json.loads(parses)
        leafreader = LeafReader(text)
        for parse in parses:
            tree = ParentedTree.fromstring(parse, read_leaf=leafreader.read_leaf)
            indexed_parses.append(tree.pprint(margin=float("inf")))
        # in-json parses
        output = json.dumps(indexed_parses)
        self.write_file(output, out_parse_file)
def match(self, *args, **kwargs):
    Pattern.match(self, *args, **kwargs)
    try:
        if self.get_query_tree().label() != "ROOT":
            raise IndexError
        if self.get_query_tree()[0].label() != "SBARQ":
            raise IndexError
        if self.get_query_tree()[0][0].label() != "WHNP":
            raise IndexError
        if self.get_query_tree()[0][0][0].label() != "WP":
            raise IndexError
        if self.get_query_tree()[0][0][0][0].lower() != self._keyword:
            raise IndexError
        if self.get_query_tree()[0][1].label() != "SQ":
            raise IndexError
        if len(self.get_query_tree()[0][1]) < 2:
            raise IndexError
        part = Pattern.Part()
        part.object = ParentedTree.fromstring(str(self.get_query_tree()[0][1][1]))
        self._parts.append(part)
        return self._parts
    except IndexError:
        return []
def extract_independent_clauses(input_sent, predictor):
    output = predictor.predict(sentence=input_sent)
    tree_str = output["trees"]
    t = ParentedTree.fromstring(tree_str)
    candidate_nodes = list(t.subtrees(filter=lambda x: filt_r(x) or filt_l(x)))
    for node in candidate_nodes:
        if node.parent() in candidate_nodes:
            candidate_nodes.remove(node.parent())
    sub_sentences = []
    for candidate in candidate_nodes:
        temp = []
        for subtree in candidate:
            temp += subtree.leaves()
        sub_sentences.append(temp)
    sub_sentences = sub_sentences if sub_sentences else [t.leaves()]
    sentences = []
    for sentence in sub_sentences:
        temp = ""
        for i, word in enumerate(sentence):
            if i == 0:
                temp += word[0].title() + word[1:]
            elif word in [".", "!", "?", ",", ";"]:
                temp += word
            else:
                temp += " " + word
        temp = temp.replace(" ’", "’")
        temp = temp.replace(" n’", "n’")
        sentences.append(temp)
    return sentences
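A usage sketch for extract_independent_clauses, assuming an AllenNLP constituency-parser Predictor; the model archive URL and the example sentence are illustrative, and filt_l/filt_r are expected to be defined in the surrounding module:

from allennlp.predictors.predictor import Predictor

# Hypothetical setup: any AllenNLP constituency-parser archive exposing output["trees"] should work.
predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz")
clauses = extract_independent_clauses("The market rallied, and investors cheered.", predictor)
print(clauses)  # a list of clause strings; the exact split depends on filt_l/filt_r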
def replace_terminals_with_indices(treestring):
    '''Replaces each terminal in the tree read from a string with an index in the sentence'''
    tree = ParentedTree.fromstring(treestring)
    for idx, _ in enumerate(tree.leaves()):
        tree_location = tree.leaf_treeposition(idx)
        non_terminal = tree[tree_location[:-1]]
        non_terminal[0] = str(idx)
    return tree
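A quick check of replace_terminals_with_indices; the bracketed parse string is just an example:

t = replace_terminals_with_indices("(S (NP (DT The) (NN cat)) (VP (VBZ sleeps)))")
print(t.leaves())  # ['0', '1', '2'] -- each terminal replaced by its token index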
def generate(sent, synt, tmpls, synpg_model, pg_model, args):
    with torch.no_grad():
        # convert syntax to tag sequence
        tagss = np.zeros((len(tmpls), args.max_sent_len), dtype=np.long)
        tags_ = ParentedTree.fromstring(synt)
        tags_ = getleaf(tags_)
        tags_ = [dictionary.word2idx[f"<{w}>"] for w in tags_ if f"<{w}>" in dictionary.word2idx]
        tagss[:, :len(tags_)] = tags_[:args.max_sent_len]
        tagss = torch.from_numpy(tagss).cuda()

        # generate parses from tag sequence and templates
        parse_idxs = pg_model.generate(tagss, tmpls, args.max_synt_len, temp=args.temp)

        # add <sos> and remove tokens after <eos>
        synts = np.zeros((len(tmpls), args.max_synt_len + 2), dtype=np.long)
        synts[:, 0] = 1
        for i in range(len(tmpls)):
            parse_idx = parse_idxs[i].cpu().numpy()
            eos_pos = np.where(parse_idx == dictionary.word2idx["<eos>"])[0]
            eos_pos = eos_pos[0] + 1 if len(eos_pos) > 0 else len(parse_idx)
            synts[i, 1:eos_pos + 1] = parse_idx[:eos_pos]
        synts = torch.from_numpy(synts).cuda()

        # bpe segment and convert sentence to tensor
        sents = np.zeros((len(tmpls), args.max_sent_len), dtype=np.long)
        sent_ = bpe.segment(sent).split()
        sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"]
                 for w in sent_]
        sents[:, :len(sent_)] = sent_[:args.max_sent_len]
        sents = torch.from_numpy(sents).cuda()

        # generate paraphrases from sentence and generated parses
        output_idxs = synpg_model.generate(sents, synts, args.max_sent_len, temp=args.temp)
        output_idxs = output_idxs.cpu().numpy()

        paraphrases = [reverse_bpe(synt2str(output_idxs[i], dictionary).split())
                       for i in range(len(tmpls))]

    return paraphrases
def generate(model, data, loader, dictionary, bpe, args): model.eval() with open(os.path.join(args.output_dir, f"target_sents.txt"), "w") as fp1, \ open(os.path.join(args.output_dir, f"target_synts.txt"), "w") as fp2, \ open(os.path.join(args.output_dir, f"outputs.txt"), "w") as fp3: with torch.no_grad(): iterator = tqdm(loader, total=len(loader)) for it, data_idxs in enumerate(iterator): data_idxs = data_idxs.numpy() sents_ = data[0][data_idxs] # sents1 targs_ = data[1][data_idxs] # sents2 synts_ = data[3][data_idxs] # synts2 batch_size = len(sents_) sents = np.zeros((batch_size, args.max_sent_len), dtype=np.long) synts = np.zeros((batch_size, args.max_synt_len + 2), dtype=np.long) for i in range(batch_size): sent_ = sents_[i] sent_ = bpe.segment(sent_).split() sent_ = [ dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_ ] sents[i, :len(sent_)] = sent_ synt_ = synts_[i] synt_ = ParentedTree.fromstring(synt_) synt_ = deleaf(synt_) synt_ = [ dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx ] synt_ = [dictionary.word2idx["<sos>"] ] + synt_ + [dictionary.word2idx["<eos>"]] synts[i, :len(synt_)] = synt_ sents = torch.from_numpy(sents).cuda() synts = torch.from_numpy(synts).cuda() idxs = model.generate(sents, synts, sents.size(1), sample=args.sample, temp=args.temp) for sent, idx, targ, synt_ in zip(sents_, idxs.cpu().numpy(), targs_, synts_): fp1.write(targ + '\n') fp2.write(synt_ + '\n') fp3.write( reverse_bpe(synt2str(idx, dictionary).split()) + '\n')
def evaluate(model, data, loader, criterion, dictionary, bpe, args): model.eval() total_loss = 0.0 max_it = len(loader) with torch.no_grad(): for it, data_idxs in enumerate(loader): data_idxs = np.sort(data_idxs.numpy()) # get batch of raw sentences and raw syntax sents_ = data[0][data_idxs] synts_ = data[1][data_idxs] batch_size = len(sents_) # initialize tensors sents = np.zeros((batch_size, args.max_sent_len), dtype=np.long) # words without position synts = np.zeros((batch_size, args.max_synt_len+2), dtype=np.long) # syntax targs = np.zeros((batch_size, args.max_sent_len+2), dtype=np.long) # target output for i in range(batch_size): # bpe segment and convert to tensor sent_ = sents_[i] sent_ = bpe.segment(sent_).split() sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_] sents[i, :len(sent_)] = sent_ # add <sos> and <eos> for target output targ_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]] targs[i, :len(targ_)] = targ_ # parse syntax and convert to tensor synt_ = synts_[i] synt_ = ParentedTree.fromstring(synt_) synt_ = deleaf(synt_) synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx] synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]] synts[i, :len(synt_)] = synt_ sents = torch.from_numpy(sents).cuda() synts = torch.from_numpy(synts).cuda() targs = torch.from_numpy(targs).cuda() # forward outputs = model(sents, synts, targs) # calculate loss targs_ = targs[:, 1:].contiguous().view(-1) outputs_ = outputs.contiguous().view(-1, outputs.size(-1)) loss = criterion(outputs_, targs_) total_loss += loss.item() return total_loss / max_it
def generate(epoch, eit, model, data, loader, dictionary, bpe, args, max_it=10): model.eval() with open(os.path.join(args.output_dir, "sents_valid_epoch{:02d}_it{:06d}.txt".format(epoch, eit)), "w") as fp: with torch.no_grad(): for it, data_idxs in enumerate(loader): if it >= max_it: break data_idxs = np.sort(data_idxs.numpy()) # get batch of raw sentences and raw syntax sents_ = data[0][data_idxs] synts_ = data[1][data_idxs] batch_size = len(sents_) # initialize tensors sents = np.zeros((batch_size, args.max_sent_len), dtype=np.long) # words without position synts = np.zeros((batch_size, args.max_synt_len+2), dtype=np.long) # syntax targs = np.zeros((batch_size, args.max_sent_len+2), dtype=np.long) # target output for i in range(batch_size): # bpe segment and convert to tensor sent_ = sents_[i] sent_ = bpe.segment(sent_).split() sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_] sents[i, :len(sent_)] = sent_ # add <sos> and <eos> for target output targ_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]] targs[i, :len(targ_)] = targ_ # parse syntax and convert to tensor synt_ = synts_[i] synt_ = ParentedTree.fromstring(synt_) synt_ = deleaf(synt_) synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx] synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]] synts[i, :len(synt_)] = synt_ sents = torch.from_numpy(sents).cuda() synts = torch.from_numpy(synts).cuda() targs = torch.from_numpy(targs).cuda() # generate idxs = model.generate(sents, synts, sents.size(1), temp=args.temp) # write output for sent, idx, synt in zip(sents.cpu().numpy(), idxs.cpu().numpy(), synts.cpu().numpy()): fp.write(synt2str(synt[1:], dictionary)+'\n') fp.write(sent2str(sent, dictionary)+'\n') fp.write(synt2str(idx, dictionary)+'\n') fp.write("--\n")
def process_tree(tree_str, label):
    example = None
    try:
        # print("getting tree")
        tree = ParentedTree.fromstring(tree_str.__str__())
        # print("before get_relation")
        example = get_relation(tree, label)
        # print(example.e1)
    except ValueError as err:
        # print(err)
        pass
    return example
def parse_parented_tree(tree_string):
    """
    Construct a tree from a constituent parse tree string.

    Args:
        tree_string (str): A constituent parse tree in bracket notation

    Returns:
        nltk.ParentedTree: A parse tree corresponding to the parse tree string.
    """
    try:
        return ParentedTree(tree_string)
    except TypeError:
        return ParentedTree.fromstring(tree_string)
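A small usage example for parse_parented_tree; the parse string is illustrative:

tree = parse_parented_tree("(ROOT (S (NP (PRP I)) (VP (VBP agree))))")
print(tree.label())         # ROOT
print(tree[0][0].leaves())  # ['I']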
def __init__(self, filename, postagged='./data/postagged-files',
             parsed='./data/parsed-files', dependency='./data/dep-files'):
    self.filename = filename
    postagged_file = os.path.join(postagged, filename + '.tag')
    parsed_file = os.path.join(parsed, filename + '.parse')
    dep_file = os.path.join(dependency, filename + '.parse.dep')
    self.tagged_sents = [x.strip() for x in open(postagged_file) if x.strip()]
    self.parsed_sents = [ParentedTree.fromstring(x) for x in open(parsed_file) if x.strip()]
    self.dep_sents = [DepTree.fromstring(x)
                      for x in open(dep_file).read().strip().split('\n\n') if x.strip()]
    assert len(self.tagged_sents) == len(self.parsed_sents)
def parse2edus(parse):
    """
    Split a constituency parse tree into EDUs.

    :param parse: constituency parse tree in bracketed text format,
                  e.g. '( (IP (NP (PN 我)) (VP (VV 爱) (NP (NR 北京) (NR 天安门)))))'
    :return: generator of structure.tree.EDU
    """
    pipeline = get_pipeline()
    segmenter = pipeline.segmenter
    parse = ParentedTree.fromstring(parse)
    childs = list(parse.subtrees(lambda t: t.height() == 2 and t.label() != '-NONE-'))
    text = ''.join([child[0] for child in childs])
    sentence = Sentence((0, len(text)), text, parse=parse)
    return segmenter.cut_edu(sentence)
def read_file(file_):
    """Read bracketed parse trees from a file, one per line, and return them as ParentedTree objects."""
    trees = []
    with open(file_, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            tree = ParentedTree.fromstring(line)
            trees.append(tree)
    return trees
def template2tensor(templates, max_tmpl_len, dictionary):
    tmpls = np.zeros((len(templates), max_tmpl_len + 2), dtype=np.long)
    for i, tp in enumerate(templates):
        tmpl_ = ParentedTree.fromstring(tp)
        tree2tmpl(tmpl_, 1, 2)
        tmpl_ = str(tmpl_).replace(")", " )").replace("(", "( ").split(" ")
        tmpl_ = [dictionary.word2idx[f"<{w}>"] for w in tmpl_ if f"<{w}>" in dictionary.word2idx]
        tmpl_ = [dictionary.word2idx["<sos>"]] + tmpl_ + [dictionary.word2idx["<eos>"]]
        tmpls[i, :len(tmpl_)] = tmpl_
    tmpls = torch.from_numpy(tmpls).cuda()
    return tmpls
def file_to_trees(filename):
    """Reads the parse trees in the given file and returns them as a list of
    ParentedTree objects. A depth attribute is added for each node in the full tree.

    Args:
        filename: The base name of the input document.

    Returns:
        A list of ParentedTree objects.
    """
    tree_filepath = 'data/parsed-files/' + filename + '.head.rel.tokenized.raw.parse'
    sent_trees = []
    with open(tree_filepath) as tree_file:
        for line in tree_file:
            if not line.startswith('\n'):
                tree = ParentedTree.fromstring(line)
                add_depth(tree)
                sent_trees.append(tree)
    return sent_trees
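A hypothetical call to file_to_trees; the document base name is invented, and add_depth is assumed to be defined in the same module:

trees = file_to_trees("APW19980219.0476")  # base name is illustrative
for sent_tree in trees:
    print(sent_tree.label(), len(sent_tree.leaves()))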
def cut_edu(self, sent: Sentence) -> List[EDU]:
    if not hasattr(sent, "parse"):
        print(sent.text)
        parse = self.parser.parse(sent.text)
    else:
        parse = getattr(sent, "parse")
    parse = ParentedTree.fromstring(parse.pformat())
    children = list(parse.subtrees(lambda t: t.height() == 2 and t.label() != '-NONE-'))
    edus = []
    last_edu_words = []
    last_edu_tags = []
    offset = 0
    for child in children:
        if child[0] == '-LRB-':
            child[0] = '('
        if child[0] == '-RRB-':
            child[0] = ')'
        last_edu_words.append(child[0])
        last_edu_tags.append(child.label())
        if child[0] in self._eos or (child[0] in self.candidate and self.model.predict(offset, parse)):
            text = "".join(last_edu_words)
            edu = EDU([TEXT(text)])
            setattr(edu, "words", last_edu_words)
            setattr(edu, "tags", last_edu_tags)
            edus.append(edu)
            last_edu_words = []
            last_edu_tags = []
        offset += len(child[0])
    if last_edu_words:
        text = "".join(last_edu_words)
        edu = EDU([TEXT(text)])
        setattr(edu, "words", last_edu_words)
        setattr(edu, "tags", last_edu_tags)
        edus.append(edu)
    return edus
def get_phrases(sentence):
    chunking_temp = []
    chunking = []
    sent_tagged = pos_tag(sentence)
    res = get_tregex(sentence, 'ROOT')
    if res:
        tree = ParentedTree.fromstring(res['0']['match'])
        tree.pretty_print()
        list_removed = []
        traverse_tree(tree, chunking_temp)
        print(chunking_temp)
        # fix overlapping strings
        for x in range(0, len(chunking_temp)):
            p_x, tagged_x = chunking_temp[x]
            for y in range(x + 1, len(chunking_temp)):
                p_y = chunking_temp[y][0]
                if tagged_x == 'VP' or tagged_x == 'S' or tagged_x == 'SBAR':
                    # do intersection
                    p_x = p_x.replace(p_y, '').strip()
                else:
                    if p_y in p_x and p_y not in list_removed:
                        list_removed.append(p_y)
            # chunking_temp[x][0] = re.sub(r" ", " ", chunking_temp[x][0])
            # if(chunking_temp[x][0] != ''):
            if p_x not in list_removed and p_x != '':
                tags = []
                for s in sent_tagged:
                    if s[0] in p_x:
                        tags.append(s[1])
                chunking.append((p_x, tagged_x, ' '.join(tags)))
    return chunking
def batch_extract(file):
    '''
    Extract the isolated events from the sentences in the txt file.
    file: txt with (ROOT (S ...) ) sentences
    '''
    tree = ""
    events = []
    # open the input file with the sentences
    with open(file, "r") as file:
        # read a sentence
        sentence = file.readline()
        while sentence:
            sentence = sentence.strip("\n")
            try:
                # make a tree from the sentence
                tree = ParentedTree.fromstring(sentence)
                event = extract_isolate(tree)
                events.append(event)
            except Exception as e:
                pass
            sentence = file.readline()
    return events
def toNLTKtree(tree_string):
    newTree = ParentedTree.fromstring(tree_string)
    return newTree
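A minimal example for the toNLTKtree wrapper; the parse string is illustrative:

tree = toNLTKtree("(S (NP (PRP He)) (VP (VBD left)))")
print(type(tree).__name__)  # ParentedTree
print(tree.leaves())        # ['He', 'left']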
def parse_indexify_transformations(in_p, out_p, label_voc, args): in_trimmed_seqs = [] in_seqs = [] out_trimmed_seqs = [] out_seqs = [] max_trans_size = 0 for idx in range(len(in_p)): # very rarely, a tree is invalid try: in_trimmed = ParentedTree.fromstring(in_p[idx]) in_orig = ParentedTree.fromstring(in_p[idx]) out_trimmed = ParentedTree.fromstring(out_p[idx]) out_orig = ParentedTree.fromstring(out_p[idx]) except: continue out_dh = parse_tree_level_dropout(out_trimmed, args.tree_level_dropout) parse_tree_level_dropout(in_trimmed, args.tree_level_dropout, level=out_dh) in_orig = deleaf(in_orig) in_trimmed = deleaf(in_trimmed) out_orig = deleaf(out_orig) out_trimmed = deleaf(out_trimmed) if max_trans_size < len(in_orig): max_trans_size = len(in_orig) if max_trans_size < len(out_orig): max_trans_size = len(out_orig) # only consider instances where top-level of input parse != top-level output if in_trimmed != out_trimmed: # make sure everything is invocab try: x = [label_voc[z] for z in in_orig] x = [label_voc[z] for z in out_orig] in_seqs.append(in_orig) out_seqs.append(out_orig) out_trimmed_seqs.append(out_trimmed) in_trimmed_seqs.append(in_trimmed) except: pass # no syntactic transformations in the batch! if len(in_seqs) == 0: return None # otherwise, indexify and return else: in_trans_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32') out_trans_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32') in_trimmed_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32') out_trimmed_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32') in_lengths = [] out_lengths = [] out_trimmed_lengths = [] in_trimmed_lengths = [] for idx in range(len(in_seqs)): curr_in = in_seqs[idx] in_trans_np[idx, :len(curr_in)] = [label_voc[z] for z in curr_in] in_lengths.append(len(curr_in)) curr_out = out_seqs[idx] out_trans_np[idx, :len(curr_out)] = [label_voc[z] for z in curr_out] out_lengths.append(len(curr_out)) curr_trimmed_in = in_trimmed_seqs[idx] in_trimmed_np[idx, :len(curr_trimmed_in)] = [label_voc[z] for z in curr_trimmed_in] in_trimmed_lengths.append(len(curr_trimmed_in)) curr_trimmed_out = out_trimmed_seqs[idx] out_trimmed_np[idx, :len(curr_trimmed_out)] = [label_voc[z] for z in curr_trimmed_out] out_trimmed_lengths.append(len(curr_trimmed_out)) # cut off extra padding in_trans_np = in_trans_np[:, :np.max(in_lengths)] out_trans_np = out_trans_np[:, :np.max(out_lengths)] in_trimmed_np = in_trimmed_np[:, :np.max(in_trimmed_lengths)] out_trimmed_np = out_trimmed_np[:, :np.max(out_trimmed_lengths)] return in_trans_np, out_trans_np, in_trimmed_np, out_trimmed_np,\ np.array(in_lengths, dtype='int32'), np.array(out_lengths, dtype='int32'),\ np.array(in_trimmed_lengths, dtype='int32'), np.array(out_trimmed_lengths, dtype='int32')
def indexify_transformations(in_p, out_p, label_voc, args): in_seqs = [] out_seqs = [] mismatch_inds = [] max_trans_size = 0 for idx in range(len(in_p)): # very rarely, a tree is invalid try: in_tree = ParentedTree.fromstring(in_p[idx]) out_tree = ParentedTree.fromstring(out_p[idx]) except: continue if args.tree_dropout > 0: tree_dropout(in_tree, args.tree_dropout, 0) tree_dropout(out_tree, args.tree_dropout, 0) elif args.tree_level_dropout > 0: parse_tree_level_dropout(in_tree, args.tree_level_dropout) parse_tree_level_dropout(out_tree, args.tree_level_dropout) in_full_trans = deleaf(in_tree) out_full_trans = deleaf(out_tree) if max_trans_size < len(in_full_trans): max_trans_size = len(in_full_trans) if max_trans_size < len(out_full_trans): max_trans_size = len(out_full_trans) # only consider instances where input syntax differs from output syntax if in_full_trans != out_full_trans: # make sure everything is invocab try: x = [label_voc[z] for z in in_full_trans] x = [label_voc[z] for z in out_full_trans] in_seqs.append(in_full_trans) out_seqs.append(out_full_trans) mismatch_inds.append(idx) except: pass # no syntactic transformations in the batch! if len(in_seqs) == 0: return None # otherwise, indexify and return else: in_trans_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32') out_trans_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32') in_lengths = [] out_lengths = [] for idx in range(len(in_seqs)): curr_in = in_seqs[idx] in_trans_np[idx, :len(curr_in)] = [label_voc[z] for z in curr_in] in_lengths.append(len(curr_in)) curr_out = out_seqs[idx] out_trans_np[idx, :len(curr_out)] = [label_voc[z] for z in curr_out] out_lengths.append(len(curr_out)) return in_trans_np, out_trans_np, mismatch_inds,\ np.array(in_lengths, dtype='int32'), np.array(out_lengths, dtype='int32')
'''
Main function.
'''
if __name__ == "__main__":
    print("Beginning parse of PTB.ext...")
    data = []
    num_lines = rawgencount("PTB.ext")
    with open('PTB.ext', encoding='utf-8') as f:
        for sent_tree in tqdm(f, total=num_lines):
            # Parse this sent_tree into an NLTK tree object
            tree = ParentedTree.fromstring(sent_tree)
            # Get all phrases in this tree
            for phrase in get_coordphrases(tree):
                conjuncts = phrase[0]
                conjunction = phrase[1]
                phrase_cat = phrase[2]
                phrase_text = phrase[3]
                sent_text = get_tree_text(tree)
                # Only include two-termed coordinations
                if len(conjuncts) != 2:
                    continue
                row = []
def encode_data(text, parsed_repr, bpe, pp_vocab, parse_gen_voc, parse_net, tp_templates, tp_template_lens, net, rev_label_voc, rev_pp_vocab): stime = time.time() ssent = ' '.join(text.split()) seg_sent = bpe.segment(ssent.lower()).split() results = [] results.append(reverse_bpe(seg_sent)) # encode sentence using pp_vocab, leave one word for EOS seg_sent = [pp_vocab[w] for w in seg_sent if w in pp_vocab] # add EOS seg_sent.append(pp_vocab['EOS']) torch_sent = Variable( torch.from_numpy(np.array(seg_sent, dtype='int32')).long().cuda()) torch_sent_len = torch.from_numpy(np.array([len(seg_sent)], dtype='int32')).long().cuda() # encode parse using parse vocab parse_tree = ParentedTree.fromstring(parsed_repr.strip()) parse_tree = deleaf(parse_tree) np_parse = np.array([parse_gen_voc[w] for w in parse_tree], dtype='int32') torch_parse = Variable(torch.from_numpy(np_parse).long().cuda()) torch_parse_len = torch.from_numpy( np.array([len(parse_tree)], dtype='int32')).long().cuda() # generate full parses from templates beam_dict = parse_net.batch_beam_search(torch_parse.unsqueeze(0), tp_templates, torch_parse_len[:], tp_template_lens, parse_gen_voc['EOP'], beam_size=3, max_steps=150) seq_lens = [] seqs = [] for b_idx in beam_dict: prob, _, _, seq = beam_dict[b_idx][0] seq = seq[:-1] # chop off EOP seq_lens.append(len(seq)) seqs.append(seq) np_parses = np.zeros((len(seqs), max(seq_lens)), dtype='int32') for z, seq in enumerate(seqs): np_parses[z, :seq_lens[z]] = seq tp_parses = Variable(torch.from_numpy(np_parses).long().cuda()) tp_len = torch.from_numpy(np.array(seq_lens, dtype='int32')).long().cuda() # generate paraphrases from parses try: beam_dict = net.batch_beam_search(torch_sent.unsqueeze(0), tp_parses, torch_sent_len[:], tp_len, pp_vocab['EOS'], beam_size=3, max_steps=40) for b_idx in beam_dict: prob, _, _, seq = beam_dict[b_idx][0] gen_parse = ' '.join([rev_label_voc[z] for z in seqs[b_idx]]) gen_sent = ' '.join([rev_pp_vocab[w] for w in seq[:-1]]) results.append(reverse_bpe(gen_sent.split())) except: print("beam search OOM") print(traceback.format_exc()) return results
def main(): files = glob.glob(TMP_PATH + "*.info.xml") OUT = open("out.out", "w") EXAMPLE_OUT = open("examples/" + SOURCE + "_" + LABEL + ".json", "w") num_examples = 0 total_files = len(files) no_tlinks = 0 print("[", file=EXAMPLE_OUT) for file in files: print(file, file=OUT) soup = BeautifulSoup(open(file), "html.parser") sentence = soup.sentence print(sentence.string, file=OUT) # store tokens in a list. tokens = [] for token in soup.tokens.find_all("t"): text = token.string.rsplit("\"", 3)[0].split("\"", 3)[-1] if text[0] == " ": text = text[1:] tokens.append(text) print(file=OUT) # parse events # <event id="e1" eiid="ei1" offset="2" string="said" tense="PAST" # aspect="NONE" class="REPORTING" polarity="POS" modality="" happen="" # lowerBoundDuration="" upperBoundDuration="" # /> eid_dict = {} eiid_dict = {} for event in soup.events.find_all("event"): text = event["string"] token_pos = event["offset"] eid_dict[event["id"]] = text eiid_dict[event["eiid"]] = text print(text, file=OUT) # parse timexes # <timex tid="t1" text="autumn" offset="19" length="1" type="DATE" # value="XXXX-FA" temporalFunction="false"/> timex_dict = {} for timex in soup.timexes.find_all("timex"): text = timex["text"] timex_dict[timex["tid"].strip()] = text print(text, file=OUT) print(file=OUT) tlinks = soup.find_all("tlink") if len(tlinks) == 0: no_tlinks += 1 print("NO TLINKS", file=OUT) else: headers = ["e1", "e2", "relation"] table = [] e1s = [] e2s = [] rels = [] for tlink in tlinks: e1 = tlink["event1"] e2 = tlink["event2"] if e1 in eid_dict: e1 = eid_dict[e1] elif e1 in eiid_dict: e1 = eiid_dict[e1] elif e1 in timex_dict: e1 = timex_dict[e1] else: print("ERROR: Can't find e1", file=OUT) print(eiid_dict) if e2 in eid_dict: e2 = eid_dict[e2] elif e2 in eiid_dict: e2 = eiid_dict[e2] elif e2 in timex_dict: e2 = timex_dict[e2] else: print("ERROR: Can't find e2", file=OUT) # print(e1, "\t", e2, "\t", tlink["relation"]) table.append([e1, e2, tlink["relation"]]) print(tabulate(table, headers=headers), file=OUT) print(file=OUT) parse = soup.parse.string t = ParentedTree.fromstring(parse) example = get_relation(t, LABEL) if example: if num_examples > 0: print(",", file=EXAMPLE_OUT) print(example.to_json(), file=EXAMPLE_OUT, end="") num_examples += 1 print(parse, file=OUT) print(file=OUT) print("\n]", file=EXAMPLE_OUT) print("total files: ", total_files) print("files without tlinks: ", no_tlinks) print("files with failed event parsing: ", failed_event_parse) print("files with successful event parsing: ", total_files - failed_event_parse)
def __populate_Parses(lang, parsejson, new_parsedict): """ """ # start CoreNLP servers for UD1 from stanfordnlp.server import CoreNLPClient cwd = os.getcwd() version = 'stanford-corenlp-full-2018-10-05' corenlp_path = re.findall(r'\S*/marta-v2', cwd)[0] + '/04_utils/' + version os.environ["CORENLP_HOME"] = corenlp_path if lang == 'en': lang = {} # i.e. CoreNLP defaults to English model corenlpclient_UD1 = CoreNLPClient(properties={ 'ssplit.isOneSentence': True, 'tokenize.whitespace': True }, annotators=[ 'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats' ], memory='2G', be_quiet=True, max_char_length=100000, output_format='conllu') # parse annotator is necessary to obtain udfeats (for postags) if lang == 'fr': lang = 'french' corenlpclient_UD1 = CoreNLPClient( properties=lang, annotators=[ 'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats' ], memory='2G', be_quiet=True, max_char_length=100000, output_format='conllu' ) # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html if lang == 'zh': lang = 'chinese' corenlpclient_UD1 = CoreNLPClient(properties=lang, annotators=[ 'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats' ], memory='2G', be_quiet=True, max_char_length=100000, output_format='conllu') # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html # begin processing for DocID in parsejson: print('Now processing: ', dataset, DocID) sentence_offset = 0 # this is the 4th element in a TokenList # obtain the gold constituency parses for the document. ConstTrees = __obtain_ConstTrees_Gold( DocID, readpath='./03_data/{}/{}tbRoot/{}/', lang=LANG) for sentence in parsejson[DocID]['sentences']: # 1. create a ParsePDTB object __parsepdtb = ParsePDTB( lang=LANG, docid=DocID, sentid=sentence_offset, gold_consttree=ConstTrees[sentence_offset], pdtb_version=PDTB_VERSION) # 2. add to .RawText and .Words __parsepdtb.RawText = " ".join( [word[0] for word in sentence['words']]) __parsepdtb.Words = sentence['words'] # 3. add to ConstTree_Auto. generate parse if missing if sentence['parsetree'] == '(())\n': _parse = a2_parsers._parse_rawtext2consttree( LANG, __parsepdtb.RawText, tokenized=True) __parsepdtb.ConstTree_Auto = _parse else: __parsepdtb.ConstTree_Auto = sentence['parsetree'] # 3. write to temp file, for converting to SD/UD1 in next steps with open('./02_modelbuilding/02_output/input_temp.parser', 'w+') as f: f.write(__parsepdtb.ConstTree_Gold) # 4. 
convert constituency parse to gold UD 1.0 and add to DepTree_UD1_Gold a2_parsers.convert_const2dep( LANG, dataset, filename='', readpath='/02_modelbuilding/02_output/input_temp.parser', writepath='/02_modelbuilding/02_output/output_temp.parser', format_='UD1', usage='experiments') with open('./02_modelbuilding/02_output/output_temp.parser', 'r') as f: UD1_Gold_conllu = f.read() def __conllu2tuple(conllu_doc): """helper function to convert CoNLL format into 3-tuple used by CoNLL 2016 organisers to store dependency parses """ to_list = conllu_doc.split('\n') tokenlist = [ i.split('\t')[1] + '-' + i.split('\t')[0] for i in to_list if i != '' ] # convert CoNLL line to <wordform>-<token num> tokenlist.insert(0, 'ROOT-0') # add a root token to the start deptree_gold = [ [ i.split('\t')[7], tokenlist[int(i.split('\t')[6])], i.split('\t')[1] + '-' + i.split('\t')[0] ] for i in to_list if i != '' ] # convert to CoNLL 2016 dependencies format return deptree_gold __parsepdtb.DepTree_UD1_Gold = __conllu2tuple(UD1_Gold_conllu) # 5. automatically generate UD 1.0 constituency parse (from raw text), place into same 3-tuple format as CoNLL 2016 Shared Task,and add to DepTree_UD1_Auto UD1_Auto_conllu = corenlpclient_UD1.annotate( __parsepdtb.RawText) __parsepdtb.DepTree_UD1_Auto = __conllu2tuple(UD1_Auto_conllu) # 6. add PTB-style and UD pos tags to .Words. Each of the variable below contain a list comprising 2-tuples. each tuple is (<wordform>, <part of speech>) globals()['pos_PTBGold'] = [ i for i in ParentedTree.fromstring( __parsepdtb.ConstTree_Gold).pos() if i[-1] != '-NONE-' ] # gold PTB parses have traces and these causes misalignment with the surface form. we drop these since parsers don't predict traces (Johannsen & Søgaard, 2013) globals()['pos_PTBAuto'] = ParentedTree.fromstring( __parsepdtb.ConstTree_Auto).pos() globals()['pos_UDGold'] = [(i.split('\t')[1], i.split('\t')[3]) for i in UD1_Gold_conllu.split('\n') if i != ''] globals()['pos_UDAuto'] = [(i.split('\t')[1], i.split('\t')[3]) for i in UD1_Auto_conllu.split('\n') if i != ''] for postagset in ['PTBGold', 'PTBAuto', 'UDGold', 'UDAuto']: try: _tagset = globals()['pos_' + postagset] assert len(_tagset) == len(__parsepdtb.Words) for idx in range(len(__parsepdtb.Words)): # add the part of speech as a new key in the dictionary for the token in .Words __parsepdtb.Words[idx][1].update( {'PartOfSpeech_' + postagset: _tagset[idx][1]}) except AssertionError as e: e.args += ( postagset.upper() + " is not of the same size as the .Words attribute for this sentence.", ) print(e) print("Continuing to attempt alignment of tokens.") _words = [i[0] for i in __parsepdtb.Words] _words_maxidx = len(_words) - 1 #'drop' the additional tokens in _tagset _tagset = [i for i in _tagset if i[0] in _words] _words_curridx = -1 # start with -1 for idx in range(len(_tagset)): _words_curridx += 1 while __parsepdtb.Words[_words_curridx][ 0] != _tagset[idx][ 0] and _words_curridx < _words_maxidx: __parsepdtb.Words[_words_curridx][1].update( { 'PartOfSpeech_' + postagset: 'ParserError' } ) # place a marker identifying the missing pos tag as an error from parsing _words_curridx += 1 __parsepdtb.Words[_words_curridx][1].update( {'PartOfSpeech_' + postagset: _tagset[idx][1]}) continue # raise sentence_offset += 1 # increase sentence offset before moving to handle next sentence try: new_parsedict[DocID].append(__parsepdtb) except: new_parsedict[DocID] = [__parsepdtb] # shut down the CoreNLP servers corenlpclient_UD1.stop()
def test_read_indexed_tree(self):
    leafreader = IndexedLeafReader()
    tree = ParentedTree.fromstring(
        '(S1 (S (S (NP (PRP I|0|1)) (VP (VBP have|2|6) (NP (DT a|3|4) (NN book|9|13)))) (. .|13|14)))',
        read_leaf=leafreader.read_leaf)
    print(tree.pprint(margin=float("inf")))
def encode_data(out_file): fn = ['idx', 'template', 'generated_parse', 'sentence'] ofile = codecs.open(out_file, 'w', 'utf-8') out = csv.DictWriter(ofile, delimiter='\t', fieldnames=fn) out.writerow(dict((x,x) for x in fn)) # read parsed data infile = codecs.open(args.parsed_input_file, 'r', 'utf-8', errors='ignore') inrdr = csv.DictReader(infile, delimiter='\t') # loop over sentences and transform them for d_idx, ex in enumerate(inrdr): stime = time.time() ssent = ' '.join(ex['tokens'].split()) seg_sent = bpe.segment(ssent.lower()).split() # write gold sentence out.writerow({'idx': ex['idx'], 'template':'GOLD', 'generated_parse':ex['parse'], 'sentence':reverse_bpe(seg_sent)}) # encode sentence using pp_vocab, leave one word for EOS seg_sent = [pp_vocab[w] for w in seg_sent if w in pp_vocab] # add EOS seg_sent.append(pp_vocab['EOS']) if args.gpu >= 0: torch_sent = Variable(torch.from_numpy(np.array(seg_sent, dtype='int32')).long().cuda()) torch_sent_len = torch.from_numpy(np.array([len(seg_sent)], dtype='int32')).long().cuda() else: torch_sent = Variable(torch.from_numpy(np.array(seg_sent, dtype='int32')).long()) torch_sent_len = torch.from_numpy(np.array([len(seg_sent)], dtype='int32')).long() # encode parse using parse vocab parse_tree = ParentedTree.fromstring(ex['parse'].strip()) parse_tree = deleaf(parse_tree) np_parse = np.array([parse_gen_voc[w] for w in parse_tree], dtype='int32') if args.gpu >= 0: torch_parse = Variable(torch.from_numpy(np_parse).long().cuda()) torch_parse_len = torch.from_numpy(np.array([len(parse_tree)], dtype='int32')).long().cuda() else: torch_parse = Variable(torch.from_numpy(np_parse).long()) torch_parse_len = torch.from_numpy(np.array([len(parse_tree)], dtype='int32')).long() # generate full parses from templates beam_dict = parse_net.batch_beam_search(torch_parse.unsqueeze(0), tp_templates, torch_parse_len[:], tp_template_lens, parse_gen_voc['EOP'], beam_size=3, max_steps=150) seq_lens = [] seqs = [] for b_idx in beam_dict: prob,_,_,seq = beam_dict[b_idx][0] seq = seq[:-1] # chop off EOP seq_lens.append(len(seq)) seqs.append(seq) np_parses = np.zeros((len(seqs), max(seq_lens)), dtype='int32') for z, seq in enumerate(seqs): np_parses[z, :seq_lens[z]] = seq if args.gpu >= 0: tp_parses = Variable(torch.from_numpy(np_parses).long().cuda()) tp_len = torch.from_numpy(np.array(seq_lens, dtype='int32')).long().cuda() else: tp_parses = Variable(torch.from_numpy(np_parses).long()) tp_len = torch.from_numpy(np.array(seq_lens, dtype='int32')).long() # generate paraphrases from parses # try: beam_dict = net.batch_beam_search(torch_sent.unsqueeze(0), tp_parses, torch_sent_len[:], tp_len, pp_vocab['EOS'], beam_size=3, max_steps=40) for b_idx in beam_dict: prob,_,_,seq = beam_dict[b_idx][0] gen_parse = ' '.join([rev_label_voc[z] for z in seqs[b_idx]]) gen_sent = ' '.join([rev_pp_vocab[w] for w in seq[:-1]]) out.writerow({'idx': ex['idx'], 'template':templates[b_idx], 'generated_parse':gen_parse, 'sentence':reverse_bpe(gen_sent.split())}) # except: # print('beam search OOM') print(d_idx, time.time() - stime)
dest="parse", default="data/projectsyndicate/projectsyndicate.truecased.de.20.parse", help="german parses parse persentence") opt.add_option("-m", dest="map", default="data/projectsyndicate/de-negra.map") opt.add_option("-s", dest="word2unipos", default="data/projectsyndicate/de.pos") (options, _) = opt.parse_args() word2pos = {} f = codecs.open(options.parse, 'r', 'utf-8') posmap = {} for line in codecs.open(options.map, 'r', 'utf-8').readlines(): de_pos, uni_pos = line.split() posmap[de_pos] = uni_pos with f: for line in f: t = ParentedTree.fromstring(line.strip()) for pos_token in t.subtrees(lambda t: t.height() == 2): pos_token = str(pos_token)[1:-1] (pos, token) = pos_token.split() token = token.encode('utf-8') s = word2pos.get(token, set()) s.add(posmap[pos]) word2pos[token] = s f.close() w = codecs.open(options.word2unipos, 'w', 'utf-8') for token, set_pos in word2pos.items(): print token, ':', ','.join(set_pos) w.write(token.encode('utf-8') + '\t' + ','.join(set_pos) + '\n') w.flush() w.close()
def tree_to_ptree(tree: nltk.Tree):
    tree_str = tree.__str__()
    ptree = PTree.fromstring(tree_str)
    return ptree
def __init__(self, parse_string):
    self.__tree = ParentedTree.fromstring(parse_string)
args = get_args()
i = 1
tot = str(len(args.input_files))
for file in args.input_files:
    print("(" + str(i) + "/" + tot + ")")
    print("Gathering coordination stats from " + file + "...")
    sents = pd.read_csv(file)
    data = []
    for index, row in tqdm(sents.iterrows(), total=len(sents.index)):
        parse_tree = row["Sentence Parse Tree"]
        tree = ParentedTree.fromstring(parse_tree)
        sent = get_tree_text(tree)
        for coord in get_simple_coordphrases(tree):
            category1 = coord[0][0]
            conjunct1 = coord[0][1]
            conjunction = coord[1]
            category2 = coord[2][0]
            conjunct2 = coord[2][1]
            data.append([category1, conjunct1, category2, conjunct2, conjunction, sent, parse_tree])
    columns = ['1st Conjunct Category', '1st Conjunct Text', '2nd Conjunct Category',
               '2nd Conjunct Text', 'Conjunction', 'Sentence Text', 'Sentence Parse Tree']
    df = pd.DataFrame(data, columns=columns)
    df.drop_duplicates(inplace=True)
def test_read_normal_tree(self):
    leafreader = LeafReader('I have a book.')
    tree = ParentedTree.fromstring(
        '(S1 (S (S (NP (PRP I)) (VP (VBP have) (NP (DT a) (NN book)))) (. .)))',
        read_leaf=leafreader.read_leaf)
    print(tree.pprint(margin=float("inf")))
def match(self, tree):
    if not isinstance(tree, ParentedTree):
        raise AttributeError
    self._query_tree = ParentedTree.fromstring(str(tree))
def train(epoch, model, train_data, valid_data, train_loader, valid_loader, optimizer, criterion, dictionary, bpe, args): timer = Timer() n_it = len(train_loader) for it, data_idxs in enumerate(train_loader): model.train() data_idxs = np.sort(data_idxs.numpy()) # get batch of raw sentences and raw syntax sents_ = train_data[0][data_idxs] synts_ = train_data[1][data_idxs] batch_size = len(sents_) # initialize tensors sents = np.zeros((batch_size, args.max_sent_len), dtype=np.long) # words without position synts = np.zeros((batch_size, args.max_synt_len+2), dtype=np.long) # syntax targs = np.zeros((batch_size, args.max_sent_len+2), dtype=np.long) # target output for i in range(batch_size): # bpe segment and convert to tensor sent_ = sents_[i] sent_ = bpe.segment(sent_).split() sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_] sents[i, :len(sent_)] = sent_ # add <sos> and <eos> for target output targ_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]] targs[i, :len(targ_)] = targ_ # parse syntax and convert to tensor synt_ = synts_[i] synt_ = ParentedTree.fromstring(synt_) synt_ = deleaf(synt_) synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx] synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]] synts[i, :len(synt_)] = synt_ sents = torch.from_numpy(sents).cuda() synts = torch.from_numpy(synts).cuda() targs = torch.from_numpy(targs).cuda() # forward outputs = model(sents, synts, targs) # calculate loss targs_ = targs[:, 1:].contiguous().view(-1) outputs_ = outputs.contiguous().view(-1, outputs.size(-1)) optimizer.zero_grad() loss = criterion(outputs_, targs_) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() if it % args.log_interval == 0: # print current loss valid_loss = evaluate(model, valid_data, valid_loader, criterion, dictionary, bpe, args) print("| ep {:2d}/{} | it {:3d}/{} | {:5.2f} s | loss {:.4f} | g_norm {:.6f} | valid loss {:.4f} |".format( epoch, args.n_epoch, it, n_it, timer.get_time_from_last(), loss.item(), model.grad_norm, valid_loss)) if it % args.gen_interval == 0: # generate output to args.output_dir generate(epoch, it, model, valid_data, valid_loader, dictionary, bpe, args) if it % args.save_interval == 0: # save model to args.model_dir torch.save(model.state_dict(), os.path.join(args.model_dir, "synpg_epoch{:02d}.pt".format(epoch)))