def get_features(ptree: nltk.ParentedTree, dtree: List[DepRel], indices, sense, offset):
    """Build one feature dictionary per token of a parsed sentence.

    Args:
        ptree: Constituency parse of the sentence.
        dtree: Dependency relations. The index of the first relation whose
            ``rel`` equals ``'root'`` (case-insensitively) is used as the
            main-verb token position; position 0 is used when none matches.
        indices: Container of token indices; a token whose ``offset + i`` is
            in it receives the connective label.
        sense: Dotted sense string; only the part before the first ``.`` is
            used as the connective label.
        offset: Added to the sentence-local token position to form ``idx``.

    Returns:
        list[dict]: One feature dict per entry of ``ptree.pos()``, in order.
    """
    root_at = next(
        (pos for pos, drel in enumerate(dtree) if drel.rel.lower() == 'root'),
        0,
    )
    tagged = ptree.pos()
    main_verb = lemmatizer.lemmatize(tagged[root_at][0])

    features_sentence = []
    for i, (word, tag) in enumerate(tagged):
        # Strip the last two path components so only the nodes above the
        # token's POS node remain.
        ancestors = ptree.treeposition_spanning_leaves(i, i + 1)[:-2]
        chain = []
        for depth in range(1, len(ancestors) + 1):
            label = ptree[ancestors[:depth]].label()
            chain.append('S' if label == 'SBAR' else label)
        # NOTE: when there are no ancestors the value stays an empty list
        # rather than becoming a string.
        if chain:
            chain = "-".join(get_compressed_chain(chain))

        stem = stemmer.stem(word).lower()
        features_sentence.append({
            'idx': offset + i,
            'BOS': i == 0,
            'word': word.lower(),
            'pos': tag,
            'lemma': lemmatizer.lemmatize(word).lower(),
            'stem': stem.lower(),
            'chain': chain,
            'conn': sense.split('.')[0] if offset + i in indices else "",
            'inflection': word[len(stem):],
            'is_main_verb': i == root_at,
            'main_verb': main_verb.lower(),
        })
    return features_sentence
def get_features(ptree: nltk.ParentedTree, conn_idxs):
    """Compute syntactic context features for one connective.

    Args:
        ptree: Constituency parse of the sentence containing the connective.
        conn_idxs: Leaf indices of the connective tokens; only the first and
            last entries are used to delimit the span.

    Returns:
        dict: Feature name -> value, describing the connective string, the
        node spanning it, its siblings and parent, the neighbouring words and
        the root-to-node label paths.
    """
    first, last = conn_idxs[0], conn_idxs[-1]
    leaves = ptree.leaves()

    # Position of the node covering the whole connective span (last path
    # component removed).
    lca_loc = ptree.treeposition_spanning_leaves(first, last + 1)[:-1]
    lca_node = ptree[lca_loc]
    self_category = lca_node.label()
    if lca_loc:
        parent_category = lca_node.parent().label()
    else:
        # The spanning node is the root: reuse its own label.
        parent_category = self_category
    left_sibling = get_sibling_label(lca_node, 'left')
    right_sibling = get_sibling_label(lca_node, 'right')

    phrase_labels = {n.label() for n in ptree.subtrees(lambda t: t.height() > 2)}

    c = ' '.join(leaves[first:last + 1]).lower()
    prev_word, prev_conn, prev_pos, prev_pos_conn_pos = get_pos_features(ptree, conn_idxs, c, -1)
    next_word, next_conn, next_pos, next_pos_conn_pos = get_pos_features(ptree, conn_idxs, c, 1)
    prev_word = lemmatizer.lemmatize(prev_word)
    next_word = lemmatizer.lemmatize(next_word)

    r2l = [ptree[lca_loc[:depth]].label() for depth in range(1, len(lca_loc) + 1)]
    r2lcomp = get_compressed_chain(r2l)

    return {
        'connective': c,
        'connectivePOS': self_category,
        'prevWord': prev_word,
        # NOTE(review): this stores the second value returned by
        # get_pos_features while 'nextPOSTag' stores the third; confirm the
        # asymmetry is intended before changing it (trained models may
        # depend on it).
        'prevPOSTag': prev_conn,
        'prevPOS+cPOS': prev_pos_conn_pos,
        'nextWord': next_word,
        'nextPOSTag': next_pos,
        'cPOS+nextPOS': next_pos_conn_pos,
        'root2LeafCompressed': ','.join(r2lcomp),
        'root2Leaf': ','.join(r2l),
        'left_sibling': left_sibling,
        'right_sibling': right_sibling,
        'parentCategory': parent_category,
        'boolVP': 'VP' in phrase_labels,
        'boolTrace': 'T' in phrase_labels,
    }
def match(self, tree):
    """Match a ``ROOT > SBARQ`` question that starts with "when".

    The tree must have the shape
    ``(ROOT (SBARQ (_ (WRB when) ...) (SQ (VBD ...) (NP ...) (VP ...) ...)))``.

    Args:
        tree: Parse tree to inspect.

    Returns:
        list: A single-element list holding a ``Pattern.Part`` whose
        ``object`` is a copy of the NP and whose ``property`` is a new VP
        built from the WRB, VBD and VP nodes; an empty list when the tree
        does not have the expected shape.
    """
    try:
        # Evaluated left to right, so a tree that is too shallow raises
        # IndexError at the first missing child and is reported as "no match".
        shape_ok = (
            tree.label() == 'ROOT'
            and tree[0].label() == 'SBARQ'
            and tree[0][0][0].label() == 'WRB'
            and tree[0][0][0][0].lower() == 'when'
            and tree[0][1].label() == 'SQ'
            and tree[0][1][0].label() == 'VBD'
            and tree[0][1][1].label() == 'NP'
            and tree[0][1][2].label() == 'VP'
        )
        if not shape_ok:
            return []

        part = Pattern.Part()
        # Round-trip through the string form to get detached copies.
        part.object = ParentedTree.fromstring(str(tree[0][1][1]))
        verb_phrase = Tree('VP', [
            Tree.fromstring(str(tree[0][0][0])),
            Tree.fromstring(str(tree[0][1][0])),
            Tree.fromstring(str(tree[0][1][2])),
        ])
        part.property = ParentedTree.fromstring(str(verb_phrase))
        return [part]
    except IndexError:
        return []
def walker(self, parent):
    """Recursively collect a part for every preposition "of" in the tree.

    For each ``IN`` node whose only leaf is ``"of"``, a ``Pattern.Part`` is
    appended to ``self._parts``: its ``object`` is a copy of the node's right
    sibling and its ``property`` is a copy of the query tree with the node's
    parent removed.

    Args:
        parent: Current node of the traversal.
    """
    is_of_preposition = parent.label() == 'IN' and parent.leaves() == ["of"]
    if is_of_preposition:
        # Position of the phrase that contains the preposition.
        pos = parent.parent().treeposition()
        part = Pattern.Part()
        part.object = ParentedTree.fromstring(str(parent.right_sibling()))
        part.property = ParentedTree.fromstring(str(self.get_query_tree()))
        # Cut that phrase out of the copied query tree.
        part.property[pos[:-1]].remove(part.property[pos])
        self._parts.append(part)

    for child in parent:
        if not isinstance(child, ParentedTree):
            continue  # leaves are plain values, nothing to descend into
        self.walker(child)
def prune_tree(cls, tree, begin_index, end_index):
    """
    Build a pruned copy of ``tree`` that only contains the leaves from
    ``begin_index`` to ``end_index`` (both leaf indices), together with the
    nodes needed to reach them.

    NOTE(review): this code uses ``xrange``/``unicode`` (Python 2 only) and
    the ``.node`` attribute / string-argument ``ParentedTree`` constructor,
    which look like the pre-3.0 NLTK API -- confirm the targeted versions.
    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.

    :param tree: source tree (must support ``parent()``/``parent_index()``)
    :param begin_index: index of the first leaf to keep
    :param end_index: index of the last leaf to keep
    :return: the newly built tree
    """
    begin_path = tree.leaf_treeposition(begin_index)
    end_path = tree.leaf_treeposition(end_index)
    # current_node is not used further down in this function.
    current_node = tree[begin_path[:-1]]
    # Parent node of the last leaf to keep; reaching it ends the walk.
    end_node = tree[end_path[:-1]]
    # Initialize new tree with a childless copy of the root.
    new_tree = ParentedTree('(' + tree.node + ')')
    # l is not used further down in this function.
    l = []
    current_new = new_tree
    current_old = tree
    # Descend along the path to the first kept leaf, mirroring each node on
    # the way as an empty node in the new tree.
    for i in xrange(len(begin_path) - 1):
        if type(current_old[begin_path[i]]) != str:
            current_new.insert(
                0, ParentedTree('(' + current_old[begin_path[i]].node + ')'))
            current_new = current_new[0]
            current_old = current_old[begin_path[i]]
    # Walk the old tree from there until the parent of the last kept leaf is
    # reached, copying nodes and leaves into the new tree as they are met.
    while current_old != end_node:
        if not (type(current_old[0]) == str or type(current_old[0]) == unicode):
            # First child is a subtree: step down and mirror it.
            current_old = current_old[0]
            current_new.insert(0, ParentedTree('(' + current_old.node + ')'))
            current_new = current_new[0]
        else:
            # First child is a leaf: copy it, then move on to the next node.
            current_new.insert(0, current_old[0])
            # Climb while the current node is the last child of its parent.
            while len(current_old.parent() ) == current_old.parent_index() + 1:
                current_old = current_old.parent()
                current_new = current_new.parent()
            # Step to the right sibling and mirror it in the new tree.
            current_old = current_old.parent()[current_old.parent_index() + 1]
            current_new.parent().insert(
                current_new.parent_index() + 1,
                ParentedTree('(' + current_old.node + ')'))
            current_new = current_new.parent()[current_new.parent_index() + 1]
    # Copy the first child of the end node (the last kept leaf).
    current_new.insert(0, current_old[0])
    # print current_new
    return new_tree
def __init__(self, id_sentence, basic_dependencies=None, collapsed_dependencies=None, parsetree='', text=''):
    """Create a sentence holder.

    Args:
        id_sentence: Integer identifier of the sentence.
        basic_dependencies: ``None`` or a list, wrapped in a
            ``DependencyGraph``.
        collapsed_dependencies: ``None`` or a list, wrapped in a
            ``DependencyGraph``.
        parsetree: Parse tree source; kept raw in ``_parsetree`` and passed
            to ``ParentedTree`` for ``parsetree``.
        text: When truthy it must be a list. It is only validated here; the
            visible code does not store it.

    Raises:
        AssertionError: If an argument has the wrong type.
    """
    from nltk import ParentedTree

    assert type(id_sentence) == int, 'Wrong id type'
    dependency_checks = (
        (basic_dependencies, 'Basic dependencies type'),
        (collapsed_dependencies, 'Collapsed dependencies type'),
    )
    for deps, message in dependency_checks:
        assert deps is None or type(deps) == list, message
    if text:
        assert type(text) == list, 'Wrong text type'

    # Identity and syntactic analyses.
    self.id_sentence = id_sentence
    self.basic_dependencies = DependencyGraph(basic_dependencies)
    self.collapsed_dependencies = DependencyGraph(collapsed_dependencies)
    self._parsetree = parsetree
    self.parsetree = ParentedTree(parsetree)

    # Content and links, all left empty at construction time.
    self.words = []
    self.next = None
    self.previous = None
    self.coreference_mentions = []
    self.coreference_representatives = []
    self._connected_sentences = None
def extract_independent_clauses(input_sent, predictor):
    """Split a sentence into clause strings using its constituency parse.

    The sentence is parsed with ``predictor``; every subtree accepted by
    ``filt_r`` or ``filt_l`` is a candidate clause. Candidates that are the
    parent of another candidate are dropped so that only the innermost ones
    remain. If nothing matches, the whole sentence is returned as one clause.

    Args:
        input_sent: Sentence text passed to ``predictor.predict(sentence=...)``.
        predictor: Object whose ``predict`` result contains the bracketed
            parse under the ``"trees"`` key.

    Returns:
        list[str]: One string per clause, first letter title-cased, no space
        before ``. ! ? , ;`` and before the ``’`` apostrophe.
    """
    output = predictor.predict(sentence=input_sent)
    t = ParentedTree.fromstring(output["trees"])
    candidates = list(t.subtrees(filter=lambda x: filt_r(x) or filt_l(x)))

    # Filter by identity instead of removing from the list while iterating
    # it: the old in-place removal skipped elements, which could leave a
    # parent next to its own candidate child.
    parent_ids = {id(node.parent()) for node in candidates}
    candidate_nodes = [node for node in candidates if id(node) not in parent_ids]

    sub_sentences = [candidate.leaves() for candidate in candidate_nodes]
    if not sub_sentences:
        sub_sentences = [t.leaves()]

    sentences = []
    for words in sub_sentences:
        pieces = []
        for i, word in enumerate(words):
            if i == 0:
                pieces.append(word[0].title() + word[1:])
            elif word in [".", "!", "?", ",", ";"]:
                pieces.append(word)
            else:
                pieces.append(" " + word)
        clause = "".join(pieces)
        # Re-attach clitics that the tokenizer split off.
        clause = clause.replace(" ’", "’")
        clause = clause.replace(" n’", "n’")
        sentences.append(clause)
    return sentences
def simplify_tree(ptree: nltk.ParentedTree, collapse_root=False):
    """Collapse unary chains of a parse tree into joined labels.

    A node with exactly one child, where that child is a tree whose own first
    child is also a tree, absorbs the child: the labels are joined with ``+``
    (unless they are equal) and the grandchildren become direct children.

    Side effect: the label of ``ptree`` itself is overwritten with ``'S'``.

    Args:
        ptree: Tree to simplify.
        collapse_root: When false and the root has a single child, the
            traversal starts below the root so the root is never merged.

    Returns:
        nltk.ParentedTree: The simplified tree.
    """
    ptree._label = 'S'
    tree = nltk.Tree.convert(ptree)

    skip_root = not collapse_root and isinstance(tree, nltk.Tree) and len(tree) == 1
    pending = [tree[0]] if skip_root else [tree]

    # Depth-first traversal driven by an explicit stack.
    while pending:
        node = pending.pop()
        if not isinstance(node, nltk.Tree):
            continue
        is_unary_chain = (
            len(node) == 1
            and isinstance(node[0], nltk.Tree)
            and isinstance(node[0, 0], nltk.Tree)
        )
        if not is_unary_chain:
            pending.extend(node)
            continue
        only_child = node[0]
        if node.label() != only_child.label():
            node.set_label(node.label() + '+' + only_child.label())
        node[0:] = list(only_child)
        # The node has new children now, so it must be looked at again.
        pending.append(node)

    return nltk.ParentedTree.convert(tree)
def process_sentence(word_array, results):
    """Parse one sentence and append a row per token to ``results``.

    Args:
        word_array: Tokens of the sentence; joined with spaces (newlines
            removed) before parsing.
        results: DataFrame the per-token rows are appended to.

    Returns:
        tuple: ``(str(tree), results)`` where ``results`` is a new DataFrame
        with one extra row per leaf of the parse.

    Notes:
        ``parser``, ``compute_total_trace``, ``story`` and ``sentence`` are
        read from the enclosing module.
    """
    txt = ' '.join(word_array).replace('\n', '')
    tree = ParentedTree.convert(next(parser.raw_parse(txt)))
    leaf_values = tree.leaves()
    if len(leaf_values) != len(word_array):
        print('This may not happen')

    rows = []
    # enumerate() yields the real leaf index; looking it up with
    # list.index() returned the first occurrence for repeated tokens.
    for leaf_index, token in enumerate(leaf_values):
        tree_location = tree.leaf_treeposition(leaf_index)
        parent = tree[tree_location[:-1]]
        trace, POS_stanf = compute_total_trace(parent)
        rows.append({
            'trace': trace,
            'POS_stanf': POS_stanf,
            'cd_idx': story,
            'sentence_count': sentence,
            'token_count': leaf_index + 1,
            'token': token,
        })

    if rows:
        # One concat instead of DataFrame.append per token (append was
        # removed in pandas 2.0 and copied the frame on every call).
        results = pd.concat([results, pd.DataFrame(rows)], ignore_index=True)
    return str(tree), results
def getSentParses(sentence): if type(sentence) != str or len(sentence.split()) <= 1: return [] #Convert sentence into Stanford-parsed tree sentence = ParentedTree.convert(list(parser.raw_parse(sentence))[0]) #Split sentences if they contain multiple full sentences separated by ';', etc. sentences = [] if (sentence[0].label() == 'S') and (sentence[0,0].label() == 'S'): for i in range(len(sentence[0])): sentences += [sentence[0,i]] else: for i in range(len(sentence)): sentences += [sentence[i]] #Obtain desired tuple relations parsedSents = [] for sentence in sentences: print "Current subsentence", sentence.leaves() parsedSents += [getPrepParse(sentence)] parsedSents += [getSVBroadParse(sentence)] #Basic stupid coreferencing defaultSet = False for parsedSent in parsedSents: if len(parsedSent) == 0: continue if parsedSent[1].label() == 'NP' and parsedSent[1][0].label() != 'PRP': default = parsedSent[1] defaultSet = True if parsedSent[1].label() == 'NP' and parsedSent[1][0].label() == 'PRP' and defaultSet: parsedSent[1] = default return parsedSents
def match(self, *args, **kwargs):
    """Match a ``ROOT > SBARQ > WHNP > WP`` question for ``self._keyword``.

    After delegating to ``Pattern.match``, the query tree must look like
    ``(ROOT (SBARQ (WHNP (WP <keyword>) ...) (SQ x y ...)))`` with at least
    two children under ``SQ``.

    Returns:
        list: ``self._parts`` with a new ``Pattern.Part`` appended whose
        ``object`` is a copy of the second child of ``SQ``; an empty list
        when the tree does not have the expected shape.
    """
    Pattern.match(self, *args, **kwargs)
    try:
        query = self.get_query_tree()
        # Checked left to right; a tree that is too shallow raises
        # IndexError and is reported as "no match".
        shape_ok = (
            query.label() == "ROOT"
            and query[0].label() == "SBARQ"
            and query[0][0].label() == "WHNP"
            and query[0][0][0].label() == "WP"
            and query[0][0][0][0].lower() == self._keyword
            and query[0][1].label() == "SQ"
            and len(query[0][1]) >= 2
        )
        if not shape_ok:
            return []
        part = Pattern.Part()
        part.object = ParentedTree.fromstring(str(query[0][1][1]))
        self._parts.append(part)
        return self._parts
    except IndexError:
        return []
def run(self, args):
    """Re-serialise parse trees with leaves produced by a ``LeafReader``.

    For every (text file, parse file, output file) triple returned by
    ``self.get_io_files``, the JSON list of bracketed parses is read, each
    parse is rebuilt with ``LeafReader.read_leaf`` as the leaf reader, and
    the single-line string forms are written back as a JSON list.

    Args:
        args: Mapping with ``args["input"]["text"][0]``,
            ``args["input"]["parse"][0]``, ``args["output"]["parse"][0]``
            and ``args["doc_list"]``.
    """
    input_text = args["input"]["text"][0]
    input_parse = args["input"]["parse"][0]
    output_parse = args["output"]["parse"][0]
    doc_list = args["doc_list"]
    tuples = self.get_io_files([input_text, input_parse, output_parse], doc_list)

    for in_text_file, in_parse_file, out_parse_file in tuples:
        text = self.read_file(in_text_file)
        parses = json.loads(self.read_file(in_parse_file))
        leafreader = LeafReader(text)

        indexed_parses = []
        for parse in parses:
            tree = ParentedTree.fromstring(parse, read_leaf=leafreader.read_leaf)
            # In NLTK 3 pprint() prints and returns None; pformat() returns
            # the string. Fall back to pprint() for releases without pformat.
            render = getattr(tree, "pformat", tree.pprint)
            indexed_parses.append(render(margin=float("inf")))

        # The parses are stored as a JSON list of strings.
        self.write_file(json.dumps(indexed_parses), out_parse_file)
def match(self, *args, **kwargs):
    """Match a ``ROOT > SBARQ > WHNP > WP`` question for ``self._keyword``.

    After delegating to ``Pattern.match``, the query tree must look like
    ``(ROOT (SBARQ (WHNP (WP <keyword>) ...) (SQ x y ...)))`` with at least
    two children under ``SQ``.

    Returns:
        list: ``self._parts`` with a new ``Pattern.Part`` appended whose
        ``object`` is a copy of the second child of ``SQ``; an empty list
        when the tree does not have the expected shape.
    """
    Pattern.match(self, *args, **kwargs)
    try:
        query = self.get_query_tree()
        # Checked left to right; a tree that is too shallow raises
        # IndexError and is reported as "no match".
        shape_ok = (
            query.label() == "ROOT"
            and query[0].label() == "SBARQ"
            and query[0][0].label() == "WHNP"
            and query[0][0][0].label() == "WP"
            and query[0][0][0][0].lower() == self._keyword
            and query[0][1].label() == "SQ"
            and len(query[0][1]) >= 2
        )
        if not shape_ok:
            return []
        part = Pattern.Part()
        part.object = ParentedTree.fromstring(str(query[0][1][1]))
        self._parts.append(part)
        return self._parts
    except IndexError:
        return []
def ptph(self, rel):
    """Return the ``ptp=`` feature string for a relation.

    The value is whatever ``self.find_path`` returns for the relation's parse
    tree and the word lists of its two arguments.

    Args:
        rel: Relation exposing ``parse_tree``, ``get_arg1_tokens()`` and
            ``get_arg2_tokens()``.

    Returns:
        str: ``"ptp="`` followed by the path.
    """
    ptree = ParentedTree.convert(rel.parse_tree)
    arg1_words = self.get_words(rel.get_arg1_tokens())
    arg2_words = self.get_words(rel.get_arg2_tokens())
    path = self.find_path(ptree, arg1_words, arg2_words)
    return "ptp={0}".format(path)
def view(self, s): s = re.sub(r"set\(\[([\d, ]*)\]\)", r"{\g<1>}", s) print s #tree = ParentedTree.parse(s, node_pattern=r"\w*?\[.*?\]", parse_node=buildfeatstruct) tree = ParentedTree.parse(s, node_pattern=r"\w*?\[.*?\]", parse_node=FeatStruct) tree.draw()
def replace_terminals_with_indices(treestring):
    '''
    Replaces each terminal in the tree read from a string with its index
    in the sentence.

    Args:
        treestring: Bracketed parse tree.

    Returns:
        ParentedTree whose leaves are the strings "0", "1", ... in
        left-to-right order.
    '''
    tree = ParentedTree.fromstring(treestring)
    for idx in range(len(tree.leaves())):
        # Assign at the leaf's own position. Writing to child 0 of the
        # parent was only correct when every leaf is an only child.
        tree[tree.leaf_treeposition(idx)] = str(idx)
    return tree
def sentence_to_tree(sentence):
    """
    Turn a sentence into a flat tree of POS-tagged words.

    Args:
        sentence: text of a sentence

    Return:
        an ``S`` tree with one ``(POS word)`` child per token
    """
    assert isinstance(sentence, basestring)
    tagged = pos_tag(word_tokenize(sentence))
    children = [ParentedTree(pos, [word]) for word, pos in tagged]
    return ParentedTree('S', children)
def gerar_no(self, s):
    '''Build an NLTK ParentedTree from the received pair.

    ``s[1]`` is used as the node label and ``s[0]`` as its content, i.e. the
    tree is parsed from ``"(" + s[1] + " " + s[0] + ")"``.
    '''
    t_string = '(' + s[1] + ' ' + s[0] + ')'
    # The previous version also collected all subtrees into a list that was
    # never used; that dead work is gone.
    return ParentedTree.convert(Tree.fromstring(t_string))
def compute_gender(attributes):
    """ Compute the gender of a mention.

    Args:
        attributes (dict(str, object)): Attributes of the mention, must contain
            values for "type", "head", "head_index" and, if the mention is a
            pronoun, "citation_form". Name mentions are additionally read
            through "tokens" and "ner".

    Returns:
        str: the gender of the mention -- one of UNKNOWN, MALE, FEMALE,
            NEUTRAL and PLURAL.
    """
    gender = "NEUTRAL"
    head_index = attributes["head_index"]
    gender_data = external_data.GenderData.get_instance()

    if attributes["head"] != [] and type(attributes["head"][0]) == ParentedTree:
        # NOTE(review): the head is emptied before the loop reads it, so the
        # loop never runs and a tree-valued head simply becomes []. Kept
        # as-is; confirm the intended conversion before changing it.
        attributes["head"] = []
        for i in itertools.chain.from_iterable(attributes["head"]):
            attributes["head"].append(i.leaves())

    if compute_number(attributes) == "PLURAL":
        gender = "PLURAL"
    else:
        mention_type = attributes["type"]
        if mention_type == "PRO":
            citation = attributes["citation_form"]
            if citation == "he":
                gender = "MALE"
            elif citation == "she":
                gender = "FEMALE"
            elif citation == "it":
                gender = "NEUTRAL"
            elif citation in ["you", "we", "they"]:
                gender = "PLURAL"
        elif mention_type == "NAM":
            first_token = attributes["tokens"][0].lower()
            if re.match(r"^mr(\.)?$", first_token):
                gender = "MALE"
            elif re.match(r"^(miss|ms|mrs)(\.)?$", first_token):
                gender = "FEMALE"
            elif not re.match(r"(PERSON|NONE)", attributes["ner"][head_index]):
                gender = "NEUTRAL"
            elif gender_data.look_up(attributes):
                gender = gender_data.look_up(attributes)
        elif mention_type == "NOM":
            # Lexical lookups only make sense for a non-empty textual head.
            head_is_text = (attributes["head"] != []
                            and type(attributes["head"][0]) == type(u'qwe'))
            if head_is_text:
                if __wordnet_lookup_gender(" ".join(attributes["head"])):
                    gender = __wordnet_lookup_gender(" ".join(attributes["head"]))
                elif gender_data.look_up(attributes):
                    gender = gender_data.look_up(attributes)

    # A person for whom nothing more specific was found is UNKNOWN.
    if gender == "NEUTRAL" and compute_semantic_class(attributes) == "PERSON":
        gender = "UNKNOWN"

    return gender
def generate(sent, synt, tmpls, synpg_model, pg_model, args):
    """Generate one paraphrase of ``sent`` per template.

    Steps: turn the source parse into a tag sequence, let ``pg_model``
    generate a full parse per template, then let ``synpg_model`` generate a
    sentence for each generated parse.

    Args:
        sent: Source sentence (plain text, BPE-segmented here).
        synt: Bracketed parse of the source sentence.
        tmpls: Templates; its length fixes the batch size.
        synpg_model: Model exposing ``generate(sents, synts, max_len, temp=)``.
        pg_model: Model exposing ``generate(tags, tmpls, max_len, temp=)``.
        args: Needs ``max_sent_len``, ``max_synt_len`` and ``temp``.

    Returns:
        list: One de-BPE'd paraphrase per template.

    Notes:
        Reads ``dictionary`` and ``bpe`` from the enclosing module and moves
        tensors to the GPU with ``.cuda()``.
    """
    with torch.no_grad():
        # convert syntax to tag sequence
        # np.int64 instead of np.long, which NumPy 1.24 removed.
        tagss = np.zeros((len(tmpls), args.max_sent_len), dtype=np.int64)
        tags_ = getleaf(ParentedTree.fromstring(synt))
        tags_ = [
            dictionary.word2idx[f"<{w}>"] for w in tags_
            if f"<{w}>" in dictionary.word2idx
        ]
        tagss[:, :len(tags_)] = tags_[:args.max_sent_len]
        tagss = torch.from_numpy(tagss).cuda()

        # generate parses from tag sequence and templates
        parse_idxs = pg_model.generate(tagss, tmpls, args.max_synt_len, temp=args.temp)

        # add <sos> and remove tokens after <eos>
        synts = np.zeros((len(tmpls), args.max_synt_len + 2), dtype=np.int64)
        synts[:, 0] = 1
        for i in range(len(tmpls)):
            parse_idx = parse_idxs[i].cpu().numpy()
            eos_pos = np.where(parse_idx == dictionary.word2idx["<eos>"])[0]
            # Without an <eos> keep the whole sequence. This used to read
            # len(idx), an undefined name that raised NameError.
            eos_pos = eos_pos[0] + 1 if len(eos_pos) > 0 else len(parse_idx)
            synts[i, 1:eos_pos + 1] = parse_idx[:eos_pos]
        synts = torch.from_numpy(synts).cuda()

        # bpe segment and convert sentence to tensor
        sents = np.zeros((len(tmpls), args.max_sent_len), dtype=np.int64)
        sent_ = bpe.segment(sent).split()
        sent_ = [
            dictionary.word2idx[w] if w in dictionary.word2idx
            else dictionary.word2idx["<unk>"]
            for w in sent_
        ]
        sents[:, :len(sent_)] = sent_[:args.max_sent_len]
        sents = torch.from_numpy(sents).cuda()

        # generate paraphrases from sentence and generated parses
        output_idxs = synpg_model.generate(sents, synts, args.max_sent_len, temp=args.temp)
        output_idxs = output_idxs.cpu().numpy()

        paraphrases = [
            reverse_bpe(synt2str(output_idxs[i], dictionary).split())
            for i in range(len(tmpls))
        ]
        return paraphrases
def generate(model, data, loader, dictionary, bpe, args):
    """Generate outputs for a whole dataset and write them to text files.

    Three files are written into ``args.output_dir``: ``target_sents.txt``
    (``data[1]``), ``target_synts.txt`` (``data[3]``) and ``outputs.txt``
    (the de-BPE'd model output), one line per example.

    Args:
        model: Model exposing ``eval()`` and
            ``generate(sents, synts, max_len, sample=, temp=)``.
        data: Indexable collection; ``data[0]`` are source sentences,
            ``data[1]`` target sentences and ``data[3]`` target parses, each
            indexable by an array of indices.
        loader: Iterable of index tensors (one batch each).
        dictionary: Vocabulary exposing ``word2idx``.
        bpe: Object exposing ``segment(text)``.
        args: Needs ``output_dir``, ``max_sent_len``, ``max_synt_len``,
            ``sample`` and ``temp``.
    """
    model.eval()
    with open(os.path.join(args.output_dir, "target_sents.txt"), "w") as fp1, \
         open(os.path.join(args.output_dir, "target_synts.txt"), "w") as fp2, \
         open(os.path.join(args.output_dir, "outputs.txt"), "w") as fp3:
        with torch.no_grad():
            for data_idxs in tqdm(loader, total=len(loader)):
                data_idxs = data_idxs.numpy()

                sents_ = data[0][data_idxs]  # sents1
                targs_ = data[1][data_idxs]  # sents2
                synts_ = data[3][data_idxs]  # synts2
                batch_size = len(sents_)

                # np.int64 instead of np.long, which NumPy 1.24 removed.
                sents = np.zeros((batch_size, args.max_sent_len), dtype=np.int64)
                synts = np.zeros((batch_size, args.max_synt_len + 2), dtype=np.int64)

                for i in range(batch_size):
                    # bpe segment and map to vocabulary ids
                    sent_ = bpe.segment(sents_[i]).split()
                    sent_ = [
                        dictionary.word2idx[w] if w in dictionary.word2idx
                        else dictionary.word2idx["<unk>"]
                        for w in sent_
                    ]
                    sents[i, :len(sent_)] = sent_

                    # parse syntax and map its known tags to ids
                    synt_ = deleaf(ParentedTree.fromstring(synts_[i]))
                    synt_ = [
                        dictionary.word2idx[f"<{w}>"] for w in synt_
                        if f"<{w}>" in dictionary.word2idx
                    ]
                    synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]]
                    synts[i, :len(synt_)] = synt_

                sents = torch.from_numpy(sents).cuda()
                synts = torch.from_numpy(synts).cuda()

                idxs = model.generate(sents, synts, sents.size(1),
                                      sample=args.sample, temp=args.temp)

                for idx, targ, raw_synt in zip(idxs.cpu().numpy(), targs_, synts_):
                    fp1.write(targ + '\n')
                    fp2.write(raw_synt + '\n')
                    fp3.write(reverse_bpe(synt2str(idx, dictionary).split()) + '\n')
def evaluate(model, data, loader, criterion, dictionary, bpe, args):
    """Return the mean loss of ``model`` over all batches of ``loader``.

    Args:
        model: Model called as ``model(sents, synts, targs)``.
        data: Indexable collection; ``data[0]`` are sentences and ``data[1]``
            bracketed parses, each indexable by an array of indices.
        loader: Iterable of index tensors (one batch each); must support
            ``len()``.
        criterion: Loss called as ``criterion(outputs, targets)`` on
            flattened tensors.
        dictionary: Vocabulary exposing ``word2idx``.
        bpe: Object exposing ``segment(text)``.
        args: Needs ``max_sent_len`` and ``max_synt_len``.

    Returns:
        float: Sum of the batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` is empty.
    """
    model.eval()
    total_loss = 0.0
    max_it = len(loader)
    with torch.no_grad():
        for data_idxs in loader:
            data_idxs = np.sort(data_idxs.numpy())

            # get batch of raw sentences and raw syntax
            sents_ = data[0][data_idxs]
            synts_ = data[1][data_idxs]
            batch_size = len(sents_)

            # initialize tensors
            # np.int64 instead of np.long, which NumPy 1.24 removed.
            sents = np.zeros((batch_size, args.max_sent_len), dtype=np.int64)      # words without position
            synts = np.zeros((batch_size, args.max_synt_len + 2), dtype=np.int64)  # syntax
            targs = np.zeros((batch_size, args.max_sent_len + 2), dtype=np.int64)  # target output

            for i in range(batch_size):
                # bpe segment and convert to tensor
                sent_ = bpe.segment(sents_[i]).split()
                sent_ = [
                    dictionary.word2idx[w] if w in dictionary.word2idx
                    else dictionary.word2idx["<unk>"]
                    for w in sent_
                ]
                sents[i, :len(sent_)] = sent_

                # add <sos> and <eos> for target output
                targ_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]]
                targs[i, :len(targ_)] = targ_

                # parse syntax and convert to tensor
                synt_ = deleaf(ParentedTree.fromstring(synts_[i]))
                synt_ = [
                    dictionary.word2idx[f"<{w}>"] for w in synt_
                    if f"<{w}>" in dictionary.word2idx
                ]
                synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]]
                synts[i, :len(synt_)] = synt_

            sents = torch.from_numpy(sents).cuda()
            synts = torch.from_numpy(synts).cuda()
            targs = torch.from_numpy(targs).cuda()

            # forward
            outputs = model(sents, synts, targs)

            # calculate loss against the targets without their first column
            targs_ = targs[:, 1:].contiguous().view(-1)
            outputs_ = outputs.contiguous().view(-1, outputs.size(-1))
            loss = criterion(outputs_, targs_)

            total_loss += loss.item()

    return total_loss / max_it
def generate(epoch, eit, model, data, loader, dictionary, bpe, args, max_it=10):
    """Write sample generations for the first ``max_it`` batches to a file.

    The file ``sents_valid_epoch{epoch:02d}_it{eit:06d}.txt`` in
    ``args.output_dir`` receives, per example, the input syntax, the input
    sentence, the generated output and a ``--`` separator line.

    Args:
        epoch: Epoch number, used in the file name.
        eit: Iteration number, used in the file name.
        model: Model exposing ``eval()`` and
            ``generate(sents, synts, max_len, temp=)``.
        data: Indexable collection; ``data[0]`` are sentences and ``data[1]``
            bracketed parses, each indexable by an array of indices.
        loader: Iterable of index tensors (one batch each).
        dictionary: Vocabulary exposing ``word2idx``.
        bpe: Object exposing ``segment(text)``.
        args: Needs ``output_dir``, ``max_sent_len``, ``max_synt_len`` and
            ``temp``.
        max_it: Number of batches to process.
    """
    model.eval()
    out_path = os.path.join(
        args.output_dir,
        "sents_valid_epoch{:02d}_it{:06d}.txt".format(epoch, eit))
    with open(out_path, "w") as fp:
        with torch.no_grad():
            for it, data_idxs in enumerate(loader):
                if it >= max_it:
                    break

                data_idxs = np.sort(data_idxs.numpy())

                # get batch of raw sentences and raw syntax
                sents_ = data[0][data_idxs]
                synts_ = data[1][data_idxs]
                batch_size = len(sents_)

                # initialize tensors
                # np.int64 instead of np.long, which NumPy 1.24 removed.
                # The target tensor built by the evaluation code is not
                # needed for generation and is no longer created here.
                sents = np.zeros((batch_size, args.max_sent_len), dtype=np.int64)      # words without position
                synts = np.zeros((batch_size, args.max_synt_len + 2), dtype=np.int64)  # syntax

                for i in range(batch_size):
                    # bpe segment and convert to tensor
                    sent_ = bpe.segment(sents_[i]).split()
                    sent_ = [
                        dictionary.word2idx[w] if w in dictionary.word2idx
                        else dictionary.word2idx["<unk>"]
                        for w in sent_
                    ]
                    sents[i, :len(sent_)] = sent_

                    # parse syntax and convert to tensor
                    synt_ = deleaf(ParentedTree.fromstring(synts_[i]))
                    synt_ = [
                        dictionary.word2idx[f"<{w}>"] for w in synt_
                        if f"<{w}>" in dictionary.word2idx
                    ]
                    synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]]
                    synts[i, :len(synt_)] = synt_

                sents = torch.from_numpy(sents).cuda()
                synts = torch.from_numpy(synts).cuda()

                # generate
                idxs = model.generate(sents, synts, sents.size(1), temp=args.temp)

                # write output
                for sent, idx, synt in zip(sents.cpu().numpy(), idxs.cpu().numpy(), synts.cpu().numpy()):
                    fp.write(synt2str(synt[1:], dictionary) + '\n')
                    fp.write(sent2str(sent, dictionary) + '\n')
                    fp.write(synt2str(idx, dictionary) + '\n')
                    fp.write("--\n")
def traverse(graph, node):
    """Build the tree rooted at ``node`` from a node-indexed graph.

    Each entry of ``graph`` provides ``"tag"``, ``"children"``,
    ``"ellipsed_parents"``, ``"terminal"`` (``"yes"`` for leaves) and, for
    terminals, ``"text"``. A child is skipped when ``node`` appears among its
    ellipsed parents.

    Args:
        graph: Container indexable by integer node id.
        node: Id of the node to expand.

    Returns:
        ParentedTree: Tree labelled with the node's tag.
    """
    child_ids = [int(c) for c in graph[node]["children"]]
    subtrees = []
    for child in child_ids:
        entry = graph[child]
        elided_under = [int(p) for p in entry["ellipsed_parents"]]
        if node in elided_under:
            continue  # the child is not explicit under this parent
        if entry["terminal"] == "yes":
            subtree = ParentedTree(entry["tag"], [entry["text"]])
        else:
            subtree = traverse(graph, child)
        subtrees.append(subtree)
    return ParentedTree(graph[node]["tag"], subtrees)
def process_tree(tree_str, label):
    """Parse ``tree_str`` and extract a relation example from it.

    Args:
        tree_str: Object whose string form is a bracketed tree.
        label: Passed through to ``get_relation``.

    Returns:
        The result of ``get_relation(tree, label)``, or ``None`` when parsing
        or extraction raises ``ValueError``.
    """
    try:
        tree = ParentedTree.fromstring(str(tree_str))
        return get_relation(tree, label)
    except ValueError:
        # Unusable trees are skipped silently.
        return None
def get_features(ptree: nltk.ParentedTree, conn_idxs: List[int]):
    """Return one feature dict per token of ``ptree``.

    Args:
        ptree: Parse tree; its ``pos()`` pairs provide word and tag.
        conn_idxs: Token positions that belong to a connective.

    Returns:
        list[dict]: Dicts with ``BOS``, ``word``, ``pos``, ``lemma``,
        ``stem`` and ``conn`` (whether the position is in ``conn_idxs``).
    """
    return [
        {
            'BOS': position == 0,
            'word': word,
            'pos': tag,
            'lemma': lemmatizer.lemmatize(word),
            'stem': stemmer.stem(word),
            'conn': position in conn_idxs,
        }
        for position, (word, tag) in enumerate(ptree.pos())
    ]
def __init__(self, filename, postagged='./data/postagged-files', parsed='./data/parsed-files', dependency='./data/dep-files'):
    """Load the tagged, parsed and dependency views of one document.

    Args:
        filename: Base name of the document (without extension).
        postagged: Directory containing ``<filename>.tag``.
        parsed: Directory containing ``<filename>.parse``.
        dependency: Directory containing ``<filename>.parse.dep``.

    Raises:
        AssertionError: If the number of tagged sentences differs from the
            number of parsed sentences.
    """
    self.filename = filename
    postagged_file = os.path.join(postagged, filename+'.tag')
    parsed_file = os.path.join(parsed, filename+'.parse')
    dep_file = os.path.join(dependency, filename+'.parse.dep')

    # Context managers close the files; they used to be left open.
    with open(postagged_file) as handle:
        # One tagged sentence per non-blank line.
        self.tagged_sents = [x.strip() for x in handle if x.strip()]
    with open(parsed_file) as handle:
        # One bracketed parse per non-blank line.
        self.parsed_sents = [ParentedTree.fromstring(x) for x in handle if x.strip()]
    with open(dep_file) as handle:
        # Dependency trees are separated by blank lines.
        self.dep_sents = [DepTree.fromstring(x)
                          for x in handle.read().strip().split('\n\n')
                          if x.strip()]

    assert len(self.tagged_sents) == len(self.parsed_sents)
def parse2edus(parse):
    """
    Cut a constituency parse tree into EDUs.

    :param parse: constituency parse in bracket format,
        e.g. '( (IP (NP (PN 我)) (VP (VV 爱) (NP (NR 北京) (NR 天安门)))))'
    :return: whatever ``segmenter.cut_edu`` returns for the sentence
        (originally documented as a generator of structure.tree.EDU)
    """
    segmenter = get_pipeline().segmenter
    tree = ParentedTree.fromstring(parse)
    # Nodes of height 2 sit directly above a leaf; '-NONE-' ones are skipped.
    preterminals = tree.subtrees(lambda t: t.height() == 2 and t.label() != '-NONE-')
    text = ''.join(node[0] for node in preterminals)
    sentence = Sentence((0, len(text)), text, parse=tree)
    return segmenter.cut_edu(sentence)
def parse_parented_tree(tree_string):
    """ Construct a tree from a constituent parse tree string.

    The plain constructor is tried first; if it raises ``TypeError`` the
    string is parsed with ``ParentedTree.fromstring`` instead.

    Args:
        tree_string (str): A constituent parse tree in bracket notation

    Returns:
        nltk.ParentedTree: A parse tree corresponding to the parse tree string.
    """
    try:
        tree = ParentedTree(tree_string)
    except TypeError:
        tree = ParentedTree.fromstring(tree_string)
    return tree
def read_file(file_):
    """Read one bracketed parse tree per line from a UTF-8 file.

    Blank lines are skipped instead of being handed to the tree parser,
    which rejects empty input.

    Args:
        file_: Path of the file to read.

    Returns:
        list: One ``ParentedTree`` per non-blank line, in file order.
    """
    with open(file_, "r", encoding="utf-8") as f:
        return [ParentedTree.fromstring(line) for line in f if line.strip()]
def norm_negation(node):
    """Rewrite negation compounds below ``node`` into ``unop`` nodes, in place.

    A child labelled ``compound`` whose first child is a ``functor`` with
    value ``\\+`` or ``not`` is replaced by
    ``ParentedTree('unop', [Token('NOT', '\\+', <functor pos>), <first arg>])``.
    The function then recurses into every child (the replacement, if any).
    Nothing happens when ``node`` is not a ``Tree``.
    """
    if not isinstance(node, Tree):
        return
    for i, ni in enumerate(node):
        # is it a negation functor?
        if isinstance(ni, ParentedTree) and ni.label() == 'compound' and \
                ni[0].label() == 'functor' and ni[0][0].val in ['\\+','not']:
            # take first argument
            first = ni[1][0]
            if isinstance(first, ParentedTree):
                # Clear the parent link directly so the argument can be
                # placed under the new tree.
                first._parent = None
            # create a new tree; the right-hand side is evaluated first, so
            # ni[0][0].pos still refers to the old compound node
            ni = node[i] = ParentedTree(
                'unop', [Token('NOT', '\\+', ni[0][0].pos), first])
        norm_negation(ni)
def template2tensor(templates, max_tmpl_len, dictionary):
    """Convert bracketed templates into a padded id tensor on the GPU.

    Each template is parsed, reduced in place with ``tree2tmpl(tree, 1, 2)``,
    split into bracket and label tokens, mapped to the ids of the ``<token>``
    entries known to the dictionary and wrapped in ``<sos>`` / ``<eos>``.

    Args:
        templates: Sequence of bracketed template strings.
        max_tmpl_len: Maximum template length; rows hold two extra slots.
        dictionary: Vocabulary exposing ``word2idx``.

    Returns:
        torch.Tensor: Integer tensor with ``len(templates)`` rows and
        ``max_tmpl_len + 2`` columns, zero padded, moved with ``.cuda()``.
    """
    # np.int64 instead of np.long, which NumPy 1.24 removed.
    tmpls = np.zeros((len(templates), max_tmpl_len + 2), dtype=np.int64)
    for i, tp in enumerate(templates):
        tmpl_ = ParentedTree.fromstring(tp)
        tree2tmpl(tmpl_, 1, 2)
        # Put a space after "(" and before ")" so brackets become tokens.
        tmpl_ = str(tmpl_).replace(")", " )").replace("(", "( ").split(" ")
        tmpl_ = [
            dictionary.word2idx[f"<{w}>"] for w in tmpl_
            if f"<{w}>" in dictionary.word2idx
        ]
        tmpl_ = [dictionary.word2idx["<sos>"]] + tmpl_ + [dictionary.word2idx["<eos>"]]
        tmpls[i, :len(tmpl_)] = tmpl_

    tmpls = torch.from_numpy(tmpls).cuda()
    return tmpls
def get_features(relation: Relation, ptree: nltk.ParentedTree):
    """Compute connective features for one relation.

    Args:
        relation: Relation whose ``conn.tokens`` expose ``surface`` and
            ``local_idx``.
        ptree: Parse tree of the sentence containing the connective.

    Returns:
        dict: ``Connective`` (surface string), ``ConnectivePOS`` (label of
        the node found via ``lca``), ``ConnectivePrev`` and
        ``connectivePosition``.
    """
    conn_tokens = relation.conn.tokens
    conn_raw = ' '.join(t.surface for t in conn_tokens)
    conn_idxs = [t.local_idx for t in conn_tokens]
    conn_tag = ptree[lca(ptree, conn_idxs)].label()

    first_idx = conn_idxs[0]
    # "NONE" marks a connective at the very start of the sentence.
    # NOTE(review): element 0 of the preceding leaf is taken; if leaves are
    # plain strings this is their first character -- confirm the leaf type.
    prev = "NONE" if first_idx == 0 else ptree.leaves()[first_idx - 1][0]
    prev = lemmatizer.lemmatize(prev)

    conn_pos_relative = get_connective_sentence_position(conn_idxs, ptree)
    return {
        'Connective': conn_raw,
        'ConnectivePOS': conn_tag,
        'ConnectivePrev': prev,
        'connectivePosition': conn_pos_relative,
    }
def terms_inference(sentences, terms_trie):
    """
    Mark term occurrences in tokenized, tagged sentences.

    Each sentence is scanned left to right; wherever
    ``_longest_matching_term`` reports a match, the covered tokens are
    grouped under a ``TERM`` node, otherwise the token is attached directly
    to the sentence node.

    Args:
        sentences: shallow-parsed text
        terms_trie: trie of terms

    Return:
        list of shallow parse trees with inferred terms, and a dictionary
        mapping each term to the list of ``TERM`` nodes created for it
    """
    parsed_sentences = []
    terms_positions = defaultdict(list)
    for sentence in sentences:
        sentence_tree = ParentedTree('S', [])
        cursor = 0
        while cursor < len(sentence):
            term_label, term_length = _longest_matching_term(
                sentence, cursor, terms_trie)
            if not term_length > 0:
                # there is no term starting from the current position
                _append_word_token(sentence_tree, sentence[cursor])
                cursor += 1
                continue
            # term found: wrap its tokens in a TERM node
            term_node = ParentedTree('TERM', [])
            term = name_to_term(term_label)
            term_node.term = term
            terms_positions[term].append(term_node)
            for token in sentence[cursor:cursor + term_length]:
                _append_word_token(term_node, token)
            sentence_tree.append(term_node)
            cursor += term_length
        parsed_sentences.append(sentence_tree)
    return parsed_sentences, terms_positions
# Build one small tree per single-character word and append it to NewForest,
# recording its position in Word2treeID.
# NOTE(review): ParentedTree(<string>) looks like the pre-3.0 NLTK
# constructor -- confirm the NLTK version in use.
for single_char_word in SingleCharWord:
    if single_char_word in UpdatedVec:
        tag_set=UpdatedVec[single_char_word]
    else:
        # A word without an updated tag set aborts the loop; the value
        # assigned here is not used afterwards because of the break.
        tag_set=Word2Tag[single_char_word]
        print('Fail!')
        break
    tag_str=set2str(tag_set)
    # Reverted on Oct. 5 to the old version that discards the extra unary
    # rule: a single "<tags>_u" node directly above the word.
    tree_str='( '+tag_str+'_u '+single_char_word+' )'
    # Variant from Oct. 4, kept for reference:
    #tree_str=' ( '+tag_str+'_u '+' ( '+tag_str+'_b '+single_char_word+' ) ) '
    tree=ParentedTree(tree_str)
    # The new tree's id is its index in NewForest.
    index=len(NewForest)
    NewForest.append(tree)
    Word2treeID[single_char_word]=index
print('done! Such trees have been appended to NewForest, and word2treeId mapping has been stored in Word2treeID hashtable.')
#--------------------------->>> The following is the part that differs from 4_mini_tree_seq_gen.py <<<--------------
#
def test_read_normal_tree(self):
    """Read a plain bracketed tree through ``LeafReader`` and print it.

    This only prints the single-line rendering; it asserts nothing.
    """
    leafreader = LeafReader('I have a book.')
    tree = ParentedTree.fromstring('(S1 (S (S (NP (PRP I)) (VP (VBP have) (NP (DT a) (NN book)))) (. .)))', read_leaf=leafreader.read_leaf)
    # In NLTK 3 pprint() prints and returns None, so printing its result
    # added a stray "None". Use pformat() when it exists.
    render = getattr(tree, "pformat", tree.pprint)
    print(render(margin=float("inf")))
def test_read_indexed_tree(self):
    """Read a tree with ``word|start|end`` leaves through ``IndexedLeafReader``.

    This only prints the single-line rendering; it asserts nothing.
    """
    leafreader = IndexedLeafReader()
    tree = ParentedTree.fromstring('(S1 (S (S (NP (PRP I|0|1)) (VP (VBP have|2|6) (NP (DT a|3|4) (NN book|9|13)))) (. .|13|14)))', read_leaf=leafreader.read_leaf)
    # In NLTK 3 pprint() prints and returns None, so printing its result
    # added a stray "None". Use pformat() when it exists.
    render = getattr(tree, "pformat", tree.pprint)
    print(render(margin=float("inf")))
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Print every parse of a file on a single line.
# Usage: <script> INPUTFILE, where parses are separated by blank lines.
import sys

from nltk import ParentedTree

inputfile = sys.argv[1]
# Read through a context manager so the file handle is closed.
with open(inputfile, "r") as handle:
    parses = handle.read().split("\n\n")

for parse in parses:
    # NOTE(review): ParentedTree(<string>) looks like the pre-3.0 NLTK
    # constructor -- confirm the NLTK version in use.
    t = ParentedTree(parse)
    print(t._pprint_flat(nodesep="", parens="()", quotes=False))
#
# 1st pass processing to
#   1. update the tree node label to set2str({possible tags associated with
#      the leaves/string}), i.e. the concatenation of the sorted possible
#      tags for the string
#   2. update the InduceLeftNode and InduceRightNode hashtables (not part of
#      the code visible here).
#
count=0
print('\n>>1st pass of process the trees')
# Report progress roughly every 10%. The step is at least 1 so that forests
# with fewer than 10 trees no longer raise ZeroDivisionError.
progress_step=max(1, len(Forest)//10)
for tree in Forest:
    count +=1
    if count%progress_step==0:
        print('progress------->',str(count/len(Forest)*100)[:2], '% finished')
    # Work on a copy rebuilt from the tree's string form.
    new_tree=ParentedTree(tree.pprint())
    for subtree in new_tree.subtrees():
        # update current tree
        string=''.join(subtree.leaves())
        if string in Vec:  # leaves/string in the record
            tag, subscript= decompose_tag(subtree.node)
            # get the tag-set of the node according to the leaves and
            # convert it to str
            tag_vec_str=set2str(Vec[string])
            # update the node with the new tag, keeping the old subscript
            subtree.node=tag_vec_str+'_'+subscript
dest="parse", default="data/projectsyndicate/projectsyndicate.truecased.de.20.parse", help="german parses parse persentence") opt.add_option("-m", dest="map", default="data/projectsyndicate/de-negra.map") opt.add_option("-s", dest="word2unipos", default="data/projectsyndicate/de.pos") (options, _) = opt.parse_args() word2pos = {} f = codecs.open(options.parse, 'r', 'utf-8') posmap = {} for line in codecs.open(options.map, 'r', 'utf-8').readlines(): de_pos, uni_pos = line.split() posmap[de_pos] = uni_pos with f: for line in f: t = ParentedTree.fromstring(line.strip()) for pos_token in t.subtrees(lambda t: t.height() == 2): pos_token = str(pos_token)[1:-1] (pos, token) = pos_token.split() token = token.encode('utf-8') s = word2pos.get(token, set()) s.add(posmap[pos]) word2pos[token] = s f.close() w = codecs.open(options.word2unipos, 'w', 'utf-8') for token, set_pos in word2pos.items(): print token, ':', ','.join(set_pos) w.write(token.encode('utf-8') + '\t' + ','.join(set_pos) + '\n') w.flush() w.close()
def find_pronouns(tree):
    """Collect the pronouns found among the leaves of ``tree``.

    Returns a list of ``(lowercased pronoun, None)`` pairs for every string
    child whose lowercase form is in ``PRONOUNS``, searching nested
    ``ParentedTree`` children recursively.

    NOTE(review): ``unicode`` exists only in Python 2.
    """
    pronouns = []
    for child in tree:
        if type(child) in [unicode, str] and child.lower() in PRONOUNS:
            pronouns.append((child.lower(), None))
        if isinstance(child, ParentedTree):
            pronouns = pronouns + find_pronouns(child)
    return pronouns

# Per-file pronoun statistics over the treebank. `stats` and `files` are
# defined outside this fragment.
# NOTE(review): the nesting below was reconstructed from a
# whitespace-collapsed source; confirm it against the original file.
total = 0
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
    # Share of gendered pronouns; this divides by zero when the file
    # contains no pronoun at all.
    stats['pct_gendered'] = stats['gendered']/float(stats['total'])
    print file, total
    files.append(stats.copy())
    # Reset every counter (and the name) to 0 for the next file.
    stats = dict.fromkeys(stats, 0)
# Build one "<tags>_u" tree per single-character word and append it to
# NewForest, recording its position in Word2treeID.
# NOTE(review): ParentedTree(<string>) looks like the pre-3.0 NLTK
# constructor -- confirm the NLTK version in use.
for single_char_word in SingleCharWord:
    if single_char_word in UpdatedVec:
        tag_set=UpdatedVec[single_char_word]
    else:
        # A word without an updated tag set aborts the loop.
        #tag_set=Word2Tag[single_char_word]
        print('Fail!')
        break
    tag_str=set2str(tag_set)
    tree_str='( '+tag_str+'_u '+single_char_word+' )'
    tree=ParentedTree(tree_str)
    # The new tree's id is its index in NewForest.
    index=len(NewForest)
    NewForest.append(tree)
    Word2treeID[single_char_word]=index
print('done! Such trees have been appended to NewForest, and word2treeId mapping has been stored in Word2treeID hashtable.')
# Build one two-level tree per single-character word ("<tags>_l" above
# "<tags>_b" above the word) and append it to NewForest, recording its
# position in Word2treeID.
# NOTE(review): ParentedTree(<string>) looks like the pre-3.0 NLTK
# constructor -- confirm the NLTK version in use.
for single_char_word in SingleCharWord:
    if single_char_word in UpdatedVec:
        tag_set=UpdatedVec[single_char_word]
    else:
        # A word without an updated tag set aborts the loop; the value
        # assigned here is not used afterwards because of the break.
        tag_set=Word2Tag[single_char_word]
        print('Fail!')
        break
    tag_str=set2str(tag_set)
    # Old version that discarded the extra unary rule (Oct. 5):
    #tree_str='( '+tag_str+'_b '+single_char_word+' )'
    # Changed on Oct. 7: only the l/b tags are used, the 'u' tag is dropped.
    tree_str=' ( '+tag_str+'_l '+' ( '+tag_str+'_b '+single_char_word+' ) ) '
    tree=ParentedTree(tree_str)
    # The new tree's id is its index in NewForest.
    index=len(NewForest)
    NewForest.append(tree)
    Word2treeID[single_char_word]=index
print('done! Such trees have been appended to NewForest, and word2treeId mapping has been stored in Word2treeID hashtable.')
#--------------------------->>> The following is the part that differs from 4_mini_tree_seq_gen.py <<<--------------
#
def getHead(syntac_sen):
    """Return the head word of the parse held in ``syntac_sen.text``.

    The parse string is loaded into a ``ParentedTree`` and the search starts
    at its first child.  The tree is then descended one node per iteration
    until the current node has height 2, and the first leaf of that node is
    returned.

    Each step picks the next node via ``head_trace_rule`` (keyed by the node
    label) and the ``LookByL`` / ``LookByR`` / ``LookByLBP`` / ``LookByRBP``
    helpers, with three special cases marked "non-trivial rules" below.

    Raises ``KeyError`` when a node label is missing from
    ``head_trace_rule``.
    """
    t = ParentedTree(syntac_sen.text)
    target = t[0]
    # Keep descending until ``target`` has height 2 (in NLTK, a node whose
    # children are leaves).
    while target.height() != 2:
        ### non-trivial rules: no.1
        # For an SBARQ node, step into the first WH-phrase child that has more
        # than one child.  ``flag`` records that this shortcut was taken;
        # ``parent`` remembers the node this iteration started from.
        flag = 0
        parent = target
        if target.node == "SBARQ":
            for ts in target:
                if ts.node in ["WHNP", "WHPP", "WHADJP", "WHADVP"] and len(ts) > 1:
                    target = ts
                    flag = 1
                    break
        ###
        # Regular case: try the rules registered for this label in order.  The
        # first element of a rule selects the lookup helper, the remaining
        # elements are passed to it; "" means "nothing found".
        if not flag:
            rules = head_trace_rule[target.node]
            # NOTE(review): a disabled variant used
            # head_trace_rule.get(target.node, []) to tolerate unknown labels.
            #rules = head_trace_rule.get(target.node, [])
            # NOTE(review): newTarget is never initialised, so an empty rule
            # list (or a first rule with an unknown direction) would leave it
            # unbound or stale below -- confirm rules are always non-empty.
            for rule in rules:
                if rule[0] == "L":
                    newTarget = LookByL(target, rule[1:])
                elif rule[0] == "R":
                    newTarget = LookByR(target, rule[1:])
                elif rule[0] == "LBP":
                    newTarget = LookByLBP(target, rule[1:])
                elif rule[0] == "RBP":
                    newTarget = LookByRBP(target, rule[1:])
                if newTarget != "":
                    break
            # No rule matched: fall back to the first child.
            if newTarget == "":
                target = target[0]
            else:
                target = newTarget
        #print target
        #print target.height()
        ### non-trivial rules: no.2:
        # After the SBARQ shortcut: if the leaf/POS string from getLeafPOS
        # contains "NN_<i> POS_" or "NNS_<i> POS_", replace the target with a
        # fresh two-level tree built from that tag and leaf number <i>.
        # (presumably a noun followed by a possessive marker -- verify
        # against getLeafPOS's output format)
        if flag:
            leafPos = getLeafPOS(target)
            m = re.search(r'(NN|NNS)_(\d+) POS_', leafPos)
            if m != None:
                lvs = target.leaves()
                print m.groups()
                target = ParentedTree("("+m.group(1)+" "+lvs[int(m.group(2))]+")")
        ### non-trivial rules: no.3
        # If the head found so far is one of the listed generic nouns, move
        # to a PP instead: first the first PP child of ``parent``, then the
        # right sibling of ``parent``.
        if target.height() == 2 and target.leaves()[0] in ["name", "kind", "type", "genre", "group", "part"]:
            print parent
            for k in parent:
                if k.node == "PP":
                    target = k
                    break
            pr = parent.right_sibling()
            # NOTE(review): this loop iterates the children of ``pr`` but
            # tests ``pr.node`` itself, and ``pr`` is presumably None when
            # ``parent`` has no right sibling (iterating it would then raise
            # TypeError) -- confirm the intended check.
            for p in pr:
                if pr.node == "PP":
                    target = pr
                    break
    return target.leaves()[0]
def match(self, tree):
    """Store *tree* as this object's query tree.

    The tree is serialised with ``str()`` and parsed back with
    ``ParentedTree.fromstring``, so ``self._query_tree`` is a newly built
    tree rather than the object passed in.

    Raises ``AttributeError`` when *tree* is not a ``ParentedTree``.
    """
    if isinstance(tree, ParentedTree):
        bracketed = str(tree)
        self._query_tree = ParentedTree.fromstring(bracketed)
        return
    raise AttributeError
def _emit(*items):
    """Print *items* separated by single spaces, like Python 2 ``print a, b``."""
    print(" ".join("%s" % (item,) for item in items))


def main():
    """Score the proposed pronoun antecedents against ``coref_key.txt``.

    For every file in ``FILENAMES`` each parsed sentence is converted to a
    ``ParentedTree``; for every pronoun returned by ``find_pronouns`` the
    antecedent proposed by ``hobbs`` (rendered with ``hobbs_to_string``) is
    compared with the next line of the key file.  Running, per-file and
    per-pronoun-class results are printed; nothing is returned.

    The sentences already seen in the current file are passed to ``hobbs``
    as context and are cleared at the start of each file.
    """
    correct = 0
    total = 0
    prev_sentences = deque()
    # The context manager closes the key file even if scoring raises.
    with open('coref_key.txt', 'r') as answers:
        for fileid in FILENAMES:
            this_correct = 0
            this_total = 0
            prev_sentences.clear()
            for tree in treebank.parsed_sents(fileid):
                tree = ParentedTree.convert(tree)
                for pronoun, np_node in find_pronouns(tree):
                    proposed = hobbs_to_string(hobbs(np_node, pronoun.lower(), prev_sentences))
                    tree.pretty_print()
                    actual = answers.readline()
                    # Strip only the line terminator: slicing off the last
                    # character would corrupt a final key line that has no
                    # trailing newline.
                    if proposed == actual.rstrip('\n'):
                        update_pronoun_results(pronoun, 1)
                        correct += 1
                        this_correct += 1
                    update_pronoun_results(pronoun, 0)
                    total += 1
                    this_total += 1
                    print("Pronoun: '" + pronoun + "' Proposed: '" + proposed + "' Actual: '" + actual + "'")
                    if total:
                        _emit("Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct / float(total), "\n")
                    print("*" * 100)
                    print("*" * 100)
                prev_sentences.append(tree)
            print("-" * 50)
            # this_correct > 0 implies this_total > 0, so the division is safe.
            if this_correct:
                _emit(fileid, ":\tCorrect:", this_correct, "\tTotal:", this_total, "\tPercentage:", this_correct / float(this_total), "\n")
            if total:
                _emit("Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct / float(total), "\n")
            print("-" * 50)
    _emit("Male correct:", PRONOUN_RESULTS['male'], "\tMale total:", PRONOUN_RESULTS['male_total'], "\tPercent correct:", PRONOUN_RESULTS['male_pct'])
    _emit("Female correct:", PRONOUN_RESULTS['female'], "\tFemale total:", PRONOUN_RESULTS['female_total'], "\tPercent correct:", PRONOUN_RESULTS['female_pct'])
    _emit("Neutral correct:", PRONOUN_RESULTS['neutral'], "\tNeutral total:", PRONOUN_RESULTS['neutral_total'], "\tPercent correct:", PRONOUN_RESULTS['neutral_pct'])
    _emit("Plural correct:", PRONOUN_RESULTS['they'], "\tPlural total:", PRONOUN_RESULTS['they_total'], "\tPercent correct:", PRONOUN_RESULTS['they_pct'])
    _emit("Reflexive correct:", PRONOUN_RESULTS['reflexive'], "\tReflexive total:", PRONOUN_RESULTS['reflexive_total'], "\tPercent correct:", PRONOUN_RESULTS['reflexive_pct'])
    # With no pronoun scored at all, report 0.0 rather than dividing by zero.
    overall_pct = correct / float(total) if total else 0.0
    _emit("Total correct:", correct, "\tTotal:", total, "\tPercent correct:", overall_pct)