def rnnFromTree(tree, vocabulary, wordReduction=False, grammarBased=False):
    """Recursively convert a parse tree into a network of Node/Leaf objects.

    Args:
        tree: parse tree exposing ``height()``, ``label()``, ``leaves()`` and
            iteration over its children (presumably an nltk Tree -- confirm).
        vocabulary: sequence of known words; a word's position is its index.
            Words that are not found map to index 0.
        wordReduction: when true, every leaf is wrapped in an extra ``Node``
            that reduces high-dimensional words to the dimensionality of the
            inner representations.
        grammarBased: when true, nodes are named after the grammar rule
            (``LHS -> RHS``) or the preterminal tag instead of the generic
            ``'composition'`` / ``'preterminal'`` categories.

    Returns:
        A ``Node`` for trees higher than a preterminal. For a preterminal, a
        ``Leaf``, or a ``Node`` wrapping that leaf when ``wordReduction`` is
        set.
    """
    if tree.height() > 2:
        if grammarBased:
            cat = tree.label() + ' -> ' + ' '.join(
                [child.label() for child in tree])
        else:
            cat = 'composition'
        # grammarBased has to be forwarded as well; otherwise only the root
        # would get a rule-based category and every descendant would silently
        # fall back to the generic one.
        children = [
            rnnFromTree(child, vocabulary, wordReduction, grammarBased)
            for child in tree
        ]
        return Node(children, cat, 'tanh')

    # Preterminal node.
    words = tree.leaves()
    if len(words) == 1:
        word = words[0]
    else:
        # Report the malformed preterminal and fall back to a placeholder so
        # that ``word`` is always bound.
        print('Not exactly one leaf?! %s' % (tree,))
        word = 'UNK'

    try:
        index = vocabulary.index(word)
    except ValueError:
        # Out-of-vocabulary word.
        index = 0
    leaf = Leaf('word', index, word)

    if not wordReduction:
        return leaf

    # wordReduction adds an extra layer on top of the leaf.
    cat = tree.label() if grammarBased else 'preterminal'
    return Node([leaf], cat, 'tanh')
def compute_gtscore(tree, model):
    """Score a binary tree bottom-up with ``model`` and accumulate an NLL loss.

    Args:
        tree: node exposing ``label()``, ``height()`` and indexing; anything
            without a ``label`` method is rejected.
        model: callable taking the two child vectors and returning
            ``(score, parent_vector, logprob)``.

    Returns:
        ``(score, vector, loss)`` for a tree, or ``None`` when ``tree`` has no
        ``label`` method or when anything fails while combining the children.
    """
    try:
        tree.label()
    except AttributeError:
        # Not a tree node at all.
        return None

    if tree.height() <= 2:
        # Leaf / preterminal: the vector is the embedding of the first child;
        # score and loss both start at zero.
        embedding = torch.Tensor(get_embed(tree[0]))
        return torch.Tensor([0]), embedding, torch.Tensor([0])

    try:
        left_score, left_vec, left_loss = compute_gtscore(tree[0], model)
        right_score, right_vec, right_loss = compute_gtscore(tree[1], model)
        score, parent_vec, logprob = model(left_vec, right_vec)

        # The node label is used as the gold class index for the NLL loss.
        target = torch.Tensor([tree.label()]).long()
        node_loss = F.nll_loss(logprob.unsqueeze(dim=0), target)

        total_score = score + right_score + left_score
        total_loss = node_loss + left_loss + right_loss
        return total_score, parent_vec, total_loss
    except:
        # Best effort: any failure below this node (including a child that
        # returned None) makes the whole subtree yield None.
        return None
def search(tree_in):
    """Collect (verb, noun) pairs from the verb phrases of a parse tree.

    Starting at ``tree_in`` the walk descends through ``ROOT`` and ``IP``
    (clause) nodes using an explicit stack. For every ``VP`` (verb phrase)
    with at least two children whose first child is tagged ``VV``, each later
    sibling contributes a pair: an ``NP`` sibling yields its noun chunk, any
    other sibling falls back to ``get_vv_loss_np`` on the whole VP, with
    duplicates of already recorded nouns skipped.

    Returns:
        The list of ``(verb, noun)`` pairs, or ``False`` when the input is not
        an nltk tree or no pair was found.
    """
    if not isinstance(tree_in, nltk.tree.Tree):
        return False

    pairs = []
    pending = [tree_in]
    while pending:
        node = pending.pop()
        if not isinstance(node, nltk.tree.Tree):
            continue

        label = node.label()
        if label in ("ROOT", "IP"):
            # Keep descending through the sentence root and clauses.
            pending.extend(node)
            continue
        if label != "VP" or len(node) < 2:
            continue

        seen_nouns = []
        head = node[0]
        for sibling in node[1:]:
            if head.label() != 'VV':
                continue
            if sibling.label() == "NP":
                verb = ''.join(head.leaves())
                noun = get_noun_chunk(sibling)
                if verb and noun:
                    pairs.append((verb, noun))
                    seen_nouns.append(noun)
            else:
                noun = get_vv_loss_np(node)
                verb = ''.join(head.leaves())
                if verb and noun and noun not in seen_nouns:
                    seen_nouns.append(noun)
                    pairs.append((verb, noun))

    return pairs or False
def search(tree_in):
    """Traverse a parse tree and extract (verb, noun-phrase) pairs.

    Only ``ROOT`` (the sentence being processed) and ``IP`` (simple clause)
    nodes are expanded. Each ``VP`` (verb phrase) that has two or more
    children and starts with a ``VV`` child is inspected: the verb is the
    concatenation of the ``VV`` leaves, and every following sibling supplies a
    noun, either its own noun chunk (``NP`` siblings) or the result of
    ``get_vv_loss_np`` on the VP (all other siblings, de-duplicated).

    Returns:
        A non-empty list of ``(verb, noun)`` tuples, otherwise ``False``.
    """
    if not isinstance(tree_in, nltk.tree.Tree):
        return False

    found = []
    todo = [tree_in]  # depth-first work list, seeded with the whole tree
    while todo:
        current = todo.pop()
        if not isinstance(current, nltk.tree.Tree):
            continue

        tag = current.label()
        if tag == "ROOT" or tag == "IP":
            for child in current:
                todo.append(child)
        elif tag == "VP" and len(current) >= 2:
            recorded = []
            verb_node = current[0]
            for position in range(1, len(current)):
                if verb_node.label() != 'VV':
                    continue
                candidate = current[position]
                if candidate.label() == "NP":
                    # Verb followed by a noun phrase.
                    verb = ''.join(verb_node.leaves())  # leaves are the tokens
                    noun = get_noun_chunk(candidate)
                    if verb and noun:
                        found.append((verb, noun))
                        recorded.append(noun)
                else:
                    noun = get_vv_loss_np(current)
                    verb = ''.join(verb_node.leaves())
                    if verb and noun and noun not in recorded:
                        recorded.append(noun)
                        found.append((verb, noun))

    if found:
        return found
    return False
def process_expression(self, expr_tree):
    """Evaluate an expression subtree.

    Walks every subtree of ``expr_tree``:

    * Inside an ``expression_t`` node the nested subtrees are scanned until an
      operator node (one of ``oplist``) is met. An ``id``/``num`` node without
      a right sibling is treated as a single operand: a digit label below it
      is returned as ``int``, any other label is resolved via
      ``self.lookup``.
    * For an operator node, a flattened string form of ``t`` is handed to
      ``self.convertsymbols`` and the result is remembered in ``val``.

    Returns:
        The operand value, the last value produced by ``convertsymbols``, or
        ``None`` if neither case applied.

    NOTE(review): the original indentation of this block was lost; the
    nesting below (``elif`` paired with the ``expression_t`` test, final
    ``return`` at function level) is the most plausible reading -- confirm
    against the original file.
    """
    val = None
    oplist = ['add_expr', 'mul_expr', 'sub_expr', 'div_expr']
    for tree in expr_tree.subtrees():
        if (tree.label() == "expression_t"):
            for t in tree.subtrees():
                if (t.label() in oplist):
                    # Stop at the first operator; ``t`` keeps pointing at it
                    # after the loop.
                    break
                if (t.label() != "expression_t" and
                    ((t.label() == "id" or t.label() == "num")
                     and t.right_sibling() is None)):
                    for l in t.subtrees():
                        if (l.label().isdigit() and
                            (l.label() != "id" and l.label() != "num")):
                            # Numeric literal.
                            return int(l.label())
                        elif ((l.label() != "id" and l.label() != "num")):
                            # Anything else is looked up by name.
                            return (self.lookup(l.label()))
        elif (tree.label() in oplist):
            # NOTE(review): ``t`` is the loop variable leaked from the inner
            # loop of an earlier ``expression_t`` iteration, not ``tree``; it
            # is unbound if no ``expression_t`` node was visited first.
            newstr = str(t)
            # Drop spaces and ')' and turn every '(' into a single space, then
            # remove newlines, before handing the string on.
            nstr = (((newstr.replace(' ', '')).replace(')', '')).replace(
                '(', ' ')).replace('\n', '')
            val = self.convertsymbols(nstr)
    return val
def find_subject(tree):
    """Return the first child of an ``S`` node when it looks like a subject.

    A ``TOP`` node is unwrapped by recursing into the last child of a copy of
    the tree. For an ``S`` node the first child is returned when its label
    contains ``'NP'`` and the second child's label contains ``'VP'``.

    Returns:
        The matching first child, ``0`` when indexing the children raised
        ``IndexError``, and ``None`` in every other case (including non-tree
        input).
    """
    if not isinstance(tree, nltk.Tree):
        return None

    label = tree.label()
    if label == 'TOP':
        return find_subject(tree.copy().pop())
    if label != 'S':
        return None

    try:
        is_np_vp = 'NP' in tree[0].label() and 'VP' in tree[1].label()
    except IndexError:
        return 0
    return tree[0] if is_np_vp else None
def find_subject(tree):
    """Locate the subject constituent of a sentence tree.

    ``TOP`` wrappers are peeled off (the last child of a copy is examined
    instead). On an ``S`` node, the first child is the result if its label
    mentions ``NP`` and the label of the second child mentions ``VP``.

    Returns:
        The first child of the ``S`` node on a match; ``0`` if the ``S`` node
        has too few children for the test (``IndexError``); otherwise ``None``.
    """
    if isinstance(tree, nltk.Tree):
        node_label = tree.label()
        if node_label == 'TOP':
            inner = tree.copy().pop()
            return find_subject(inner)
        if node_label == 'S':
            try:
                if 'NP' not in tree[0].label():
                    return None
                if 'VP' not in tree[1].label():
                    return None
                return tree[0]
            except IndexError:
                return 0
    return None
def iornnFromTree(tree, vocabulary, grammarBased=False):
    """Recursively convert a parse tree into ``IORNN`` nodes and leaves.

    Args:
        tree: parse tree exposing ``height()``, ``label()``, ``leaves()`` and
            iteration over its children (presumably an nltk Tree -- confirm).
        vocabulary: sequence of known (lower-cased) words; a word's position
            is its index. Words that are not found map to index 0.
        grammarBased: when true, inner nodes are named after the grammar rule
            (``LHS -> RHS``) instead of the generic ``'composition'``.

    Returns:
        An ``IORNN.Node`` for trees higher than a preterminal, otherwise an
        ``IORNN.Leaf`` for the single (lower-cased) word.
    """
    if tree.height() > 2:
        if grammarBased:
            cat = tree.label() + ' -> ' + ' '.join(
                [child.label() for child in tree])
        else:
            cat = 'composition'
        children = [
            iornnFromTree(child, vocabulary, grammarBased) for child in tree
        ]
        return IORNN.Node(children, cat, 'tanh', 'tanh')

    # Preterminal node.
    words = tree.leaves()
    if len(words) == 1:
        word = words[0].lower()
    else:
        # %-formatting prints the same text under Python 2 and Python 3,
        # unlike the former ``print`` statement.
        print('Not exactly one leaf?! %s' % (tree,))
        word = 'UNK'

    try:
        index = vocabulary.index(word)
    except ValueError:
        # Out-of-vocabulary word.
        index = 0
    return IORNN.Leaf('word', index, 'tanh', word)
def orderSentenceRec(tree, sentence, printThings, linearized):
    """Recursively turn an annotated constituency tree into nested dicts.

    Args:
        tree: tree node that, besides the usual tree API, carries ``start``,
            ``end``, ``incoming`` and ``outgoing`` attributes (presumably
            attached by the caller -- confirm).
        sentence: indexable collection of token dicts with at least the keys
            ``"word"`` and ``"dep"``.
        printThings: only forwarded to the recursive calls.
        linearized: only forwarded to the recursive calls.

    Returns:
        For a preterminal, a dict with ``word``/``category``/``children``/
        ``dependency`` or ``None`` when the token is skipped; for an inner
        node, a dict holding the converted children or ``None`` when no child
        survived.

    Relies on the module-level names ``model``, ``distanceWeights`` and
    ``stoi_deps``.

    NOTE(review): the original indentation of this block was lost; the
    nesting below is the most plausible reading.
    """
    label = tree.label()
    # Strip functional tags: keep only what precedes the first "-".
    if "-" in label:
        label = label[:label.index("-")]
    children = [child for child in tree]
    if type(children[0]) != nltk.tree.Tree:
        # Preterminal: exactly one non-tree child (the token) is expected.
        assert all([type(x) != nltk.tree.Tree for x in children])
        assert len(list(children)) == 1, list(children)
        for c in children:
            # Skip punctuation, empty or "-"-initial labels and "*-" tokens;
            # falling out of the loop returns None implicitly.
            if len(label) == 0 or label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"] or label[0] == "-" or "*-" in c:
                continue
            word = sentence[tree.start]["word"]
            # Report (with marker 142) tokens whose form disagrees with the
            # tree leaf once escaped slashes are normalized.
            if word != c.lower().replace("\/","/"):
                print(142, word, c.lower())
            return {"word" : word, "category" : label, "children" : None, "dependency" : "NONE"}
    else:
        assert all([type(x) == nltk.tree.Tree for x in children])
        children = [child for child in children if child.start < child.end] # remove children that consist of gaps or otherwise eliminated tokens
        # find which children seem to be dependents of which other children
        if model != "REAL_REAL":
            childDeps = [None for _ in children]
            for i in range(len(children)):
                # Arcs entering the parent that also enter child i.
                incomingFromOutside = [x for x in tree.incoming if x in children[i].incoming]
                if len(incomingFromOutside) > 0:
                    # The second element of an arc indexes ``sentence``; the
                    # last such arc decides the label.
                    childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
                    if len(incomingFromOutside) > 1:
                        print("FROM OUTSIDE", [sentence[incomingFromOutside[x][1]]["dep"] for x in range(len(incomingFromOutside))])
                for j in range(len(children)):
                    if i == j:
                        continue
                    # Arcs leaving sibling j that enter child i.
                    incomingFromJ = [x for x in children[i].incoming if x in children[j].outgoing]
                    if len(incomingFromJ) > 0:
                        if len(incomingFromJ) > 1:
                            duplicateDeps = tuple([sentence[incomingFromJ[x][1]]["dep"] for x in range(len(incomingFromJ))])
                            # ("obj", "xcomp") is the one tolerated duplicate.
                            if not (duplicateDeps == ("obj", "xcomp")):
                                print("INCOMING FROM NEIGHBOR", duplicateDeps)
                        childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
            # Every child must have received a dependency label.
            assert None not in childDeps, (childDeps, children)
            keys = childDeps
            logits = [(x, distanceWeights[stoi_deps[key]], key) for x, key in zip(children, keys)]
            # Order the children by descending weight of their dependency.
            logits = sorted(logits, key=lambda x:-x[1])
            childrenLinearized = list(map(lambda x:x[0], logits))
        else:
            # "REAL_REAL" keeps the order found in the tree.
            childrenLinearized = children
        childrenAsTrees = []
        for child in childrenLinearized:
            childrenAsTrees.append(orderSentenceRec(child, sentence, printThings, linearized))
            if childrenAsTrees[-1] is None: # this will happen for punctuation etc
                del childrenAsTrees[-1]
            else:
                childrenAsTrees[-1]["dependency"] = "Something"
        if len(childrenAsTrees) == 0:
            return None
        else:
            return {"category" : label, "children" : childrenAsTrees, "dependency" : "NONE"}
def lexicalize(tree, grup=None):
    """Lexicalize a tree at the first level and, optionally, at some group.

    Leaves are tuples; for a leaf the first element is returned. A node whose
    only child is a leaf gets that leaf's second element appended to its
    label. When ``grup`` is given, nodes labelled ``grup`` instead get the
    lemmas appended whose category starts with the character ``grup[5]``.

    Returns:
        A new ``nltk.Tree`` with the adjusted labels, or the first element of
        ``tree`` when ``tree`` is a tuple.
    """
    if isinstance(tree, tuple):
        return tree[0]

    label = tree.label()
    lexicalized = nltk.Tree(label, [])
    if len(tree) == 1 and isinstance(tree[0], tuple):
        lexicalized.set_label(u"%s-%s" % (label, tree[0][1]))
    elif grup and label == grup:
        # tree.pos() yields ((word, lemma), category) entries; keep the lemma
        # of every entry whose category is a verb.
        verb_lemmas = [
            entry[0][1] for entry in tree.pos() if entry[1][0] == grup[5]
        ]
        lexicalized.set_label(u"%s-%s" % (label, "-".join(verb_lemmas)))

    for child in tree:
        lexicalized.append(lexicalize(child, grup))
    return lexicalized
def _to_json_inner(cls, tree):
    """Convert ``tree`` into a JSON-serializable structure.

    Terminals are delegated to ``cls._terminal_to_json``; every other node
    becomes a dict holding its label under ``"nonterminal"`` and its
    recursively converted children under ``"children"``.
    """
    if cls.is_terminal(tree):
        return cls._terminal_to_json(tree)

    label = tree.label()
    converted_children = [cls._to_json_inner(child) for child in tree]
    return {"nonterminal": label, "children": converted_children}
def traverse_tree(tree):
    """Print the label and tree positions of ``tree`` and of every subtree.

    Only children whose type is exactly ``nltk.tree.Tree`` are descended
    into; leaves and other objects are ignored.
    """
    print("lable: ", tree.label())
    print("treepositions:", tree.treepositions())

    nested = (child for child in tree if type(child) == nltk.tree.Tree)
    for subtree in nested:
        traverse_tree(subtree)  # recursive call
def descendTree(tree, vocab, posFine, depsVocab):
    """Collect vocabulary statistics from a tree, in place.

    Args:
        tree: tree node to walk.
        vocab: dict mapping lower-cased word to its count; updated in place.
        posFine: set receiving the label of every node that has a leaf child.
        depsVocab: set receiving ``(parent label, child label)`` pairs.

    Leaves containing ``"*-"`` are not counted in ``vocab``.
    """
    label = tree.label()
    for child in tree:
        if type(child) != nltk.tree.Tree:
            # Leaf: record the tag and count the word.
            posFine.add(label)
            word = child.lower()
            if "*-" not in word:
                vocab[word] = 1 + vocab.get(word, 0)
            continue

        depsVocab.add((label, child.label()))
        descendTree(child, vocab, posFine, depsVocab)
def get_constituent_spans(s_statement):
    """Parse a square-bracketed tree string and list every node with its span.

    Returns:
        A list of ``[node_name, self_recurse(node)]`` entries, one per node
        returned by ``get_all_nodes_list``.
    """
    parsed = Tree.fromstring(s_statement, brackets='[]')

    # Important: reset the TreeNode static index before building a new tree.
    # A better mechanism than a static variable should be used in the future,
    # as it causes issues when instantiating multiple objects of different
    # kinds.
    TreeNode.static_index = 1

    root_node = TreeNode(parsed.label())
    traverse_tree(parsed, root_node)

    # A fresh empty list is passed to ensure a proper count of the nodes.
    return [
        [node.node_name, self_recurse(node)]
        for node in get_all_nodes_list(root_node, [])
    ]
def to_html_inner(cls, tree):
    """Render ``tree`` as nested ``div`` elements.

    Terminals are delegated to ``cls.terminal_to_html``. A nonterminal becomes
    a ``div`` with class ``snode nonterminal-<category>`` (the category being
    the lower-cased label up to the first ``-``), the full label both as text
    and as ``data-nonterminal``, and one child element per child of the tree.
    """
    if cls.is_terminal(tree):
        return cls.terminal_to_html(tree)

    label = tree.label()
    category = label.split("-")[0]
    css_class = "snode " + "nonterminal-{0}".format(category).lower()

    element = ET.Element(
        "div", attrib={"class": css_class, "data-nonterminal": label})
    element.text = label
    for child in tree:
        element.append(cls.to_html_inner(child))
    return element
def _terminal_to_json(cls, tree):
    """Convert a terminal node into a JSON-serializable dict.

    The dict holds the leaf text, the category and remaining variants parsed
    from the flat terminal label, the texts of the optional ``lemma``,
    ``exp_seg`` and ``exp_abbrev`` sub-nodes, and the flat label itself.
    """
    flat_terminal = tree.label()
    # Sub-nodes of the terminal, keyed by their label.
    extras = {
        child.label(): child for child in tree if isinstance(child, AnnoTree)
    }
    variants = split_flat_terminal(flat_terminal)

    return {
        "text": html_parens_to_parens(cls.leaf_text(tree)),
        # "cat" is moved out of the variants into its own field.
        "cat": variants.pop("cat"),
        "variants": variants,
        "lemma": html_parens_to_parens(
            cls.leaf_text(extras.get("lemma", []))),
        "exp_seg": cls.leaf_text(extras.get("exp_seg", [])),
        "exp_abbrev": cls.leaf_text(extras.get("exp_abbrev", [])),
        "terminal": flat_terminal,
    }
def get_ip_recursion_noun(tree):
    """Collect the noun chunk of every ``NP`` node found in ``tree``.

    The tree is walked recursively; the chunks of nested nodes come first,
    followed by the chunk of ``tree`` itself when it is an ``NP``.

    Args:
        tree: tree node exposing ``label()`` and iteration over its children.
            Objects without a ``label`` method (e.g. string leaves) are
            treated as leaves and contribute nothing.

    Returns:
        A list with one ``get_noun_chunk`` result per ``NP`` node.
    """
    # Leaves must end the recursion: recursing into a string would walk its
    # characters and never terminate.
    if not hasattr(tree, 'label'):
        return []

    np_list = []
    # Visit every child (not just the first three) and keep what the
    # recursive calls find instead of discarding it.
    for child in tree:
        np_list.extend(get_ip_recursion_noun(child))

    if tree.label() == 'NP':
        np_list.append(get_noun_chunk(tree))
    return np_list
def calc_top_vit_loglikelihood(p0, expansion, pcfg_split, trees):
    """Sum the log-likelihoods of ``trees`` under the given grammar scores.

    Args:
        p0: root scores, indexed by the integer category of a tree's root.
        expansion: 2-D tensor of expansion scores whose second dimension is a
            perfect square; it is viewed as ``(parent, child1, child2)``.
        pcfg_split: 2-D scores; column 0 is added for every binary rule.
        trees: iterable of trees exposing ``label()`` and ``productions()``,
            with integer-like labels and production symbols.

    Returns:
        The accumulated score over all trees as a Python number (``0`` for an
        empty ``trees``).

    NOTE(review): only the root term is divided by ``ln(10)``; the expansion
    and split terms are added unchanged -- confirm that this mix is intended.
    """
    loglikehood = 0
    with torch.no_grad():
        num_cats = int(expansion.shape[1] ** 0.5)
        expansion_3d = expansion.view(-1, num_cats, num_cats)
        for tree in trees:
            # Accumulate in the dtype and on the device of ``expansion``: an
            # integer accumulator rejects in-place float additions, and a
            # hard-coded 'cuda' device fails on CPU-only machines.
            tree_ll = expansion.new_zeros(1)
            top_a = int(tree.label())
            tree_ll += p0[top_a] / np.log(10)
            for production in tree.productions():
                rhs = production.rhs()
                if len(rhs) == 1:
                    # Unary (lexical) rules do not contribute.
                    continue
                parent = int(production.lhs().symbol())
                child1, child2 = int(rhs[0].symbol()), int(rhs[1].symbol())
                tree_ll += (expansion_3d[parent, child1, child2]
                            + pcfg_split[parent, 0])
            loglikehood += tree_ll.item()
    return loglikehood
def tree_to_leave_pos_node_span_collapse_v3(tree):
    """Walk ``tree`` breadth-first, collapsing unary chains in place.

    For every visited node a chain of single tree children is folded into the
    node (the node takes over the child's label and children). Nodes left
    with a single string child contribute their label to ``pos_tags``; all
    other nodes are recorded together with the span given by their first and
    last leaf, converted with ``int`` (the leaves are presumably replaced by
    position numbers in ``padding_leaves_wnum`` -- confirm).

    Returns:
        ``(leaves, pos_tags, nodes, spans, tree_node_lst)``, where ``leaves``
        is the leaf list taken before the walk. If no node was recorded,
        ``nodes`` holds the root label and ``spans`` a single span covering
        all leaves.
    """
    leaves = tree.leaves()
    padding_leaves_wnum(leaves, tree)

    pos_tags, spans, tree_node_lst = [], [], []
    pending = queue.Queue()
    pending.put(tree)
    while not pending.empty():
        node = pending.get()

        # Fold unary chains: X -> Y -> ... becomes the innermost node.
        while len(node) == 1 and isinstance(node[0], nltk.Tree):
            only_child = node[0]
            node.set_label(only_child.label())
            node[0:] = list(only_child)

        if len(node) == 1 and isinstance(node[0], str):
            # Preterminal: only its tag is of interest.
            pos_tags.append(node.label())
            continue

        covered = node.leaves()
        tree_node_lst.append(node)
        spans.append([int(covered[0]), int(covered[-1])])

        for child in node:
            if isinstance(child, nltk.Tree):
                pending.put(child)

    nodes = [kept.label() for kept in tree_node_lst]
    if not nodes:
        nodes = [tree.label()]
        spans = [[0, len(leaves) - 1]]
    return leaves, pos_tags, nodes, spans, tree_node_lst
def is_terminal(cls, tree):
    """Tell whether ``tree`` is a terminal: an ``AnnoTree`` with a lower-case label."""
    if not isinstance(tree, AnnoTree):
        return False
    return tree.label().islower()
def terminal_to_html(cls, tree):
    """Render a terminal node as a ``div`` element.

    The element carries one ``data-<key>`` attribute per part of the flat
    terminal label plus the token text, lemma, expansion text and the label
    itself. Its children are ``span`` elements for the token text and, when
    present, for the lemma and for the expanded abbreviation or segment. If
    both kinds of expansion exist, the abbreviation takes precedence.
    """
    flat_terminal = tree.label()
    token_text = cls.leaf_text(tree)
    # Sub-nodes of the terminal, keyed by their label.
    extras = {
        child.label(): child for child in tree if isinstance(child, AnnoTree)
    }

    lemma = cls.leaf_text(extras["lemma"]) if "lemma" in extras else None

    seg_kind = None
    seg_text = None
    for kind in ("exp_abbrev", "exp_seg"):
        if kind in extras:
            seg_kind = kind
            seg_text = cls.leaf_text(extras[kind])
            break

    parts = split_flat_terminal(flat_terminal)
    terminal_class = "terminal-{0}".format(parts["cat"]).lower()
    if lemma:
        lemma = html_parens_to_parens(lemma)
    token_text = html_parens_to_parens(token_text)

    attrib = {}
    for key, value in parts.items():
        attrib["data-" + key] = value
    attrib["class"] = "snode " + terminal_class
    attrib["data-text"] = token_text
    attrib["data-lemma"] = lemma if lemma else ""
    attrib["data-seg"] = ""
    attrib["data-abbrev"] = ""
    attrib["data-terminal"] = flat_terminal
    if seg_kind == "exp_abbrev":
        attrib["data-abbrev"] = seg_text
    elif seg_kind == "exp_seg":
        attrib["data-seg"] = seg_text

    snode = ET.Element("div", attrib=attrib)
    snode.text = flat_terminal

    word_span = ET.SubElement(snode, "span", attrib={"class": "wnode"})
    word_span.text = token_text

    if lemma:
        lemma_span = ET.SubElement(
            snode, "span", attrib={"class": "wnode lemma-node"})
        lemma_span.text = lemma

    if seg_kind is not None:
        if seg_kind == "exp_seg":
            seg_class = "exp-seg-node"
        else:
            seg_class = "exp-abbrev-node"
        seg_span = ET.SubElement(
            snode, "span", attrib={"class": "wnode " + seg_class})
        seg_span.text = seg_text

    return snode
def prune(tree, start, end=None):
    """Return a new tree with the label of ``tree`` and a slice of its children.

    Args:
        tree: source tree.
        start: index of the first child to keep.
        end: index one past the last child to keep; ``None`` keeps everything
            from ``start`` on.
    """
    label = tree.label()
    kept = tree[start:] if end is None else tree[start:end]
    return nltk.tree.Tree(label, children=kept)
def posMatches(tree, matcher):
    """Check the label of ``tree`` against ``matcher`` via ``str_.matches``.

    Returns ``False`` for anything that is not an ``nltk.tree.Tree``.
    """
    if not isinstance(tree, nltk.tree.Tree):
        return False
    return str_.matches(tree.label(), matcher)
def orderSentenceRec(tree, sentence, printThings, linearized):
    """Convert an annotated constituency tree into nested dicts, counting RCs.

    Works like a tree-to-dict conversion that keeps the original child order
    and labels every child with its dependency relation. On the way, ``SBAR``
    nodes that look like subject relative clauses update the module-level
    counters ``totalCountRCs`` and ``totalCountObjectIsLast`` and print their
    running ratio.

    Args:
        tree: tree node that, besides the usual tree API, carries ``start``,
            ``end``, ``incoming`` and ``outgoing`` attributes (presumably
            attached by the caller -- confirm).
        sentence: indexable collection of token dicts with at least the keys
            ``"word"``, ``"dep"`` and ``"head"``.
        printThings: only forwarded to the recursive calls.
        linearized: only forwarded to the recursive calls.

    Returns:
        For a preterminal, a dict with ``word``/``category``/``children``/
        ``dependency`` or ``None`` when the token is skipped; for an inner
        node, a dict holding the converted children.

    NOTE(review): the original indentation of this block was lost; the
    nesting below is the most plausible reading.
    """
    global totalCountRCs
    global totalCountObjectIsLast
    label = tree.label()
    # Labels ending in a digit carry an index suffix; cut at the last "-".
    if label[-1] in "1234567890":
        label = label[:label.rfind("-")]
    children = [child for child in tree]
    if type(children[0]) != nltk.tree.Tree:
        # Preterminal: exactly one non-tree child (the token) is expected.
        assert all([type(x) != nltk.tree.Tree for x in children])
        assert len(list(children)) == 1, list(children)
        for c in children:
            # Skip punctuation, "-"-initial labels and "*-" tokens; falling
            # out of the loop returns None implicitly.
            if label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"
                         ] or label[0] == "-" or "*-" in c:
                continue
            word = sentence[tree.start]["word"]
            # Report (with marker 142) tokens whose form disagrees with the
            # tree leaf once escaped slashes are normalized.
            if word != c.lower().replace("\/", "/"):
                print(142, word, c.lower())
            return {
                "word": word,
                "category": label,
                "children": None,
                "dependency": "NONE"
            }
    else:
        assert all([type(x) == nltk.tree.Tree for x in children])
        children = [
            child for child in children if child.start < child.end
        ]  # remove children that consist of gaps or otherwise eliminated tokens
        # find which children seem to be dependents of which other children
        # (the ``True or`` makes this branch unconditional)
        if True or model != "REAL_REAL":
            childDeps = [None for _ in children]
            childHeads = [None for _ in children]
            for i in range(len(children)):
                # Arcs entering the parent that also enter child i.
                incomingFromOutside = [
                    x for x in tree.incoming if x in children[i].incoming
                ]
                if len(incomingFromOutside) > 0:
                    # The second element of an arc indexes ``sentence``; the
                    # last such arc decides label and head.
                    childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
                    childHeads[i] = sentence[incomingFromOutside[-1]
                                             [1]]["head"]
                    if len(incomingFromOutside) > 1:
                        print("FROM OUTSIDE", [
                            sentence[incomingFromOutside[x][1]]["dep"]
                            for x in range(len(incomingFromOutside))
                        ])
                for j in range(len(children)):
                    if i == j:
                        continue
                    # Arcs leaving sibling j that enter child i.
                    incomingFromJ = [
                        x for x in children[i].incoming
                        if x in children[j].outgoing
                    ]
                    if len(incomingFromJ) > 0:
                        if len(incomingFromJ) > 1:
                            duplicateDeps = tuple([
                                sentence[incomingFromJ[x][1]]["dep"]
                                for x in range(len(incomingFromJ))
                            ])
                            # ("obj", "xcomp") is the one tolerated duplicate.
                            if not (duplicateDeps == ("obj", "xcomp")):
                                print("INCOMING FROM NEIGHBOR", duplicateDeps)
                        childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
                        childHeads[i] = sentence[incomingFromJ[-1][1]]["head"]
            # Every child must have received a dependency label.
            assert None not in childDeps, (childDeps, children)
            keys = childDeps
            childrenLinearized = children
        childrenAsTrees = []
        for child, dependency in zip(children, childDeps):
            childrenAsTrees.append(
                orderSentenceRec(child, sentence, printThings, linearized))
            if childrenAsTrees[
                    -1] is None:  # this will happen for punctuation etc
                del childrenAsTrees[-1]
            else:
                childrenAsTrees[-1]["dependency"] = dependency
        if label == "SBAR":
            if len(childrenAsTrees) > 1:
                if len(childrenAsTrees
                       ) == 2 and childrenAsTrees[0]["category"] in [
                           "IN", "WHNP"
                       ] and childrenAsTrees[1][
                           "category"] == "S" and childrenAsTrees[1][
                               "dependency"] == "acl:relcl":  # Relative clause
                    if childrenAsTrees[0][
                            "dependency"] == "nsubj":  # SUBJECT Relatives
                        # NOTE(review): childHeads is indexed by position among
                        # the filtered children, childrenAsTrees by position
                        # among the surviving ones -- verify they line up.
                        if sentence[childHeads[1] -
                                    1]["dep"] in ["nsubj", "obj"]:
                            if sentence[childHeads[1] - 1]["dep"] == "nsubj":
                                # Leaves without "*T*" / "*U*" tokens (not
                                # used further below).
                                leaves = [
                                    x for x in tree.leaves()
                                    if not (x.startswith("*T*")
                                            or x.startswith("*U*"))
                                ]
                                # Position of the first VP inside the clause.
                                firstVP = [
                                    x["category"]
                                    for x in childrenAsTrees[1]["children"]
                                ].index("VP")
                                childrenInTheVP = [
                                    x["category"] for x in childrenAsTrees[1]
                                    ["children"][firstVP]["children"]
                                ]
                                print("CHILDREN IN THE VP", childrenInTheVP)
                                # Verb directly followed by an NP object:
                                # count the clause, and count separately the
                                # cases where nothing follows the object.
                                if len(childrenInTheVP
                                       ) > 1 and childrenInTheVP[0].startswith(
                                           "VB"
                                       ) and childrenInTheVP[1] == "NP":
                                    totalCountRCs += 1
                                    totalCountObjectIsLast += (
                                        1 if len(childrenInTheVP) == 2 else 0)
                                    print(totalCountObjectIsLast /
                                          float(totalCountRCs), totalCountRCs
                                          )  # about 63%
                                    # What follows the relative clause?
        return {
            "category": label,
            "children": childrenAsTrees,
            "dependency": "NONE"
        }
def tree2dict(tree):
    """Convert a tree into nested single-key dicts.

    Each node becomes ``{label: [children]}``; ``Tree`` children are converted
    recursively, all other children are kept unchanged.
    """
    label = tree.label()
    converted = []
    for child in tree:
        if isinstance(child, Tree):
            converted.append(tree2dict(child))
        else:
            converted.append(child)
    return {label: converted}
def get_noun_chunk(tree):
    """Return the joined leaves of an ``NP`` node as a one-element list.

    For any other label the result is an empty list.
    """
    if tree.label() != "NP":
        return []
    return [''.join(tree.leaves())]
def get_noun_chunk(tree):
    """Return the leaves of an ``NP`` node joined into one string.

    Nodes with any other label yield ``None``.
    """
    if tree.label() != 'NP':
        return None
    return ''.join(tree.leaves())
def orderSentenceRec(tree, sentence, printThings, linearized, order="mixed"):
    """Linearize an annotated constituency tree into ``linearized``.

    Tokens are appended to ``linearized`` as ``{"word", "posFine"}`` dicts in
    the order in which the (possibly reordered) tree is walked.

    Args:
        tree: tree node that, besides the usual tree API, carries ``start``,
            ``end``, ``incoming`` and ``outgoing`` attributes (presumably
            attached by the caller -- confirm).
        sentence: indexable collection of token dicts with at least the keys
            ``"word"`` and ``"dep"``.
        printThings: only forwarded to the recursive calls.
        linearized: output list, extended in place.
        order: ``"VS"`` always tries to move the subject behind the
            predicate, ``"mixed"`` does so when ``random() > 0.5``; any other
            value never does.

    Returns:
        None.

    Relies on the module-level names ``model``, ``distanceWeights``,
    ``stoi_deps`` and ``random``.

    NOTE(review): the original indentation of this block was lost. The
    subject-reversal step is placed inside the ``REAL_REAL`` branch because it
    indexes ``childrenLinearized`` by positions in the original child order,
    which only matches there -- confirm against the original file.
    """
    label = tree.label()
    children = [child for child in tree]
    if type(children[0]) != nltk.tree.Tree:
        # Preterminal: all children are tokens.
        assert all([type(x) != nltk.tree.Tree for x in children])
        for c in children:
            # Skip punctuation, "-"-initial labels and "*-" tokens.
            if label in ["'", ":", "``", ",", "''", "#", ".", "-NONE-"] or label[0] == "-" or "*-" in c:
                continue
            word = sentence[tree.start]["word"]
            # Report (with marker 142) tokens whose form disagrees with the
            # tree leaf once escaped slashes are normalized.
            if word != c.lower().replace("\/","/"):
                print(142, word, c.lower())
            linearized.append({"word" : word, "posFine" : label})
    else:
        assert all([type(x) == nltk.tree.Tree for x in children])
        children = [child for child in children if child.start < child.end] # remove children that consist of gaps or otherwise eliminated tokens
        # find which children seem to be dependents of which other children
        if model != "REAL_REAL":
            childDeps = [None for _ in children]
            for i in range(len(children)):
                # Arcs entering the parent that also enter child i.
                incomingFromOutside = [x for x in tree.incoming if x in children[i].incoming]
                if len(incomingFromOutside) > 0:
                    childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
                    if len(incomingFromOutside) > 1:
                        print("FROM OUTSIDE", [sentence[incomingFromOutside[x][1]]["dep"] for x in range(len(incomingFromOutside))])
                for j in range(len(children)):
                    if i == j:
                        continue
                    # Arcs leaving sibling j that enter child i.
                    incomingFromJ = [x for x in children[i].incoming if x in children[j].outgoing]
                    if len(incomingFromJ) > 0:
                        if len(incomingFromJ) > 1:
                            duplicateDeps = tuple([sentence[incomingFromJ[x][1]]["dep"] for x in range(len(incomingFromJ))])
                            # ("obj", "xcomp") is the one tolerated duplicate.
                            if not (duplicateDeps == ("obj", "xcomp")):
                                print("INCOMING FROM NEIGHBOR", duplicateDeps)
                        childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
            # Every child must have received a dependency label.
            assert None not in childDeps, (childDeps, children)
            keys = childDeps
            logits = [(x, distanceWeights[stoi_deps[key]]) for x, key in zip(children, keys)]
            # Order the children by descending weight of their dependency.
            logits = sorted(logits, key=lambda x:-x[1])
            # NOTE(review): under Python 3 this is a one-shot iterator, not a
            # list; the other variant of this function wraps it in list().
            childrenLinearized = map(lambda x:x[0], logits)
        else:
            # Same dependency labelling as above; the order of the children
            # is kept as found in the tree.
            childDeps = [None for _ in children]
            for i in range(len(children)):
                incomingFromOutside = [x for x in tree.incoming if x in children[i].incoming]
                if len(incomingFromOutside) > 0:
                    childDeps[i] = sentence[incomingFromOutside[-1][1]]["dep"]
                    if len(incomingFromOutside) > 1:
                        print("FROM OUTSIDE", [sentence[incomingFromOutside[x][1]]["dep"] for x in range(len(incomingFromOutside))])
                for j in range(len(children)):
                    if i == j:
                        continue
                    incomingFromJ = [x for x in children[i].incoming if x in children[j].outgoing]
                    if len(incomingFromJ) > 0:
                        if len(incomingFromJ) > 1:
                            duplicateDeps = tuple([sentence[incomingFromJ[x][1]]["dep"] for x in range(len(incomingFromJ))])
                            if not (duplicateDeps == ("obj", "xcomp")):
                                print("INCOMING FROM NEIGHBOR", duplicateDeps)
                        childDeps[i] = sentence[incomingFromJ[-1][1]]["dep"]
            assert None not in childDeps, (childDeps, children)
            keys = childDeps
            # Alias, not a copy: the swaps below also reorder ``children``.
            childrenLinearized = children
            REVERSE_SUBJECT = (order == "VS" or (order == "mixed" and random() > 0.5))
            if REVERSE_SUBJECT:
                if "nsubj" in childDeps and len(childDeps) > 1:
                    labels = [x.label() for x in children]
                    if "NP-SBJ" in str(labels):
                        hasReversed = False
                        # Move each NP-SBJ behind the predicate that follows
                        # it; ``labels`` is rotated in step so that later
                        # iterations see the new order.
                        for i in range(len(children)-1):
                            if labels[i].startswith("NP-SBJ") and labels[i+1].startswith("VP"):
                                childrenLinearized[i], childrenLinearized[i+1] = childrenLinearized[i+1], childrenLinearized[i]
                                labels[i], labels[i+1] = labels[i+1], labels[i]
                                hasReversed=True
                            elif labels[i].startswith("NP-SBJ") and labels[i+1].startswith("NP-PRD"):
                                childrenLinearized[i], childrenLinearized[i+1] = childrenLinearized[i+1], childrenLinearized[i]
                                labels[i], labels[i+1] = labels[i+1], labels[i]
                                hasReversed=True
                            elif labels[i].startswith("NP-SBJ") and labels[i+1].startswith("ADJP-PRD"):
                                childrenLinearized[i], childrenLinearized[i+1] = childrenLinearized[i+1], childrenLinearized[i]
                                labels[i], labels[i+1] = labels[i+1], labels[i]
                                hasReversed=True
                            # The VP may be up to several positions away; the
                            # subject is rotated behind it.
                            elif i < len(children)-2 and labels[i].startswith("NP-SBJ") and labels[i+2].startswith("VP"):
                                childrenLinearized[i], childrenLinearized[i+1], childrenLinearized[i+2] = childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i]
                                labels[i], labels[i+1], labels[i+2] = labels[i+1], labels[i+2], labels[i]
                                hasReversed=True
                            elif i < len(children)-3 and labels[i].startswith("NP-SBJ") and labels[i+3].startswith("VP"):
                                childrenLinearized[i], childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3] = childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i]
                                labels[i], labels[i+1], labels[i+2], labels[i+3] = labels[i+1], labels[i+2], labels[i+3], labels[i]
                                hasReversed=True
                            elif i < len(children)-4 and labels[i].startswith("NP-SBJ") and labels[i+4].startswith("VP"):
                                childrenLinearized[i], childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i+4] = childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i+4], childrenLinearized[i]
                                labels[i], labels[i+1], labels[i+2], labels[i+3], labels[i+4] = labels[i+1], labels[i+2], labels[i+3], labels[i+4], labels[i]
                                hasReversed=True
                            # NOTE(review): this branch tests labels[i+4]
                            # although it rotates six elements; as written it
                            # can never fire because the previous branch
                            # catches the same condition -- labels[i+5] was
                            # probably intended.
                            elif i < len(children)-5 and labels[i].startswith("NP-SBJ") and labels[i+4].startswith("VP"):
                                childrenLinearized[i], childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i+4], childrenLinearized[i+5] = childrenLinearized[i+1], childrenLinearized[i+2], childrenLinearized[i+3], childrenLinearized[i+4], childrenLinearized[i+5], childrenLinearized[i]
                                labels[i], labels[i+1], labels[i+2], labels[i+3], labels[i+4], labels[i+5] = labels[i+1], labels[i+2], labels[i+3], labels[i+4], labels[i+5], labels[i]
                                hasReversed=True
                        # Report subjects that were neither moved nor already
                        # behind a verb.
                        if not hasReversed and not "VP NP-SBJ" in " ".join(labels) and not "VBZ NP-SBJ" in " ".join(labels) and not "VB NP-SBJ" in " ".join(labels):
                            print((childDeps, [x.incoming for x in children], [x.outgoing for x in children], label, [x.label() for x in children]))
        for child in childrenLinearized:
            orderSentenceRec(child, sentence, printThings, linearized, order=order)
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could become necessary to
        determine e.g. token offsets for specific file formats. If it never
        gets used, this function should probably become the generic default
        and be moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    sentences = corenlp_doc.get_ordered_sentence_list()

    # Raw tokens, grouped by the id of their sentence.
    toks_by_sentence = defaultdict(list)
    for raw_tok in corenlp_doc.get_ordered_token_list():
        toks_by_sentence[raw_tok['s_id']].append(raw_tok)

    # Educe tokens: sentence id -> token id -> CoreNlpToken.
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sent_id = sent['id']
        offset = 0  # was: sent_begin
        for raw_tok in toks_by_sentence[sent_id]:
            tok_id = raw_tok['id']
            educe_tokens[sent_id][tok_id] = CoreNlpToken(raw_tok, offset)

    # Per sentence: sorted tokens, constituency tree and dependency tree.
    all_tokens, all_ctrees, all_dtrees = [], [], []
    for sent in sentences:
        sent_id = sent['id']
        tokens_dict = educe_tokens[sent_id]
        # Sort the tokens by their (integer) local id.
        ordered_ids = sorted(tokens_dict, key=tok_lid(sent_id))
        sorted_tokens = [tokens_dict[tok_id] for tok_id in ordered_ids]

        # Constituency tree.
        parse = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB.
        # Maybe we'd better add ROOT to the empty parentheses in the PTB
        # version, but just getting rid of ROOT here seems simpler: the type
        # of the root node of a tree is informative: usually S, but more
        # interestingly SINV, NP...
        if parse.label() != 'ROOT' or len(parse) > 1:
            print(parse)
            raise ValueError('Atypical root of CoreNLP tree')
        real_root = parse[0]  # go down from ROOT to the real root
        all_tokens_for_sentence = sorted_tokens
        educe_ctree = ConstituencyTree.build(real_root, all_tokens_for_sentence)

        # Dependency tree: governor id -> [(label, dependent id), ...].
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sent_id + '-0')

        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # Coreference chains.
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sent_id = mntn['sentence']
            # Helpers mapping between local and global token ids.
            to_local = tok_lid(sent_id)
            to_global = tok_gid(sent_id)
            # Retrieve the tokens covered by this mention.
            first = to_local(mntn['start'])
            last = to_local(mntn['end'])
            tokens = [
                educe_tokens[sent_id][to_global(tok_idx)]
                for tok_idx in range(first, last)
            ]
            head = educe_tokens[sent_id][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_ctrees, all_dtrees, all_chains)
def tree2dict(tree):
    """Turn a tree into a ``{label: children}`` dict, recursing into subtrees.

    Children that are not ``Tree`` instances are copied over unchanged.
    """
    def convert(child):
        if isinstance(child, Tree):
            return tree2dict(child)
        return child

    label = tree.label()
    return {label: list(map(convert, tree))}
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could become necessary to
        determine e.g. token offsets for specific file formats. If it never
        gets used, this function should probably become the generic default
        and be moved to `educe.external.corenlp`).

    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information
    """
    sentences = corenlp_doc.get_ordered_sentence_list()

    # Raw tokens, grouped by the id of their sentence.
    raw_by_sentence = defaultdict(list)
    for raw_tok in corenlp_doc.get_ordered_token_list():
        raw_by_sentence[raw_tok['s_id']].append(raw_tok)

    # Educe tokens: sentence id -> token id -> CoreNlpToken.
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sent_id = sent['id']
        offset = 0  # was: sent_begin
        for raw_tok in raw_by_sentence[sent_id]:
            tok_id = raw_tok['id']
            educe_tokens[sent_id][tok_id] = CoreNlpToken(raw_tok, offset)

    # Per sentence: sorted tokens, constituency tree and dependency tree.
    all_tokens, all_ctrees, all_dtrees = [], [], []
    for sent in sentences:
        sent_id = sent['id']
        tokens_dict = educe_tokens[sent_id]
        # A token id is the sentence id, one separator character and the
        # integer local id; sort the tokens by that local id.
        prefix_len = len(sent_id) + 1
        ordered_ids = sorted(
            tokens_dict, key=lambda tok_id: int(tok_id[prefix_len:]))
        sorted_tokens = [tokens_dict[tok_id] for tok_id in ordered_ids]

        # Constituency tree.
        parse = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB.
        # Maybe we'd better add ROOT to the empty parentheses in the PTB
        # version, but just getting rid of ROOT here seems simpler: the type
        # of the root node of a tree is informative: usually S, but more
        # interestingly SINV, NP...
        if parse.label() != 'ROOT' or len(parse) > 1:
            print(parse)
            raise ValueError('Atypical root of CoreNLP tree')
        real_root = parse[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(real_root, sorted_tokens)

        # Dependency tree: governor id -> [(label, dependent id), ...].
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sent_id + '-0')

        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # Coreference chains.
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sent_id = mntn['sentence']
            prefix_len = len(sent_id) + 1
            # Local (integer) ids of the mention boundaries.
            first = int(mntn['start'][prefix_len:])
            last = int(mntn['end'][prefix_len:])
            # Retrieve the tokens of this mention by their global ids.
            tokens = [
                educe_tokens[sent_id][sent_id + '-' + str(tok_idx)]
                for tok_idx in range(first, last)
            ]
            head = educe_tokens[sent_id][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_ctrees, all_dtrees, all_chains)