def parsed(element):
    """Convert an XML sentence element into an NLTK tree.

    Elements with children become interior nodes labelled by their tag;
    childless elements become preterminal leaves labelled by their 'pos'
    or 'ne' attribute ('unk' if neither), unless marked elliptic, in
    which case None is returned and the node is dropped by its parent.
    """
    if not element:
        # No subelements: this is a terminal node.
        if element.get('elliptic') == 'yes':
            return None
        label = element.get('pos') or element.get('ne') or 'unk'
        return tree.Tree(label, [element.get('wd')])
    # Recurse into the children, dropping elliptic (None) results.
    children = [sub for sub in map(parsed, element) if sub is not None]
    return tree.Tree(element.tag, children)
def btree_xrc(relation_list, words): """ 构建二叉树 :param relation_list: 依存关系集合 :param words: 原始词集合 :return: 二叉树 """ # # for relation in relation_list: # print relation.mw, " - ", relation.relation, " - ", relation.cw # while len(relation_list) >0: # relation_dict = {} # count = 1 # for r in relation_list: # relation_dict[count] = r # count += 1 # 直接用list的下标作为编号,,因此编号从0 开始 # T = [] T_list = [] # 栈 stack = [] T = tree.Tree("root", ["", ""]) T_word = [] for w in words: if len(stack) == 0: print w stack.append(w) else: w0 = stack.pop() print w, w0 for r in relation_list: if r.mw == w and r.cw == w0: temp = tree.Tree(relation_list.index(r), [w, w0]) T_word.append([w, w0]) T = temp elif r.mw == w0 and r.cw == w: temp = tree.Tree(relation_list.index(r), [w0, w]) T_word.append([w, w0]) T = temp T.draw()
def unary_parses(self, p, t, i, j):
    """Expand a parse item with the unary "stop" rules of the DMV grammar.

    Given a subtree ``t`` spanning (i, j) with score ``p``, returns
    ``(p, t)`` plus all items derivable by sealing the head step by step:
    '>' (right-open) -> '<>' (right-stopped) -> '|' (fully sealed).
    Scores are combined with '+', so presumably they are log
    probabilities -- confirm against p_stop_left/p_stop_right.

    :param p: score of ``t``
    :param t: parse subtree whose root label is a Node
    :param i: left span boundary (passed through to the recursion)
    :param j: right span boundary (passed through to the recursion)
    :return: list of (score, tree) pairs
    """
    node = t.label()
    l_val = node.l_val
    r_val = node.r_val
    if node.mark == '|':
        # Already fully sealed: nothing further to derive.
        res = []
    elif node.mark == '<>':
        # Stop on the left side, producing the fully sealed item.
        p2 = self.p_stop_left(node.word, l_val, self.harmonic) + p
        t2 = tree.Tree(Node('|', node.word, node.index, l_val, r_val), [t])
        res = [(p2, t2)]
    elif node.mark == '>':
        # Stop on the right side, then keep sealing recursively.
        p2 = self.p_stop_right(node.word, r_val, self.harmonic) + p
        t2 = tree.Tree(Node('<>', node.word, node.index, l_val, r_val), [t])
        res = self.unary_parses(p2, t2, i, j)
    # NOTE(review): any mark other than '|', '<>', '>' leaves ``res``
    # unbound and raises NameError below -- confirm marks are closed-set.
    return [(p, t)] + res
def parsed(element):
    """Converts a 'sentence' XML element (xml.etree.ElementTree.Element)
    to an NLTK tree.

    element -- the XML sentence element (or a subelement)
    """
    if not element:
        # Terminal: no child elements. Elliptic nodes without a word
        # are dropped (signalled by returning None).
        if element.get('elliptic') == 'yes' and not element.get('wd'):
            return None
        label = element.get('pos') or element.get('ne') or 'unk'
        return tree.Tree(label, [element.get('wd')])
    # Non-terminal: convert each child (recursive call here!) and keep
    # only the ones that were not elided.
    kept = []
    for child in element:
        sub = parsed(child)
        if sub is not None:
            kept.append(sub)
    return tree.Tree(element.tag, kept)
def gold_pos_strategy(line, abp_domain_size, gold_pos_dict=None, **kwargs):
    """Build a random binary tree over ``line`` with gold POS preterminals.

    A random binary skeleton with len(line) leaf slots is grown by
    repeatedly replacing a random leaf with a fresh binary node; each
    leaf is then overwritten with a (gold-POS word) preterminal.

    :param line: sequence of words
    :param abp_domain_size: nonterminal labels are drawn uniformly from
        1..abp_domain_size
    :param gold_pos_dict: required mapping word -> gold POS label
    :return: the resulting tree
    """
    if gold_pos_dict is None:
        raise Exception(
            "Gold pos dictionary must be provided when using the gold pos strategy!"
        )
    filler = '-REPLACE-'

    def _random_binary():
        # Fresh binary node with a random nonterminal label.
        return tree.Tree(str(random.randint(1, abp_domain_size)),
                         [filler, filler])

    skeleton = _random_binary()
    # Each pass replaces one leaf with a binary node, adding one leaf slot.
    for _ in range(len(line) - 2):
        leaf_slots = skeleton.treepositions('leaves')
        target = random.choice(leaf_slots)
        skeleton[target] = _random_binary()
    # Overwrite every placeholder leaf with (gold POS -> word).
    for idx, pos in enumerate(skeleton.treepositions('leaves')):
        skeleton[pos] = tree.Tree(str(gold_pos_dict[line[idx]]), [line[idx]])
    return skeleton
def constree(self):
    """Return the constituency tree for this dependency graph.

    Some depgraphs have several roots (for instance, 512th of Turkish);
    those are joined under an artificial 'TOP' node.
    """
    roots = self.nodelist[0]['deps']
    if len(roots) != 1:
        # TODO: check projectivity here also.
        subtrees = [self._constree(r) for r in roots]
        return treebank.Tree(tree.Tree('TOP', subtrees))
    return treebank.Tree(self._constree(roots[0]))
def to_tensor(tree: tree_mod.Tree):
    """
    Maps a tree of int(s) to a tree of torch.LongTensor(s) which contain
    the same values.
    :param tree:
    :return:
    """
    label_tensor = torch.LongTensor([int(tree.label())])
    converted_children = [to_tensor(child) for child in tree]
    return tree_mod.Tree(label_tensor, converted_children)
def to_numbers(tree: tree_mod.Tree, interner: interners.Interner):
    """
    Maps a tree to a tree of int(s) which are the numbers assigned to the
    node labels by the interner.
    :param tree:
    :param interner:
    :return:
    """
    # Intern the parent label before descending so that ids are assigned
    # in pre-order, matching the original traversal.
    code = interner(tree.label().strip())
    numbered_children = [to_numbers(child, interner) for child in tree]
    return tree_mod.Tree(code, numbered_children)
def recursion(t, f):
    """Recursively keep only the subtrees accepted by predicate ``f``.

    Non-Tree values (leaves) pass through unchanged.  A node whose
    children are all filtered out collapses to its bare label.
    """
    if not isinstance(t, tree.Tree):
        return t
    kept = [recursion(child, f) for child in t if f(child)]
    return tree.Tree(t.label(), kept) if kept else t.label()
def recursion(t, f):
    """Recursively keep only the subtrees accepted by predicate ``f``.

    Height-2 nodes (a POS tag directly over a word) are treated as
    terminals and returned untouched.
    """
    # terminals are pos tags
    if t.height() == 2:
        return t
    kept = [recursion(child, f) for child in t if f(child)]
    if not kept:
        # ideally, subtrees cannot be empty
        return t.label()
    return tree.Tree(t.label(), kept)
def createTree1(self, tags):
    """Build a crude S -> (phrase, phrase) tree from (word, POS) pairs.

    Walks ``tags`` in reverse, grouping consecutive VERB/ADV words into
    VP buffers and NOUN/ADJ words into NP buffers; whenever the category
    flips, the finished buffer is closed into a subtree and nested under
    the next one.  The result is stored in ``self.myTree``.

    Bug fixes vs. the original:
    - ``NULL`` is not defined in Python; it raised NameError on the very
      first assignment.  Replaced with ``None`` (compared via ``is``).
    - The word buffer was handed to ``tree.Tree`` and then ``.clear()``ed,
      which emptied the children of every previously built subtree (lists
      are shared by reference).  Each subtree now receives a copy.

    :param tags: sequence of (word, coarse_pos) pairs
    """
    isVerb = False
    tempList = []
    curTree = None
    prevTree = None
    for words in reversed(tags):
        # start creating a VP branch
        if words[1] == 'VERB' or words[1] == 'ADV':
            if isVerb:
                # continuing on current branch
                tempList.append(words[0])
            else:
                # finish the previous branch so that we can start the VP branch
                if prevTree is None and curTree is None:
                    curTree = tree.Tree('NP', list(tempList))
                else:
                    prevTree = curTree
                    curTree = tree.Tree('NP', [list(tempList), prevTree])
                isVerb = True
                tempList.clear()
                tempList.append(words[0])
        # start creating a NP branch
        elif words[1] == 'NOUN' or words[1] == 'ADJ':
            if not isVerb:
                # continuing on current branch
                tempList.append(words[0])
            else:
                # finish the previous branch so that we can start the NP branch
                if prevTree is None and curTree is None:
                    curTree = tree.Tree('VP', list(tempList))
                else:
                    prevTree = curTree
                    curTree = tree.Tree('VP', [list(tempList), prevTree])
                isVerb = False
                tempList.clear()
                tempList.append(words[0])
        else:
            # Other tags attach to whichever branch is currently open.
            tempList.append(words[0])
    # Flush the final buffer as the last branch.
    prevTree = curTree
    curTree = tree.Tree('VP' if isVerb else 'NP', list(tempList))
    self.myTree = tree.Tree('S', [curTree, prevTree])
def filter_subtrees(self, f):
    """Prune this tree in place, keeping only subtrees accepted by ``f``.

    The tree is re-initialized with the pruned structure; a root that
    loses all its children is rebuilt as a childless node.
    """
    def prune(node):
        if not isinstance(node, tree.Tree):
            return node
        kept = [prune(child) for child in node if f(child)]
        # A node left childless collapses to its bare label.
        return tree.Tree(node.label(), kept) if kept else node.label()

    pruned = prune(self)
    if isinstance(pruned, tree.Tree):
        self.__init__(pruned, self.labels)
    else:
        self.__init__(tree.Tree(pruned, []), self.labels)
def creat_tree(mw, cw, relation, id):
    """Build a binary subtree for a single dependency relation.

    :param mw: head (governor) word
    :param cw: dependent (child) word
    :param relation: dependency relation label, e.g. "VOB"
    :param id: label used for the new tree node
    :return: a Tree with children [cw, mw] for "VOB" relations; None for
        every other relation type (they are not handled yet).
    """
    if relation == "VOB":
        return tree.Tree(id, [cw, mw])
    # Explicit fallthrough: the original returned None implicitly and
    # carried a block of dead, commented-out scratch code.
    return None
def filter_subtrees(self, f):
    """Prune this tree in place, keeping only subtrees accepted by ``f``.

    Height-2 nodes (a POS tag over a word) count as terminals and are
    never descended into.  The tree is re-initialized with the pruned
    structure; a root that loses all children becomes a childless node.
    """
    def prune(node):
        # terminals are pos tags
        if node.height() == 2:
            return node
        kept = [prune(child) for child in node if f(child)]
        # ideally, subtrees cannot be empty
        return tree.Tree(node.label(), kept) if kept else node.label()

    pruned = prune(self)
    if isinstance(pruned, tree.Tree):
        self.__init__(pruned, self.labels)
    else:
        self.__init__(tree.Tree(pruned, []), self.labels)
def test_queryVersionCookie(self):
    """Exercise util.queryVersionCookie on valid, invalid and empty input."""
    version_string = "( (VERSION (FORMAT dash) (FOO (BAR baz))))"
    # Simple key lookup and dotted-path lookup.
    self.assertEqual(util.queryVersionCookie(version_string, "FORMAT"), "dash")
    self.assertEqual(util.queryVersionCookie(version_string, "FOO.BAR"), "baz")
    # A key whose value has children returns the whole subtree.
    self.assertEqual(util.queryVersionCookie(version_string, "FOO"),
                     T.Tree("BAR", ["baz"]))
    # A missing key gives a null result.
    self.assertIsNone(util.queryVersionCookie(version_string, "ABC"))
    # Invalid version cookie gives null result
    invalid_version_tree = "( (FOO bar))"
    self.assertIsNone(util.queryVersionCookie(invalid_version_tree, "foo"))
    # multiple matches gives null result, only for aberrant key
    multiple_matches = "( (VERSION (FOO bar) (FOO baz) (BAR quux)))"
    self.assertIsNone(util.queryVersionCookie(multiple_matches, "FOO"))
    self.assertEqual(util.queryVersionCookie(multiple_matches, "BAR"), "quux")
    # Empty input gives null result
    self.assertIsNone(util.queryVersionCookie("", "FOO"))
    self.assertIsNone(util.queryVersionCookie(None, "FOO"))
def dep_parse(self, s):
    """Chart-based dependency parse of sentence ``s`` in log space.

    output: returned t is a nltk.tree.Tree without root node

    Scores are combined with '+', i.e. log probabilities.  Returns the
    pair (best_tree, best_log_probability); exact ties are resolved by
    the configurable ``self.choice`` strategy.
    """
    parse = {}
    # OPTIMIZATION: END considered only explicitly
    # s = s + [self.end_symbol]
    n = len(s)
    # Base case: one-word spans, created right-open ('>') and sealed
    # further by unary_parses (log 1.0 = 0 is the starting score).
    for i in range(n):
        j = i + 1
        w = str(s[i])
        t1 = tree.Tree(Node('>', w, i, 0, 0), [w])
        parse[i, j] = ParseDict(self.unary_parses(math.log(1.0), t1, i, j))
    for l in range(2, n+1):
        for i in range(n-l+1):
            j = i + l
            parse_dict = ParseDict()
            for k in range(i+1, j):
                # Combine the best items of parse[i, k] and parse[k, j].
                for (p1, t1) in parse[i, k].itervalues():
                    for (p2, t2) in parse[k, j].itervalues():
                        n1 = t1.label()
                        n2 = t2.label()
                        if n1.mark == '>' and n2.mark == '|':
                            # Left head attaches the sealed right item as a
                            # right dependent; r_val counts right children.
                            m = n1.index
                            h = n1.word
                            p = self.p_nonstop_right(h, n1.r_val, self.harmonic) + \
                                self.p_attach_right(n2.word, h, self.harmonic, n2.index - m) + \
                                p1 + p2
                            new_node = Node(n1.mark, n1.word, n1.index, n1.l_val, n1.r_val + 1)
                            t = tree.Tree(new_node, [t1, t2])
                            parse_dict.add(p, t)
                        if n1.mark == '|' and n2.mark == '<>':
                            # Right head attaches the sealed left item as a
                            # left dependent; l_val counts left children.
                            m = n2.index
                            h = n2.word
                            p = self.p_nonstop_left(h, n2.l_val, self.harmonic) + \
                                self.p_attach_left(n1.word, h, self.harmonic, m - n1.index) + \
                                p1 + p2
                            new_node = Node(n2.mark, n2.word, n2.index, n2.l_val + 1, n2.r_val)
                            t = tree.Tree(new_node, [t1, t2])
                            parse_dict.add(p, t)
            # Apply the unary stop rules to every combined item.
            parse[i, j] = ParseDict(sum((self.unary_parses(p, t, i, j) \
                                         for (p, t) in parse_dict.itervalues()), []))
    # Choose the sentence head: score each word as root attached to END.
    w = s[0]
    (p1, t1) = parse[0, n].val('|'+w+'0')
    t_max, p_max = t1, p1 + self.p_attach_left(w, self.end_symbol, self.harmonic)
    l = [(t_max, p_max)]
    for i in range(1, n):
        w = s[i]
        (p1, t1) = parse[0, n].val('|'+w+str(i))
        p = p1 + self.p_attach_left(w, self.end_symbol, self.harmonic)
        if p > p_max:
            p_max = p
            l = [(t1, p)]
        elif p == p_max:
            l += [(t1, p)]
    # Tie-breaking strategy is configurable via args.
    (t_max, p_max) = self.choice(l, self.args.choice)
    return (t_max, p_max)
# NOTE(review): this fragment reads ``stack`` before defining it -- it
# looks like the body of an iterative tree-traversal function whose
# ``def`` line is not part of this chunk; confirm against the full file.
current = ""
# Depth-first traversal: pop a node, push its children, print string leaves.
while stack:
    current = stack.pop()
    if isinstance(current, tree.Tree):
        for i in range(len(current)):
            stack.append(current[i])
    elif isinstance(current, str):
        # print "[输出] ",current
        print current

if __name__ == "__main__":
    # Build a small example tree and exercise the basic Tree API.
    C = tree.Tree("C", ["E", "F"])
    B = tree.Tree("B", [C, "D"])
    M = tree.Tree("M", ["O", "P"])
    H = tree.Tree("H", [M, "N"])
    G = tree.Tree("G", ["X", "Y"])
    A = tree.Tree("A", [G, H])
    K = tree.Tree("K", ["L", "Q"])
    root = tree.Tree("Root", [A, B, K])
    print root[0]
    print root.height()
    print len(root)
    print type(root)
    import time
def treebank_from_sentences(S):
    """Returns a treebank with sentences S and trivial trees.

    Each word x becomes a unary (x x) preterminal under a flat ROOT node.
    """
    trees = []
    for sentence in S:
        leaves = [tree.Tree(word, [word]) for word in sentence]
        trees.append(Tree(tree.Tree('ROOT', leaves)))
    return Treebank(trees)
def parsed(self, files=None):
    """Yield each saved tree wrapped under a ROOT node as a Cast3LBTree."""
    for base_tree in treebank.SavedTreebank.parsed(self, files):
        wrapped = tree.Tree('ROOT', [base_tree])
        yield Cast3LBTree(wrapped, base_tree.labels)
from State import State
def btree(relation_list, words):
    """Build a binary tree for each sentiment word.

    :param relation_list: collection of dependency relations, each with a
        head word ``mw``, a child word ``cw`` and a ``relation`` label
    :param words: collection of sentiment words
    :return: binary tree (built per word; nothing is returned yet)
    """
    # # for relation in relation_list:
    # print relation.mw, " - ", relation.relation, " - ", relation.cw
    # while len(relation_list) >0:
    T = []
    T_list = []
    for w in words:
        es = {}
        es_word = [w]  # words linked (so far) to the current sentiment word
        # Relation types are ranked; lower rank = combined earlier.
        sort_dict = {
            "VOB": 1,
            "ATT": 2,
            "ADV": 3,
            "VV": 4,
            "COO": 5,
            "SMP": 6,
            "SBV": 7,
            "CNJ": 8
        }
        # sort_dict = {
        #     "ATT": 1,
        #     "ADV": 2,
        #     "VV": 3,
        #     "COO": 4,
        #     "SMP": 5,
        #     "VOB": 6,
        #     "SBV": 7,
        #     "CNJ": 8
        # }
        # Collect the dependency relations connected to the sentiment word.
        for relation in relation_list:
            if es_word.__contains__(relation.mw) or es_word.__contains__(relation.cw):
                es_word.append(relation.mw)
                es_word.append(relation.cw)
                print relation.mw, " - ", relation.relation, " - ", relation.cw
                es[relation] = sort_dict[relation.relation]
                # relation_list.remove(relation)
        # Sort the relations in es by rank to obtain rs.
        es_sort = sorted(es.iteritems(), key=lambda d: d[1], reverse=False)
        rs = []
        for e in es_sort:
            rs.append(e[0])
        print "-----------------"
        for r in rs:
            print r.mw, " - ", r.relation, " - ", r.cw
        # (TODO in original: drop the CNJ relations from rs to get rs1.)
        # Walk the sorted relations and build the tree.
        T1 = tree.Tree("root", ["", ""])
        # All dependencies attached to this sentiment word.
        T1_word = []
        T1_list = []  # collection of subtrees
        for r in rs:
            name = r.relation
            w1 = r.cw
            w2 = r.mw
            if not T1_word.__contains__(w1) and not T1_word.__contains__(w2):
                # Case 1: neither word placed yet -- start a new subtree.
                print "[1]",w1,w2
                n1 = w1
                n2 = w2
                T1_word.append(w1)
                T1_word.append(w2)
                print T1.leaves()[1].__len__()
                if T1.leaves()[0].__len__() == 0:
                    T1 = tree.Tree(name, [n2, n1])
                    T1_list.append(T1)
                else:
                    temp = tree.Tree(name, [n2, n1])
                    T1_list.append(temp)
                    # T1 = tree.Tree(name, [T1, temp])
            elif T1_word.__contains__(w1) and not T1_word.__contains__(w2):
                # Case 2: child already placed -- hang the new head above T1.
                print "[2]", w1, w2
                T1_word.append(w2)
                n1 = T1
                n2 = w2
                T1 = tree.Tree(name, [n2, n1])
            elif T1_word.__contains__(w2) and not T1_word.__contains__(w1):
                # Case 3: head already placed.
                # NOTE(review): n1/n2 are set but no tree is built in this
                # branch -- looks unfinished; confirm intended behavior.
                print "[3]", w1, w2
                n1 = "RIGHT"
                n2 = w2
                T1_word.append(w1)
            else:
                # Case 4: both words already placed -- merge their subtrees.
                print "[4]", w1, w2
                n1 = tree.Tree("n1", ["", ""])
                n2 = tree.Tree("n2", ["", ""])
                for t in T1_list:
                    if t.leaves().__contains__(w1):
                        n1 = t
                    if t.leaves().__contains__(w2):
                        n2 = t
                T1 = tree.Tree(name, [n2, n1])
    # T1.draw()
def dep_parse(self, s): parse = {} # OPTIMIZATION: END considered only explicitly # s = s + [self.end_symbol] # solo para uso como param. de phi s2 = s + [self.end_symbol] n = len(s) for i in range(n): j = i + 1 # >w -> w # <w -> w w = s[i] # DMVCCM: multiplicar por phi: # aca da lo mismo: # phi = self.phi(i, j, s) phi = 1.0 pl = self.p_order('left', w) * phi pr = self.p_order('right', w) * phi t0 = tree.Tree(dmv.Node('<', w, i), [w]) t1 = tree.Tree(dmv.Node('>', w, i), [w]) parse[i, j] = dmv.ParseDict(self.unary_parses(pl, t0, i, j) + self.unary_parses(pr, t1, i, j)) for l in range(2, n + 1): for i in range(n - l + 1): j = i + l # tenemos parse[a, b] para todas las cosas adentro de (i, j). parse_dict = dmv.ParseDict() phi = self.phi(i, j, s2) for k in range(i + 1, j): # aqui, mejores parses entre parse[i, k] y parse[k, j] for (p1, t1) in parse[i, k].itervalues(): for (p2, t2) in parse[k, j].itervalues(): n1 = t1.node n2 = t2.node if n1.mark[0] == '>' and n2.mark == '|': m = n1.index h = n1.word # n2.index-m = distancia entre uno y otro # DMVCCM: multiplicar por phi: p = self.p_nonstop_right(h, m == k - 1) * \ self.p_attach_right(n2.word, h, n2.index - m) * \ p1 * p2 * phi t = tree.Tree(n1, [t1, t2]) parse_dict.add(p, t) if n1.mark == '|' and n2.mark[0] == '<': m = n2.index h = n2.word # m-n1.index = distancia entre uno y otro # DMVCCM: multiplicar por phi: p = self.p_nonstop_left(h, m == k) * \ self.p_attach_left(n1.word, h, m - n1.index) * \ p1 * p2 * phi t = tree.Tree(n2, [t1, t2]) parse_dict.add(p, t) # here is where the stops are generated: parse[i, j] = dmv.ParseDict(sum((self.unary_parses(p, t, i, j) \ for (p, t) in parse_dict.itervalues()), [])) # OPTIMIZATION: finally, choose the head of the sentence. 
# t_max, p_max = None, 0.0 w = s[0] (p1, t1) = parse[0, n].val('|' + w + '0') t_max, p_max = t1, p1 * self.p_attach_left(w, self.end_symbol, n) # unbiased: l = [(t_max, p_max)] for i in range(1, n): w = s[i] (p1, t1) = parse[0, n].val('|' + w + str(i)) p = p1 * self.p_attach_left(w, self.end_symbol, n - i) # aca hay bias (> seria elegir el primer head en caso de empate): # al parecer este bias solo afecta al modelo si no esta entrenado. # bias a RBRANCH: # if p > p_max: # t_max, p_max = t1, p # bias a LBRANCH: # if p >= p_max: # t_max, p_max = t1, p # unbiased: if p > p_max: p_max = p l = [(t1, p)] elif p == p_max: l += [(t1, p)] (t_max, p_max) = random.choice(l) return (t_max, p_max)
def btree_zx(relation_list): """ 构建二叉树 :param relation_list: 依存关系集合 :param words: 原始词集合 :return: 二叉树 """ # while len(relation_list) >0: # relation_dict = {} # count = 1 # for r in relation_list: # relation_dict[count] = r # count += 1 # 直接用list的下标作为编号,,因此编号从0 开始 # 栈 stack = [] list_copy = [] for r in relation_list: list_copy.append(r) T = tree.Tree("root", ["", ""]) T_list = [] T_word = [] for n in range(len(relation_list)): r = relation_list.pop() mw = r.mw cw = r.cw id = list_copy.index(r) # 如果是并列关系: COO if r.relation == "COO": print "\n[COO]---" if not T_word.__contains__(mw) and not T_word.__contains__(cw): temp = creat_tree(mw, cw, r.relation, id) T_word.extend([mw, cw]) T = temp T_list.append(T) elif T_word.__contains__(mw) and not T_word.__contains__(cw): print "[2]", mw, cw T_word.append(cw) temp = T T = tree.Tree(id, [temp, cw]) T_list.append(T) elif not T_word.__contains__(mw) and T_word.__contains__(cw): print "[3]", mw, cw T_word.append(mw) temp = T T = tree.Tree(id, [mw, temp]) T_list.append(T) else: print "[4]", mw, cw n1 = tree.Tree("n1", ["", ""]) n2 = tree.Tree("n2", ["", ""]) for t in T_list: if t.leaves().__contains__(cw): n1 = t if t.leaves().__contains__(mw): n2 = t T = tree.Tree(id, [n2, n1]) T_list.append(T) else: print "\n[不是COO]---" if not T_word.__contains__(mw) and not T_word.__contains__(cw): print "[1]", mw, cw temp = creat_tree(mw, cw, r.relation, id) T_word.extend([mw, cw]) T = temp T_list.append(T) elif T_word.__contains__(mw) and not T_word.__contains__(cw): print "[2]", mw, cw T_word.append(cw) temp = T T = tree.Tree(id, [temp, cw]) T_list.append(T) elif not T_word.__contains__(mw) and T_word.__contains__(cw): print "[3]", mw, cw T_word.append(mw) temp = T T = tree.Tree(id, [mw, temp]) T_list.append(T) else: print "[4]", mw, cw n1 = tree.Tree("n1", ["", ""]) n2 = tree.Tree("n2", ["", ""]) for t in T_list: if t.leaves().__contains__(cw): n1 = t if t.leaves().__contains__(mw): n2 = t T = tree.Tree(id, [n2, n1]) T_list.append(T) 
T.draw()
# NOTE(review): this fragment reads ``stack`` before defining it -- it
# looks like the body of an iterative tree-traversal function whose
# ``def`` line is not part of this chunk; confirm against the full file.
current = ""
# Depth-first traversal: pop a node, push its children, print string leaves.
while stack:
    current = stack.pop()
    if isinstance(current, tree.Tree):
        for i in range(len(current)):
            stack.append(current[i])
    elif isinstance(current, str):
        # print "[输出] ",current
        print (current)

if __name__ == "__main__":
    # Build a small example tree, exercise the Tree API, run the
    # traversal test and draw the result.
    C = tree.Tree("我", ["E", "F"])
    B = tree.Tree("是", [C, "D"])
    H = tree.Tree("好", ["M", "N"])
    A = tree.Tree("人", ["G", H])
    root = tree.Tree("Root", [A, B])
    print (root[0])
    print (root.height())
    print (len(root))
    print (type(root))
    test(root)
    # test_2(root)
    root.draw()
def dep_parse(self, s): parse = {} # OPTIMIZATION: END considered only explicitly # s = s + [self.end_symbol] n = len(s) for i in range(n): j = i + 1 # >w -> w # <w -> w w = s[i] p = self.p_order('left', w) t0 = tree.Tree(Node('<', w, i), [w]) t1 = tree.Tree(Node('>', w, i), [w]) parse[i, j] = ParseDict(self.unary_parses(p, t0, i, j) + \ self.unary_parses(1.0 - p, t1, i, j)) for l in range(2, n + 1): for i in range(n - l + 1): j = i + l # tenemos parse[a, b] para todas las cosas adentro de (i, j). parse_dict = ParseDict() for k in range(i + 1, j): # aqui, mejores parses entre parse[i, k] y parse[k, j] for (p1, t1) in parse[i, k].itervalues(): for (p2, t2) in parse[k, j].itervalues(): n1 = t1.node n2 = t2.node if n1.mark[0] == '>' and n2.mark == '|': m = n1.index h = n1.word # n2.index-m = distancia entre uno y otro p = self.p_nonstop_right(h, m==k-1) * \ self.p_attach_right(n2.word, h, n2.index-m) * \ p1 * p2 t = tree.Tree(n1, [t1, t2]) parse_dict.add(p, t) if n1.mark == '|' and n2.mark[0] == '<': m = n2.index h = n2.word # m-n1.index = distancia entre uno y otro p = self.p_nonstop_left(h, m==k) * \ self.p_attach_left(n1.word, h, m-n1.index) * \ p1 * p2 t = tree.Tree(n2, [t1, t2]) parse_dict.add(p, t) # aca se generan los stops parse[i, j] = ParseDict(sum((self.unary_parses(p, t, i, j) \ for (p, t) in parse_dict.itervalues()), [])) # solo falta elegir el head de la oracion: #t_max, p_max = None, 0.0 w = s[0] (p1, t1) = parse[0, n].val('|' + w + '0') t_max, p_max = t1, p1 * self.p_attach_left(w, self.end_symbol, n) # unbiased: l = [(t_max, p_max)] for i in range(1, n): w = s[i] (p1, t1) = parse[0, n].val('|' + w + str(i)) p = p1 * self.p_attach_left(w, self.end_symbol, n - i) # aca hay bias (> seria elegir el primer head en caso de empate): # al parecer este bias solo afecta al modelo si no esta entrenado. 
# bias a RBRANCH: #if p > p_max: # t_max, p_max = t1, p # bias a LBRANCH: #if p >= p_max: # t_max, p_max = t1, p # unbiased: if p > p_max: p_max = p l = [(t1, p)] elif p == p_max: l += [(t1, p)] (t_max, p_max) = random.choice(l) return (t_max, p_max)