def build_tree(sentences):
    """Build a single zss tree covering every sentence in ``sentences``.

    The text is split with ``nltk.tokenize.sent_tokenize`` and each sentence
    is parsed with ``nlp.parse``. The whitespace-separated tokens of the
    parse are read as a bracketed tree: a token starting with ``(`` opens a
    node labelled with the rest of the token, any other token is a leaf
    followed by one or more ``)`` that close the enclosing nodes.

    :param sentences: text containing one or more sentences.
    :return: a ``Node`` labelled ``'R'`` whose children are the roots of the
        individual sentence trees, in sentence order.
    """
    roots = []
    for sentence in nltk.tokenize.sent_tokenize(sentences):
        stack = []
        root = None
        for token in nlp.parse(sentence).split():
            if token[0] == '(':
                node = Node(token[1:])
                if stack:
                    stack[-1].addkid(node)
                stack.append(node)
                if root is None:
                    root = node
                continue

            close_at = token.find(')')
            if close_at == -1:
                # No closing bracket on this token: keep the whole token as
                # the leaf label and close nothing (find() == -1 would
                # otherwise chop the last character and over-pop the stack).
                leaf_label, closing = token, 0
            else:
                leaf_label, closing = token[:close_at], len(token) - close_at

            if stack:
                stack[-1].addkid(Node(leaf_label))
            for _ in range(closing):
                if stack:
                    stack.pop()

        # An empty parse produces no root; do not attach None to the result.
        if root is not None:
            roots.append(root)

    combined = Node('R')
    for sentence_root in roots:
        combined.addkid(sentence_root)
    return combined
def zss_similarity(node1, node2):
    """Return ``simple_distance`` between two trees described by mappings.

    Each argument is indexed with ``'name'`` and ``'children'``; those two
    values are handed to ``Node`` as its first and second arguments.
    Note that the value returned is a distance, despite the function name.
    """
    first_tree = Node(node1['name'], node1['children'])
    second_tree = Node(node2['name'], node2['children'])
    return simple_distance(first_tree, second_tree)
def convertTreeToEditDistanceFormat(self):
    """Turn ``self.edgeList`` into linked ``Node`` objects.

    Every edge is read as ``(weight, parent, child)``; the weight is
    ignored and duplicate ``(parent, child)`` pairs are attached only once.
    Edges are processed in a deterministic order (sorted by parent, ties
    kept in ``edgeList`` order) because the order in which children are
    attached affects the edit distance computed from the resulting trees.

    :return: dict mapping each parent/child value to its ``Node``; each
        node is labelled with ``str`` of that value.
    """
    # sorted() is stable, so edges sharing a parent keep their edgeList order.
    ordered_edges = sorted(self.edgeList, key=lambda edge: edge[1])

    nodeObjects = {}
    seen_pairs = set()
    for edge in ordered_edges:
        parent, child = edge[1], edge[2]
        # De-duplicate without going through set iteration, which would
        # throw away the ordering established above.
        if (parent, child) in seen_pairs:
            continue
        seen_pairs.add((parent, child))

        for key in (parent, child):
            if key not in nodeObjects:
                nodeObjects[key] = Node(str(key), [])
        # Only the parent object is ever appended to.
        nodeObjects[parent].addkid(nodeObjects[child])
    return nodeObjects
def list_to_zsstree(input_list, label='0'):
    """Convert a nested two-element list structure into a zss tree.

    A non-list value becomes a childless ``Node(label)``. A list becomes
    ``Node(label)`` with two children built from ``input_list[0]`` and
    ``input_list[1]``; the recursive calls use the default label.
    """
    # Leaf node
    if not isinstance(input_list, list):
        return Node(label)

    # Internal node
    left = list_to_zsstree(input_list[0])
    right = list_to_zsstree(input_list[1])
    return Node(label).addkid(left).addkid(right)
def convert_parse_tree_to_zss_tree(self, tree_as_string, ignore_leaves=False):
    '''
    Build a zss tree, rooted at a node labelled 'ROOT', from a bracketed
    parse string.

    With ignore_leaves set, nodes matching the "TAG word" pattern keep only
    the tag, so the tree represents sentence structure without the words.
    '''
    pieces = re.split(r'([\(\)])', tree_as_string)
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    # Drop the first two tokens and the last one
    # (presumably the outer "(", "ROOT" and the final ")" - confirm with caller).
    tokens = tokens[2:-1]

    root_node = Node('ROOT')
    stack = [root_node]

    for position, token in enumerate(tokens):
        if token == ')':
            # The current node is complete
            stack.pop()
            continue
        if token != '(':
            continue

        content = tokens[position + 1]
        match = re.search(r'[A-Z]+[ ][A-Za-z]+', content)
        if match is None:
            # Not "TAG word": the node will receive children of its own
            node = Node(content)
        else:
            tag, word = match.group().split(' ')
            if ignore_leaves:
                node = Node(tag)
            else:
                node = Node(tag).addkid(Node(word))

        # Attach below the node currently being built, then descend into it
        stack[-1].addkid(node)
        stack.append(node)

    return root_node
def tree_generate(node_name, index, pos, dependency_list, visited_list, reverse=False):
    """Generate a tree rooted at the given node from a dependency list.

    Children of ``index`` are the ``d[1]`` entries of every dependency ``d``
    whose ``d[0][0]`` equals ``index``. A child triple already present in
    ``visited_list`` is attached as a childless node (and this node's value
    is set to 1); any other child is expanded recursively and then recorded
    in ``visited_list``. Afterwards the children's values are added to this
    node's value and the children are sorted by label.

    ``visited_list`` is mutated in place.

    :type index: int
    :type pos: str
    :type visited_list: list
    :type node_name: str
    :type dependency_list: list
    :param reverse: sort children by label in descending order; applied at
        every level of the tree.
    :return: the ``Node`` created for ``node_name``.
    """
    node = Node(node_name, pos, index=index)
    visited_list.append(index)

    kids_index_name_pos = [(d[1][0], d[1][1], d[1][2])
                           for d in dependency_list if d[0][0] == index]
    for kid in kids_index_name_pos:
        kid_index, kid_name, kid_pos = kid
        if kid in visited_list:
            node.value = 1
            node.addkid(Node(label=kid_name, index=kid_index, pos=kid_pos))
        else:
            # Forward ``reverse`` so subtrees use the same ordering as the root.
            node.addkid(tree_generate(kid_name, kid_index, kid_pos,
                                      dependency_list, visited_list, reverse))
            visited_list.append(kid)

    children = Node.get_children(node)
    if children:
        node.value += sum(child.value for child in children)

    node.children.sort(key=lambda child: child.label, reverse=reverse)
    return node
def test_de():
    """Check the operations ``simple_distance(D, E)`` reports: two removals and a match."""
    _, ops = simple_distance(D, E, return_operations=True)
    assert ops == [
        Operation(Operation.remove, Node("b"), None),
        Operation(Operation.remove, Node("c"), None),
        Operation(Operation.match, Node("a"), Node("a")),
    ]
def loop_tree(dictionary, node):
    """Mirror a nested dictionary below ``node``.

    ``node`` and a separator line are printed on every call. For each item,
    a dict value becomes a child labelled with the key and is filled
    recursively; any other value becomes a childless node labelled with the
    value itself.

    :param dictionary: mapping to walk.
    :param node: tree node that receives the children; modified in place.
    """
    print(node)
    print("----")
    for key, value in dictionary.items():
        if isinstance(value, dict):
            # zss ``addkid`` returns the parent, so recursing on its result
            # would attach the nested entries to ``node`` itself; keep a
            # handle on the new child and descend into that instead.
            child = Node(key)
            node.addkid(child)
            loop_tree(value, child)
        else:
            node.addkid(Node(value))
def helper(obj):
    """Convert a nested list into a tree of ``Node`` objects.

    A list is read as ``[label, kid, kid, ...]``; anything else becomes a
    childless node labelled with the value itself.
    """
    if not isinstance(obj, list):
        return Node(obj)

    label, kids = obj[0], obj[1:]
    node = Node(label)
    for kid in kids:
        node.addkid(helper(kid))
    return node
def dgl_tree_to_zzs_tree(tree, vocab_key_list, u):
    """Recursively convert the part of ``tree`` feeding into node ``u``.

    A node with in-degree 0 becomes a leaf labelled
    ``vocab_key_list[tree.ndata['x'][u]]``. Any other node becomes a
    ``"PAD"`` node with one child per entry of ``tree.in_edges(u)[0]``
    (presumably the source node ids of the incoming edges).
    """
    if tree.in_degrees(u) == 0:
        return Node(vocab_key_list[tree.ndata['x'][u]])

    sources = tree.in_edges(u)[0]
    padded = Node("PAD")
    for source in sources:
        padded.addkid(dgl_tree_to_zzs_tree(tree, vocab_key_list, int(source)))
    return padded
def build_tree_no_date(self, event):
    """Build a flat tree for ``event`` without its date fields.

    The root is labelled with the lower-cased event type; the lower-cased
    country, state, city and name are added to it as children, in that order.
    """
    tree = Node(event.event_type.lower())
    for field in ('country', 'state', 'city', 'name'):
        tree = tree.addkid(Node(getattr(event, field).lower()))
    return tree
def totree(self, e):
    """Convert expression ``e`` into a tree of ``Node`` objects.

    Labels are tuples: ``('V', str(e))`` for ``Var``, ``('C', str(e))`` for
    ``Const`` and ``('O', e.name)`` for ``Op``, whose arguments become the
    children. Any other input yields ``None``.
    """
    for kind, tag in ((Var, 'V'), (Const, 'C')):
        if isinstance(e, kind):
            return Node((tag, str(e)))

    if not isinstance(e, Op):
        return None

    op_node = Node(('O', e.name))
    for arg in e.args:
        op_node.addkid(self.totree(arg))
    return op_node
def get_ztree(cn, ztp=None):
    """Copy the labelled structure of a ``Tree`` into zss nodes.

    :param cn: a ``Tree``, or a string that ``Tree.fromstring`` can read.
    :param ztp: node that receives the children; when omitted a new node
        labelled ``cn.label()`` is created.
    :return: ``ztp`` (or the node created for it). Only children that are
        themselves ``Tree`` instances are copied; other children are skipped.
    """
    source = Tree.fromstring(cn) if isinstance(cn, str) else cn
    target = Node(source.label()) if ztp is None else ztp

    for child in source:
        if not isinstance(child, Tree):
            continue
        child_node = Node(child.label())
        target.addkid(child_node)
        get_ztree(child, child_node)
    return target
def parsed_tree_to_zzs_tree(u):
    """Convert a parsed tree node into a zss tree.

    A node with no children becomes a leaf labelled ``u.value``. A node
    with exactly one child is skipped in favour of that child. A node with
    two children becomes a ``"PAD"`` node holding both converted children;
    more than two children trips the assertion.
    """
    child_count = len(u.child)
    if child_count == 0:
        return Node(u.value)
    if child_count == 1:
        return parsed_tree_to_zzs_tree(u.child[0])

    assert child_count == 2
    padded = Node("PAD")
    padded.addkid(parsed_tree_to_zzs_tree(u.child[0]))
    padded.addkid(parsed_tree_to_zzs_tree(u.child[1]))
    return padded
def main(node, child):
    """Attach a tree representation of ``child`` below ``node``.

    * an empty list adds one child labelled ``''``;
    * a value that is neither list nor dict adds one child labelled with it;
    * a dict adds one child per key, each filled recursively from its value;
    * a non-empty list adds one child per element, labelled with the
      element's position and filled recursively.

    :return: for the container cases, ``node`` itself, so that the recursive
        ``node.addkid(main(...))`` calls attach a real node rather than
        ``None``; for the two leaf cases, the result of ``node.addkid``.
    """
    if isinstance(child, list) and len(child) == 0:
        return node.addkid(Node(''))
    if not isinstance(child, (list, dict)):
        return node.addkid(Node(child))

    if isinstance(child, dict):
        for key, value in child.items():
            node.addkid(main(Node(key), value))
    else:
        for position, item in enumerate(child):
            node.addkid(main(Node(position), item))
    # Without this return the container branches yielded None, which the
    # caller then attached as a child.
    return node
def transform(parent):
    """Build the tree rooted at ``parent`` using the ``clause`` mapping.

    The node is labelled ``parent[0]``. When ``parent`` is a key of
    ``clause``, its entries are transformed recursively and passed to
    ``Node`` as ``children``; otherwise the node has no children.
    """
    if parent not in clause:
        return Node(parent[0])

    subtrees = [transform(child) for child in clause[parent]]
    return Node(parent[0], children=subtrees)
def zss_code_distance(code_a, code_b):
    """Return ``simple_distance`` between the ASTs of two source strings.

    Each string is parsed with ``ast.parse`` and handed to ``zss_ast_visit``
    together with a fresh ``Node("root")``; the two resulting roots are
    compared using ``label_weight`` as the label distance.
    """
    zss_roots = []
    for code in (code_a, code_b):
        zss_root = Node("root")
        zss_ast_visit(ast.parse(code), zss_root)
        zss_roots.append(zss_root)

    return simple_distance(zss_roots[0], zss_roots[1], label_dist=label_weight)
def make_html_zssgraph(parent, graph=None, ignore_comments=True):
    '''
    Given a parsed HTML element, return a zss style tree of the DOM below it.

    parent: element exposing ``.tag`` and iteration over its children.
    graph: node that receives the children of ``parent``; when omitted a
        new node labelled ``parent.tag`` is created.
    ignore_comments: skip children whose tag is not a string (comments and
        similar), at every depth.

    Returns ``graph`` (or the node created for it). Every element becomes a
    child of the node built for its own parent element.
    '''
    try:
        text_types = basestring  # Python 2
    except NameError:
        text_types = str

    if graph is None:
        graph = Node(parent.tag)

    for node in parent:
        # if the element is a comment, ignore it
        if ignore_comments and not isinstance(node.tag, text_types):
            continue
        child = Node(node.tag)
        graph.addkid(child)
        # Descend into the new child (not ``graph``) so the nesting of the
        # DOM is kept, and carry the comment setting down with it.
        make_html_zssgraph(node, child, ignore_comments)
    return graph
def mktree(node, child, count=0):
    """Walk ``child`` and attach nodes for its dictionary entries to ``node``.

    ``count`` is printed on every call. For a list, every element is
    visited with ``count`` incremented once per element. For a dict, a
    nested dict adds a node labelled with its key and is then walked with
    the same ``node``; any other value adds a node labelled with the value.
    Other inputs add nothing.

    :param node: tree node that receives the new children.
    :param child: list, dict or scalar to walk.
    :param count: running counter used only for the printed trace.
    :return: ``None``.
    """
    print(count)
    if isinstance(child, list):
        for item in child:
            count += 1
            # No ``return`` here: returning would stop after the first item.
            mktree(node, item, count)
    elif isinstance(child, dict):
        for key, value in child.items():
            if isinstance(value, dict):
                node.addkid(Node(key))
                # Likewise, keep going after the first nested dict.
                mktree(node, value, count)
            else:
                node.addkid(Node(value))
def zss_code_ast_edit(code_a, code_b):
    """Return ``(cost, operations)`` for editing one source AST into another.

    Each string is parsed with ``ast.parse`` and handed to ``zss_ast_visit``
    together with a fresh ``Node("root")``; ``simple_distance`` is then
    called with ``label_weight`` and ``return_operations=True``.
    """
    def as_zss_tree(code):
        parsed = ast.parse(code)
        zss_root = Node("root")
        zss_ast_visit(parsed, zss_root)
        return zss_root

    tree_a = as_zss_tree(code_a)
    tree_b = as_zss_tree(code_b)
    distance, operations = simple_distance(
        tree_a, tree_b, label_dist=label_weight, return_operations=True)
    return distance, operations
def tree_edit_distance(s1, s2):
    """Return ``simple_distance`` between two comma-separated strings.

    Each string is split on ``','`` and turned into a two-level tree: a
    root labelled ``""`` with one child per item, in order.
    """
    def flat_tree(items):
        top = Node("")
        for item in items:
            top.addkid(Node(item))
        return top

    items_a, items_b = s1.split(','), s2.split(',')
    return simple_distance(flat_tree(items_a), flat_tree(items_b))
def to_zzzNode(E, root=0):
    """Build a zss tree from ``E``, breadth first, starting at ``root``.

    Children of a vertex are obtained with ``getChildren(E, vertex)`` and
    every node is labelled with ``str`` of its vertex.

    :param E: structure understood by ``getChildren``.
    :param root: vertex to start from; the traversal begins here (it used
        to begin at vertex 0 regardless of this argument).
    :return: the zss ``Node`` created for ``root``.
    """
    from collections import deque

    from zss import Node

    top = Node(str(root))
    pending = deque([(root, top)])
    while pending:
        vertex, node = pending.popleft()
        for kid in getChildren(E, vertex):
            kid_node = Node(str(kid))
            node.addkid(kid_node)
            pending.append((kid, kid_node))
    return top
def syntax_similarity_conversation(self, documents1):
    """Syntax similarity of each document with the document that follows it.

    Every document is split into sentences with ``self.sent_detector`` and
    parsed with ``self.parser``. For each pair of consecutive documents,
    every sentence tree of the first is compared with every sentence tree
    of the second using ``simple_distance``; each distance is divided by
    the sum of the two ``numnodes`` readings and the similarity of the pair
    is ``1 - mean`` of those normalised distances.

    :param documents1: sequence of document strings.
    :return: numpy array with ``len(documents1) - 1`` entries; an entry is
        NaN when either document of the pair has a sentence of more than
        70 whitespace-separated tokens.
    """
    # Module-level counter; presumably updated by self.convert_mytree while
    # it builds a tree - TODO confirm against convert_mytree.
    global numnodes
    documents1parsed = []

    # Detect sentences and parse them
    for d1 in tqdm(range(len(documents1))):
        tempsents = (self.sent_detector.tokenize(documents1[d1].strip()))
        for s in tempsents:
            # A single over-long sentence marks the whole document as "NA"
            if len(s.split()) > 70:
                documents1parsed.append("NA")
                break
        else:
            # for-else: reached only when no sentence triggered the break
            temp = list(self.parser.raw_parse_sents((tempsents)))
            for i in range(len(temp)):
                # keep the first parse of each sentence
                temp[i] = list(temp[i])[0]
                temp[i] = ParentedTree.convert(temp[i])
            documents1parsed.append(list(temp))

    results = []
    for d1 in range(len(documents1parsed) - 1):
        d2 = d1 + 1
        if documents1parsed[d1] == "NA" or documents1parsed[d2] == "NA":
            results.append(float('NaN'))
            continue

        costMatrix = []
        for i in range(len(documents1parsed[d1])):
            # Reset the counter before converting, read it right after
            numnodes = 0
            tempnode = Node(documents1parsed[d1][i].root().label())
            sentencedoc1 = self.convert_mytree(documents1parsed[d1][i], tempnode)
            temp_costMatrix = []
            sen1nodes = numnodes
            for j in range(len(documents1parsed[d2])):
                # Float zero here, unlike the integer reset above
                numnodes = .0
                tempnode = Node(documents1parsed[d2][j].root().label())
                sentencedoc2 = self.convert_mytree(documents1parsed[d2][j], tempnode)
                ED = simple_distance(sentencedoc1, sentencedoc2)
                # NOTE(review): divides by zero if both readings are 0
                ED /= (numnodes + sen1nodes)
                temp_costMatrix.append(ED)
            costMatrix.append(temp_costMatrix)
        costMatrix = np.array(costMatrix)
        results.append(1 - np.mean(costMatrix))
    return np.array(results)
def randtree(depth=2, alpha='abcdefghijklmnopqrstuvwxyz', repeat=2, width=2):
    """Build a random tree with unique labels.

    Labels are every string of length ``repeat`` over ``alpha``, shuffled
    and consumed in order. The root is labelled ``"root"``; each of the
    ``depth - 1`` levels below it gives every node of the previous level
    between 1 and ``1 + width`` children.

    :param depth: number of levels including the root.
    :param alpha: characters used to build the labels.
    :param repeat: length of every label.
    :param width: upper bound on children per node is ``1 + width``.
    :return: the root ``Node``.
    :raises StopIteration: when the tree needs more labels than
        ``len(alpha) ** repeat``.
    """
    labels = [''.join(chars) for chars in itertools.product(alpha, repeat=repeat)]
    shuffle(labels)
    label_iter = iter(labels)

    root = Node("root")
    parents = [root]
    # range() and next() work on both Python 2 and Python 3,
    # unlike xrange() and the generator's .next() method.
    for _ in range(depth - 1):
        children = []
        for parent in parents:
            for _ in range(randint(1, 1 + width)):
                kid = Node(next(label_iter))
                parent.addkid(kid)
                children.append(kid)
        parents = children
    return root
def parseTreeFromStrings(tree, debug=False):
    """
    Create the tree from a list of strings

    Each line is split at the first ": "; the part before it is the header.
    A header without "|-" is a level-0 node. Otherwise the header is split
    on "- " into an indentation prefix and a tag, and the level is
    ``1 + len(prefix) // 2``. All integers found in the line are collected
    and the node label becomes ``tag[x=..,y=..]``, using the third minus
    the first integer and the fourth minus the second.

    @param tree: {List} string format of the tree, one element per string
    @param debug: {Boolean} accepted but not used by this function
    @return: the node built from the last level-0 line, or None when
             ``tree`` is None or holds no level-0 line
    """
    if tree is None:
        return None

    root = None
    rightMosts = []  # rightMosts[k]: most recent node seen at level k
    for line in tree:
        header = line.split(": ")[0]
        if "|-" in header:
            prefix, tag = header.split("- ")
            level = 1 + len(prefix) // 2
        else:
            level, tag = 0, header
        tag = tag.strip()

        numbers = [int(piece) for piece in re.split("\\D+", line) if piece != ""]
        cur = Node("%s[x=%04d,y=%04d]" % (tag, numbers[2]-numbers[0], numbers[3]-numbers[1]))

        depth = len(rightMosts)
        if level == depth:
            rightMosts.append(cur)
        else:
            assert level < depth
            rightMosts[level] = cur

        if level == 0:
            root = cur
        else:
            rightMosts[level-1].addkid(cur)
    return root
def build_tree(self, event):
    """Build a flat tree for ``event`` including its date fields.

    The root is labelled with the lower-cased event type; the lower-cased
    country, state, city, name, day, month and year are added to it as
    children, in that order.
    """
    tree = Node(event.event_type.lower())
    for field in ('country', 'state', 'city', 'name', 'day', 'month', 'year'):
        tree = tree.addkid(Node(getattr(event, field).lower()))
    return tree
def convert_body(body, parent_node=None, root_node=None):
    """Convert a nested dict description into a tree of ``Node`` objects.

    ``body`` is first passed through ``seperate_dict``. When the result is
    a dict, every key other than ``'_PyType'`` adds a node whose label
    starts with the key:

    * dict value with ``'_PyType'`` and ``'attr'``: label is
      ``key + ' ' + _PyType + ' ' + attr``; the value is converted
      recursively below the new node;
    * dict value with ``'_PyType'`` only: label is ``key + ' ' + _PyType``,
      plus ``' ' + value['id']`` when the key is ``'func'``; the value is
      converted recursively below the new node;
    * dict value without ``'_PyType'``: label is ``key + ' udv'`` when the
      JSON dump of the value contains ``'udv'``, otherwise the key followed
      by the JSON dump;
    * list or str value: label is the key followed by the value, or by an
      empty string when the value is empty.

    :param body: structure to convert.
    :param parent_node: node to attach to; when it compares equal to None a
        new node labelled ``body['_PyType']`` is created and also becomes
        the root.
    :param root_node: root to return when ``parent_node`` is supplied.
    :return: ``root_node`` (``None`` if ``body`` is not a dict and no root
        was supplied).

    NOTE(review): the indentation of this function was reconstructed;
    confirm the nesting against the original file.
    """
    body = seperate_dict(body)
    if isinstance(body, dict):
        if parent_node == None:
            parent_node = Node(body['_PyType'])
            new_parent = parent_node
            root_node = parent_node
        for j in body:
            if j != '_PyType':
                # a dict value may hold further children, so recursion is needed
                if isinstance(body[j], dict):
                    if '_PyType' in body[j].keys():
                        if 'attr' in body[j].keys():
                            node_content = j + ' ' + body[j][
                                '_PyType'] + ' ' + body[j]['attr']
                            new_parent = Node(node_content)
                            parent_node.addkid(new_parent)
                            # the returned root is stored but not read again here
                            new_parent = convert_body(body[j],
                                                      parent_node=new_parent,
                                                      root_node=root_node)
                        else:
                            call_call_func_name = ''
                            # only the 'func' key contributes the callee id
                            if j == 'func':
                                call_call_func_name = ' ' + body[j]['id']
                            node_content = j + ' ' + body[j][
                                '_PyType'] + call_call_func_name
                            new_parent = Node(node_content)
                            parent_node.addkid(new_parent)
                            new_parent = convert_body(body[j],
                                                      parent_node=new_parent,
                                                      root_node=root_node)
                    else:
                        # case when it's a dict but not with PyType
                        if 'udv' in json.dumps(body[j]):
                            node_content = j + ' ' + 'udv'
                        else:
                            node_content = j + ' ' + json.dumps(body[j])
                        # parent_node is rebound to whatever addkid returns
                        parent_node = parent_node.addkid(Node(node_content))
                elif isinstance(body[j], list) or isinstance(body[j], str):
                    if body[j]:
                        # NOTE(review): str + non-empty list raises TypeError
                        node_content = j + ' ' + body[j]
                    else:
                        node_content = j + ' ' + ''
                    parent_node = parent_node.addkid(Node(node_content))
    return root_node
def helper(node: Node, index: int):
    """Attach the dependents of entry ``index`` of ``dep`` below ``node``.

    The dependents are all values in the lists stored under
    ``dep[str(index)]['deps']``. Each one becomes a node labelled with its
    ``'word'``, is filled recursively, and is then added to ``node``.
    """
    dependents = [dependent
                  for group in dep[str(index)]['deps'].values()
                  for dependent in group]
    for dependent in dependents:
        child = Node(dep[str(dependent)]['word'])
        helper(child, dependent)
        node.addkid(child)
def buildTree(state, node, chart):
    """Expand ``node`` with the states referenced by ``state[4]``.

    Each entry ``ptr`` of ``state[4]`` selects ``chart[ptr[0]][ptr[1]]``; a
    child labelled with that state's first element is added to ``node`` and
    expanded in turn. Nothing happens when ``state[4]`` is the empty tuple.
    """
    back_pointers = state[4]
    if back_pointers == ():
        return

    for ptr in back_pointers:
        target = chart[ptr[0]][ptr[1]]
        child = Node(target[0])
        node.addkid(child)
        buildTree(target, child, chart)
def getModel(edges):
    """Build a tree of ``Node`` objects from ``(parent, child)`` edges.

    Every distinct vertex gets one node, labelled with the position at
    which the vertex first appears while scanning the edges (parent before
    child). Each edge then attaches the child's node to the parent's node.

    :param edges: sequence of edges; ``edge[0]`` is the parent and
        ``edge[1]`` the child. It is traversed twice.
    :return: ``Node(0)`` when there are no edges; otherwise the node of
        vertex ``0`` if such a vertex exists, else the node of the first
        vertex seen (previously a ``KeyError``).
    """
    data = {}
    first_vertex = None
    for edge in edges:
        for vertex in (edge[0], edge[1]):
            if vertex not in data:
                if not data:
                    first_vertex = vertex
                # len(data) is the index of first appearance
                data[vertex] = Node(len(data))

    for edge in edges:
        data[edge[0]].addkid(data[edge[1]])

    if not data:
        return Node(0)
    return data[0] if 0 in data else data[first_vertex]