def inner_node_match(node1, node2, match_final, f, t):
    """Record node1/node2 as a match when their subtrees share enough leaves.

    Nothing happens unless both nodes carry the same ``label``. Otherwise the
    leaves below each node are collected with ``node_get`` and the number of
    leaf pairs already recorded in ``match_final`` is counted. The pair is
    accepted when either

    * the leaf-overlap ratio is at least 0.8 (regardless of ``f``), or
    * the leaf-overlap ratio exceeds ``t`` and the bigram similarity of the
      two node values exceeds ``f``.

    On acceptance ``(node1.id, node2.id, sim_value)`` is appended to
    ``match_final`` and ``matched`` is set to 1 on both nodes.

    Args:
        node1: Root of the first subtree; read for ``label``, ``value`` and
            ``id``, and written to via ``matched``.
        node2: Root of the second subtree; same attributes as ``node1``.
        match_final: List of ``(id1, id2, similarity)`` tuples; read to find
            matched leaves and extended in place on success.
        f: Threshold the value similarity must exceed.
        t: Threshold the leaf-overlap ratio must exceed. Replaced by 0.4 when
            either subtree has four leaves or fewer.

    Returns:
        None. Results are reported through ``match_final`` and ``matched``.
    """
    if node1.label != node2.label:
        return

    # node_get returns three lists; only the third (the leaves) is used here.
    _, _, leaf_node_list1 = node_get(node1)
    _, _, leaf_node_list2 = node_get(node2)

    # Tally ids on both sides so matched pairs can be counted in one pass over
    # match_final instead of rescanning both leaf lists for every entry.
    # NOTE(review): assumes node ids are hashable (e.g. ints) - confirm.
    left_id_counts = {}
    for leaf in leaf_node_list1:
        if leaf.matched == 0:
            # This leaf has no partner, so it cannot contribute.
            continue
        left_id_counts[leaf.id] = left_id_counts.get(leaf.id, 0) + 1

    right_id_counts = {}
    for leaf in leaf_node_list2:
        right_id_counts[leaf.id] = right_id_counts.get(leaf.id, 0) + 1

    # Number of matched leaf pairs with one end under node1 and the other
    # under node2.
    common = 0
    for item in match_final:
        left_count = left_id_counts.get(item[0])
        if left_count:
            common += left_count * right_id_counts.get(item[1], 0)

    left_size = len(leaf_node_list1)
    right_size = len(leaf_node_list2)
    max_num = max(left_size, right_size)

    # The threshold t adapts to the subtree size: small subtrees use 0.4.
    if left_size <= 4 or right_size <= 4:
        t = 0.4

    # Two leafless subtrees share nothing; avoid dividing by zero.
    sim_inner = common / max_num if max_num else 0.0
    sim_value = string_similarity_ngram(node1.value, node2.value, 2)

    # The common-leaves measure carries more weight than the value similarity:
    # even if the value similarity is below f, a leaf overlap of at least 0.8
    # is enough to consider the two inner nodes similar.
    if sim_inner >= 0.8 or (sim_inner > t and sim_value > f):
        match_final.append((node1.id, node2.id, sim_value))
        node1.matched = 1
        node2.matched = 1
def inner_node_match(node1, node2, match_final, f, t):
    """Record node1/node2 as a match when their subtrees share enough leaves.

    Nothing happens unless both nodes carry the same ``label``. Otherwise the
    leaves below each node are collected with ``node_get`` and the number of
    leaf pairs already recorded in ``match_final`` is counted. The pair is
    accepted when either

    * the leaf-overlap ratio is at least 0.8 (regardless of ``f``), or
    * the leaf-overlap ratio exceeds ``t`` and the bigram similarity of the
      two node values exceeds ``f``.

    On acceptance ``(node1.id, node2.id, sim_value)`` is appended to
    ``match_final`` and ``matched`` is set to 1 on both nodes.

    Args:
        node1: Root of the first subtree; read for ``label``, ``value`` and
            ``id``, and written to via ``matched``.
        node2: Root of the second subtree; same attributes as ``node1``.
        match_final: List of ``(id1, id2, similarity)`` tuples; read to find
            matched leaves and extended in place on success.
        f: Threshold the value similarity must exceed.
        t: Threshold the leaf-overlap ratio must exceed. Replaced by 0.4 when
            either subtree has four leaves or fewer.

    Returns:
        None. Results are reported through ``match_final`` and ``matched``.
    """
    if node1.label != node2.label:
        return

    # node_get returns three lists; only the third (the leaves) is used here.
    _, _, leaf_node_list1 = node_get(node1)
    _, _, leaf_node_list2 = node_get(node2)

    # Tally ids on both sides so matched pairs can be counted in one pass over
    # match_final instead of rescanning both leaf lists for every entry.
    # NOTE(review): assumes node ids are hashable (e.g. ints) - confirm.
    left_id_counts = {}
    for leaf in leaf_node_list1:
        if leaf.matched == 0:
            # This leaf has no partner, so it cannot contribute.
            continue
        left_id_counts[leaf.id] = left_id_counts.get(leaf.id, 0) + 1

    right_id_counts = {}
    for leaf in leaf_node_list2:
        right_id_counts[leaf.id] = right_id_counts.get(leaf.id, 0) + 1

    # Number of matched leaf pairs with one end under node1 and the other
    # under node2.
    common = 0
    for item in match_final:
        left_count = left_id_counts.get(item[0])
        if left_count:
            common += left_count * right_id_counts.get(item[1], 0)

    left_size = len(leaf_node_list1)
    right_size = len(leaf_node_list2)
    max_num = max(left_size, right_size)

    # The threshold t adapts to the subtree size: small subtrees use 0.4.
    if left_size <= 4 or right_size <= 4:
        t = 0.4

    # Two leafless subtrees share nothing; avoid dividing by zero.
    sim_inner = common / max_num if max_num else 0.0
    sim_value = string_similarity_ngram(node1.value, node2.value, 2)

    # The common-leaves measure carries more weight than the value similarity:
    # even if the value similarity is below f, a leaf overlap of at least 0.8
    # is enough to consider the two inner nodes similar.
    if sim_inner >= 0.8 or (sim_inner > t and sim_value > f):
        match_final.append((node1.id, node2.id, sim_value))
        node1.matched = 1
        node2.matched = 1
def leaf_match(left_leaf_node_list, right_leaf_node_list, f):
    """Collect candidate matches between two collections of leaf nodes.

    Every left leaf is compared with every right leaf. A pair becomes a
    candidate when both leaves have the same ``label`` and the bigram
    similarity of their ``value`` attributes is strictly greater than ``f``.

    Args:
        left_leaf_node_list: Iterable of leaves exposing ``label``, ``value``
            and ``id``.
        right_leaf_node_list: Collection of leaves with the same attributes.
            It is traversed once per left leaf, so it must be re-iterable.
        f: Similarity threshold a pair must exceed.

    Returns:
        A list of ``(left_id, right_id, similarity)`` tuples, ordered by left
        leaf and then by right leaf. One leaf may appear in several tuples.
    """
    match_temp = []
    for left_leaf in left_leaf_node_list:
        for right_leaf in right_leaf_node_list:
            if left_leaf.label != right_leaf.label:
                continue
            sim = string_similarity_ngram(left_leaf.value, right_leaf.value, 2)
            if sim > f:
                match_temp.append((left_leaf.id, right_leaf.id, sim))
    return match_temp
def leaf_match(left_leaf_node_list, right_leaf_node_list, f):
    """Collect candidate matches between two collections of leaf nodes.

    Every left leaf is compared with every right leaf. A pair becomes a
    candidate when both leaves have the same ``label`` and the bigram
    similarity of their ``value`` attributes is strictly greater than ``f``.

    Args:
        left_leaf_node_list: Iterable of leaves exposing ``label``, ``value``
            and ``id``.
        right_leaf_node_list: Collection of leaves with the same attributes.
            It is traversed once per left leaf, so it must be re-iterable.
        f: Similarity threshold a pair must exceed.

    Returns:
        A list of ``(left_id, right_id, similarity)`` tuples, ordered by left
        leaf and then by right leaf. One leaf may appear in several tuples.
    """
    match_temp = []
    for left_leaf in left_leaf_node_list:
        for right_leaf in right_leaf_node_list:
            if left_leaf.label != right_leaf.label:
                continue
            sim = string_similarity_ngram(left_leaf.value, right_leaf.value, 2)
            if sim > f:
                match_temp.append((left_leaf.id, right_leaf.id, sim))
    return match_temp