def _count_node_ids(nodes):
    """Return a dict mapping each ``node.id`` in *nodes* to its occurrence count."""
    counts = {}
    for node in nodes:
        counts[node.id] = counts.get(node.id, 0) + 1
    return counts


def inner_node_match(node1, node2, match_final, f, t):
    """Try to match two inner nodes and record the match in place.

    Nodes with different labels are never matched. Otherwise two scores are
    computed:

    * ``sim_inner``: the number of already-matched leaf pairs shared by the
      two subtrees, divided by the larger of the two leaf counts;
    * ``sim_value``: ``string_similarity_ngram(node1.value, node2.value, 2)``.

    The nodes are matched when ``sim_inner >= 0.8`` (subtree overlap alone is
    enough, even if the values differ), or when ``sim_inner > t`` and
    ``sim_value > f``.

    Args:
        node1: Inner node of the left tree (reads ``label``, ``value``, ``id``).
        node2: Inner node of the right tree (same attributes).
        match_final: List of ``(left_id, right_id, similarity)`` tuples. It is
            read to find leaf matches and appended to when the nodes match.
        f: Threshold for the value similarity.
        t: Threshold for the subtree similarity. It is replaced by ``0.4``
            whenever either subtree has at most 4 leaves.

    Returns:
        None. On a match, ``(node1.id, node2.id, sim_value)`` is appended to
        ``match_final`` and ``matched`` is set to 1 on both nodes.
    """
    if node1.label != node2.label:
        return

    # Only the third element of node_get's result (used here as the subtree's
    # leaf nodes) is needed.
    _, _, leaves1 = node_get(node1)
    _, _, leaves2 = node_get(node2)

    # Count the matched leaf pairs (a, b) with a under node1 and b under node2.
    # Leaves of node1 whose matched flag is 0 have no partner and are ignored.
    # Multiplying the per-id counts gives the same total as scanning
    # leaves1 x match_final x leaves2, in linear time.
    left_matched_ids = _count_node_ids(
        leaf for leaf in leaves1 if leaf.matched != 0)
    right_ids = _count_node_ids(leaves2)
    common = sum(
        left_matched_ids.get(pair[0], 0) * right_ids.get(pair[1], 0)
        for pair in match_final
    )

    max_num = max(len(leaves1), len(leaves2))

    # The subtree threshold adapts to subtree size: small subtrees use 0.4.
    if len(leaves1) <= 4 or len(leaves2) <= 4:
        t = 0.4

    # Two subtrees without any leaves share nothing; avoid dividing by zero.
    sim_inner = common / max_num if max_num else 0.0
    sim_value = string_similarity_ngram(node1.value, node2.value, 2)

    # Common leaves carry more weight than the node values: a very high
    # subtree overlap is accepted even when sim_value is below f.
    if sim_inner >= 0.8 or (sim_inner > t and sim_value > f):
        match_final.append((node1.id, node2.id, sim_value))
        node1.matched = 1
        node2.matched = 1
def _count_node_ids(nodes):
    """Return a dict mapping each ``node.id`` in *nodes* to its occurrence count."""
    counts = {}
    for node in nodes:
        counts[node.id] = counts.get(node.id, 0) + 1
    return counts


def inner_node_match(node1, node2, match_final, f, t):
    """Try to match two inner nodes and record the match in place.

    Nodes with different labels are never matched. Otherwise two scores are
    computed:

    * ``sim_inner``: the number of already-matched leaf pairs shared by the
      two subtrees, divided by the larger of the two leaf counts;
    * ``sim_value``: ``string_similarity_ngram(node1.value, node2.value, 2)``.

    The nodes are matched when ``sim_inner >= 0.8`` (subtree overlap alone is
    enough, even if the values differ), or when ``sim_inner > t`` and
    ``sim_value > f``.

    Args:
        node1: Inner node of the left tree (reads ``label``, ``value``, ``id``).
        node2: Inner node of the right tree (same attributes).
        match_final: List of ``(left_id, right_id, similarity)`` tuples. It is
            read to find leaf matches and appended to when the nodes match.
        f: Threshold for the value similarity.
        t: Threshold for the subtree similarity. It is replaced by ``0.4``
            whenever either subtree has at most 4 leaves.

    Returns:
        None. On a match, ``(node1.id, node2.id, sim_value)`` is appended to
        ``match_final`` and ``matched`` is set to 1 on both nodes.
    """
    if node1.label != node2.label:
        return

    # Only the third element of node_get's result (used here as the subtree's
    # leaf nodes) is needed.
    _, _, leaves1 = node_get(node1)
    _, _, leaves2 = node_get(node2)

    # Count the matched leaf pairs (a, b) with a under node1 and b under node2.
    # Leaves of node1 whose matched flag is 0 have no partner and are ignored.
    # Multiplying the per-id counts gives the same total as scanning
    # leaves1 x match_final x leaves2, in linear time.
    left_matched_ids = _count_node_ids(
        leaf for leaf in leaves1 if leaf.matched != 0)
    right_ids = _count_node_ids(leaves2)
    common = sum(
        left_matched_ids.get(pair[0], 0) * right_ids.get(pair[1], 0)
        for pair in match_final
    )

    max_num = max(len(leaves1), len(leaves2))

    # The subtree threshold adapts to subtree size: small subtrees use 0.4.
    if len(leaves1) <= 4 or len(leaves2) <= 4:
        t = 0.4

    # Two subtrees without any leaves share nothing; avoid dividing by zero.
    sim_inner = common / max_num if max_num else 0.0
    sim_value = string_similarity_ngram(node1.value, node2.value, 2)

    # Common leaves carry more weight than the node values: a very high
    # subtree overlap is accepted even when sim_value is below f.
    if sim_inner >= 0.8 or (sim_inner > t and sim_value > f):
        match_final.append((node1.id, node2.id, sim_value))
        node1.matched = 1
        node2.matched = 1
def editscript_calculate(left_node_list, match_final, left_id_to_node,
                         right_id_to_node, right_node_list):
    """Compute the edit operations that transform the left tree (T1) into the right tree (T2).

    Every node ``x`` of T2 (except the head, id 0) is visited in the order of
    ``right_node_list``. With ``y`` the parent id of ``x``, ``w`` the T1 id
    matched to ``x`` and ``z`` the T1 id matched to ``y``:

    * INS: ``x`` has no match, so a copy of ``x`` is inserted under ``z`` in
      T1, given a fresh id, and registered in the mappings. All nodes returned
      by ``node_get(x_node)`` are flagged ``inserted = 1`` so they are skipped.
    * UPD: ``x`` is matched to ``w`` but their values differ.
    * MOV: ``x`` is matched to ``w`` but the parent of ``w`` is not matched
      to ``y``.

    Afterwards every T1 node without a match is reported as DEL, and all nodes
    returned by ``node_get`` for it are flagged ``deleted = 1``.

    Args:
        left_node_list: Nodes of T1 (the original comment states breadth-first
            order). Its length is the first id given to inserted nodes.
        match_final: Iterable of ``(left_id, right_id, ...)`` match tuples.
        left_id_to_node: Dict from T1 id to node; inserted nodes are added.
        right_id_to_node: Dict from T2 id to node.
        right_node_list: Nodes of T2.
            NOTE(review): a parent is assumed to be visited before its
            children, otherwise ``z_node`` can be None in the INS branch.

    Returns:
        A tuple ``(edit_script, change_information, change_information2)``:
        the textual operations, ``(kind, node, parent_node)`` tuples for
        INS/UPD/DEL, and ``('MOV', node, old_parent, new_parent)`` tuples.
    """
    # Build both directions of the matching. As with dict(zip(...)), a later
    # pair overrides an earlier one that shares the same id.
    map_left_to_right = {pair[0]: pair[1] for pair in match_final}
    map_right_to_left = {pair[1]: pair[0] for pair in match_final}

    # Existing T1 ids stay untouched; inserted nodes get ids from here on.
    next_id = len(left_node_list)
    edit_script = []
    change_information = []   # INS / UPD / DEL records
    change_information2 = []  # MOV records (need the old and new parent)

    for right_node in right_node_list:
        x = right_node.id  # x, y, w and z are all ids
        y = right_node.parent
        if x == 0:  # skip the head pointer
            continue
        if right_node.inserted != 0:  # already covered by an earlier INS
            continue

        w = map_right_to_left.get(x)
        z = map_right_to_left.get(y)
        x_node = right_id_to_node.get(x)
        z_node = left_id_to_node.get(z)

        if w is None:
            # x is new: apply INS(x, z) to T1.
            new_node = Node(x_node.label, x_node.value)
            z_node.insertchild(new_node)
            new_node.id = next_id
            map_right_to_left[x] = next_id
            map_left_to_right[next_id] = x
            left_id_to_node[next_id] = new_node
            new_node.inserted = 1
            # Flag every node node_get returns for x so it is not inserted again.
            subtree_nodes, _, _ = node_get(x_node)
            for item in subtree_nodes:
                item.inserted = 1
            operation = ('(' + str((x_node.label, x_node.value, new_node.id))
                         + ', ' + str((z_node.label, z)) + ')')
            # The position among z's children is not recorded.
            edit_script.append('INS ' + operation)
            change_information.append(('INS', new_node, z_node))
            next_id += 1
            continue

        # x has a partner w in T1.
        w_node = left_id_to_node.get(w)
        v_id = w_node.parent
        v_node = left_id_to_node.get(v_id)
        v_match = map_left_to_right.get(v_id)

        if w_node.value != x_node.value:
            # UPD(w, value(x)). str() keeps non-string values from raising
            # TypeError and leaves string values unchanged.
            operation = ('(' + str((w_node.label, w_node.value, w))
                         + ', ' + str(x_node.value) + ')')
            edit_script.append('UPD ' + operation)
            change_information.append(('UPD', w_node, v_node))

        if v_match != y:
            # The parent changed: MOV(w, z) moves w under z.
            operation = ('(' + str((w_node.label, w_node.value, w))
                         + ', ' + str((z_node.label, z_node.id)) + ')')
            edit_script.append('MOV ' + operation)
            # w_node: moved node, v_node: old parent, z_node: new parent.
            change_information2.append(('MOV', w_node, v_node, z_node))

    # Any T1 node without a partner in T2 is deleted together with its subtree.
    for left_node in left_node_list:
        if map_left_to_right.get(left_node.id) is not None:
            continue
        if left_node.deleted != 0:
            continue
        left_node.deleted = 1
        subtree_nodes, _, _ = node_get(left_node)
        for item in subtree_nodes:
            item.deleted = 1
        operation = '(' + str((left_node.label, left_node.value, left_node.id)) + ')'
        # 'DEL' has no trailing space, unlike the other tags; kept unchanged
        # so the emitted text stays compatible.
        edit_script.append('DEL' + operation)
        left_node_parent = left_id_to_node.get(left_node.parent)
        change_information.append(('DEL', left_node, left_node_parent))

    return edit_script, change_information, change_information2
def editscript_calculate(left_node_list, match_final, left_id_to_node,
                         right_id_to_node, right_node_list):
    """Compute the edit operations that transform the left tree (T1) into the right tree (T2).

    Every node ``x`` of T2 (except the head, id 0) is visited in the order of
    ``right_node_list``. With ``y`` the parent id of ``x``, ``w`` the T1 id
    matched to ``x`` and ``z`` the T1 id matched to ``y``:

    * INS: ``x`` has no match, so a copy of ``x`` is inserted under ``z`` in
      T1, given a fresh id, and registered in the mappings. All nodes returned
      by ``node_get(x_node)`` are flagged ``inserted = 1`` so they are skipped.
    * UPD: ``x`` is matched to ``w`` but their values differ.
    * MOV: ``x`` is matched to ``w`` but the parent of ``w`` is not matched
      to ``y``.

    Afterwards every T1 node without a match is reported as DEL, and all nodes
    returned by ``node_get`` for it are flagged ``deleted = 1``.

    Args:
        left_node_list: Nodes of T1 (the original comment states breadth-first
            order). Its length is the first id given to inserted nodes.
        match_final: Iterable of ``(left_id, right_id, ...)`` match tuples.
        left_id_to_node: Dict from T1 id to node; inserted nodes are added.
        right_id_to_node: Dict from T2 id to node.
        right_node_list: Nodes of T2.
            NOTE(review): a parent is assumed to be visited before its
            children, otherwise ``z_node`` can be None in the INS branch.

    Returns:
        A tuple ``(edit_script, change_information, change_information2)``:
        the textual operations, ``(kind, node, parent_node)`` tuples for
        INS/UPD/DEL, and ``('MOV', node, old_parent, new_parent)`` tuples.
    """
    # Build both directions of the matching. As with dict(zip(...)), a later
    # pair overrides an earlier one that shares the same id.
    map_left_to_right = {pair[0]: pair[1] for pair in match_final}
    map_right_to_left = {pair[1]: pair[0] for pair in match_final}

    # Existing T1 ids stay untouched; inserted nodes get ids from here on.
    next_id = len(left_node_list)
    edit_script = []
    change_information = []   # INS / UPD / DEL records
    change_information2 = []  # MOV records (need the old and new parent)

    for right_node in right_node_list:
        x = right_node.id  # x, y, w and z are all ids
        y = right_node.parent
        if x == 0:  # skip the head pointer
            continue
        if right_node.inserted != 0:  # already covered by an earlier INS
            continue

        w = map_right_to_left.get(x)
        z = map_right_to_left.get(y)
        x_node = right_id_to_node.get(x)
        z_node = left_id_to_node.get(z)

        if w is None:
            # x is new: apply INS(x, z) to T1.
            new_node = Node(x_node.label, x_node.value)
            z_node.insertchild(new_node)
            new_node.id = next_id
            map_right_to_left[x] = next_id
            map_left_to_right[next_id] = x
            left_id_to_node[next_id] = new_node
            new_node.inserted = 1
            # Flag every node node_get returns for x so it is not inserted again.
            subtree_nodes, _, _ = node_get(x_node)
            for item in subtree_nodes:
                item.inserted = 1
            operation = ('(' + str((x_node.label, x_node.value, new_node.id))
                         + ', ' + str((z_node.label, z)) + ')')
            # The position among z's children is not recorded.
            edit_script.append('INS ' + operation)
            change_information.append(('INS', new_node, z_node))
            next_id += 1
            continue

        # x has a partner w in T1.
        w_node = left_id_to_node.get(w)
        v_id = w_node.parent
        v_node = left_id_to_node.get(v_id)
        v_match = map_left_to_right.get(v_id)

        if w_node.value != x_node.value:
            # UPD(w, value(x)). str() keeps non-string values from raising
            # TypeError and leaves string values unchanged.
            operation = ('(' + str((w_node.label, w_node.value, w))
                         + ', ' + str(x_node.value) + ')')
            edit_script.append('UPD ' + operation)
            change_information.append(('UPD', w_node, v_node))

        if v_match != y:
            # The parent changed: MOV(w, z) moves w under z.
            operation = ('(' + str((w_node.label, w_node.value, w))
                         + ', ' + str((z_node.label, z_node.id)) + ')')
            edit_script.append('MOV ' + operation)
            # w_node: moved node, v_node: old parent, z_node: new parent.
            change_information2.append(('MOV', w_node, v_node, z_node))

    # Any T1 node without a partner in T2 is deleted together with its subtree.
    for left_node in left_node_list:
        if map_left_to_right.get(left_node.id) is not None:
            continue
        if left_node.deleted != 0:
            continue
        left_node.deleted = 1
        subtree_nodes, _, _ = node_get(left_node)
        for item in subtree_nodes:
            item.deleted = 1
        operation = '(' + str((left_node.label, left_node.value, left_node.id)) + ')'
        # 'DEL' has no trailing space, unlike the other tags; kept unchanged
        # so the emitted text stays compatible.
        edit_script.append('DEL' + operation)
        left_node_parent = left_id_to_node.get(left_node.parent)
        change_information.append(('DEL', left_node, left_node_parent))

    return edit_script, change_information, change_information2
def _build_indexed_tree(source_file, head_value):
    """Parse *source_file* into an intermediate tree and index its nodes.

    Returns ``(node_list, inner_node_list, leaf_node_list, id_to_node)`` for
    the tree hanging off a fresh head node, after ``parent`` has been set on
    every node listed by ``child_parent_information``.
    """
    ast_content = customast.parse_file(source_file)
    tree = Tree('头指针', head_value)
    root = Node('AstRoot', 'root')
    tree.linktohead(root)
    ast_process(ast_content, root)
    id_set(tree.head)
    child_to_parent = child_parent_information(tree.head)
    node_list, inner_node_list, leaf_node_list = node_get(tree.head)
    id_to_node = id_to_node_get(node_list)

    # Record the parent id on every node; pair is (child_id, parent_id).
    # A direct membership test replaces scanning every key for each pair.
    for pair in child_to_parent:
        if pair[0] in id_to_node:
            id_to_node[pair[0]].parent = pair[1]

    return node_list, inner_node_list, leaf_node_list, id_to_node


def code_change_extraction(left_file, right_file):
    """Extract the changes between two source files and return their type distribution.

    Both files are turned into intermediate trees, leaves and then inner nodes
    are matched, an edit script is derived from the matching, and the edit
    operations are mapped to change types.

    Args:
        left_file: Path of the original source file (T1).
        right_file: Path of the modified source file (T2).

    Returns:
        A list aligned with ``change_type_enum()``: element ``i`` is the
        fraction of extracted changes whose type equals entry ``i``. All
        elements are 0 when no change was extracted.
    """
    # Build and index both trees (left first, then right).
    left_node_list, left_inner_node_list, left_leaf_node_list, left_id_to_node = \
        _build_indexed_tree(left_file, 'left_head')
    right_node_list, right_inner_node_list, right_leaf_node_list, right_id_to_node = \
        _build_indexed_tree(right_file, 'right_head')

    # Match the leaf nodes.
    match_temp = leaf_match(left_leaf_node_list, right_leaf_node_list, 0.6)
    match_final = best_match(match_temp)
    leaf_matched_set(left_leaf_node_list, right_leaf_node_list, match_final)

    # Match the inner nodes. Each still-unmatched inner node of T1 is compared
    # with the still-unmatched inner nodes of T2 and the first accepted
    # candidate wins (first match is assumed to be a good match here).
    for node1 in left_inner_node_list:
        for node2 in right_inner_node_list:
            if node1.matched == 0 and node2.matched == 0:
                inner_node_match(node1, node2, match_final, 0.4, 0.6)

    # Force the head pointer, its pseudo parent and the root to be matched.
    match_final.append((0, 0, 1.0))
    match_final.append(('head_parent', 'head_parent', 1.0))
    match_final.append((1, 1, 1.0))

    # Derive the edit operations that turn T1 into T2.
    _edit_script, change_information, change_information2 = editscript_calculate(
        left_node_list, match_final, left_id_to_node, right_id_to_node,
        right_node_list)

    # Map the edit operations to change types.
    change_type_list, _scc_list, _parent_entity_list, _changed_entity_list = \
        changetype_generation(change_information, change_information2)

    # Compute how often each change type occurs.
    change_type = change_type_enum()
    change_type_percentage = [0] * len(change_type)
    change_type_dict = changetype_statistic(change_type_list)
    total_changes = len(change_type_list)
    if total_changes:
        for key, value in change_type_dict.items():
            for index, type_name in enumerate(change_type):
                if key == type_name:
                    change_type_percentage[index] = value / total_changes

    # change_type_percentage[i] holds the frequency of change_type[i].
    return change_type_percentage