def read_node2vec_result(path):
    """Read a node2vec output file and attach each embedding to its node.

    The node objects come from ``return_node_list_and_edge_list()``; every
    data line of the file is ``<serial_number> <v1> <v2> ...`` separated by
    single spaces. The tokens after the serial number are stored, still as
    strings, in ``node.embedded_vector``.

    After loading, a warning line is printed for every node whose
    ``embedded_vector`` does not have exactly 64 entries.

    NOTE(review): another definition with the same name follows this one in
    the file; if both live at module level the later one replaces this one.

    Args:
        path: Path of the node2vec result file. The first line is a header
            and is skipped.

    Raises:
        ValueError: If a data line does not start with an integer.
        KeyError: If a serial number in the file is not present in the
            serial-number -> list-index mapping.
    """
    print('读入,node2vec的结果')
    node_list, _ = return_node_list_and_edge_list()
    serialNumber_listIndex_dict = nodeSerialNumber_listIndex_dict_generation(
        node_list)

    # The context manager guarantees the handle is released even when a
    # malformed line makes the parsing below raise.
    with open(path, mode='r') as node2vec_result_file:
        # First line contains: number of nodes, embedding dimension.
        # The default keeps an empty file from raising here.
        next(node2vec_result_file, None)
        for line in node2vec_result_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank / trailing empty lines instead of failing
                # on int('').
                continue
            arr = stripped.split(' ')
            serial_number = int(arr.pop(0))
            node_list[
                serialNumber_listIndex_dict[serial_number]].embedded_vector = arr

    for node in node_list:
        if len(node.embedded_vector) != 64:
            print('node2vec读入有问题')
def read_node2vec_result(path):
    """Read a node2vec output file and attach each embedding to its node.

    The node objects come from ``return_node_list_and_edge_list()``; every
    data line of the file is ``<serial_number> <v1> <v2> ...`` separated by
    single spaces. The tokens after the serial number are stored, still as
    strings, in ``node.embedded_vector``.

    Any node whose ``embedded_vector`` does not have exactly 64 entries after
    loading gets a 64-element zero vector instead, and the number of such
    nodes is printed.

    NOTE(review): an earlier definition with the same name precedes this one
    in the file; if both live at module level this one is the effective one.

    Args:
        path: Path of the node2vec result file. The first line is a header
            and is skipped.

    Raises:
        ValueError: If a data line does not start with an integer.
        KeyError: If a serial number in the file is not present in the
            serial-number -> list-index mapping.
    """
    print('read node2vec result')
    node_list, _ = return_node_list_and_edge_list()
    serialNumber_listIndex_dict = nodeSerialNumber_listIndex_dict_generation(
        node_list)

    # The context manager guarantees the handle is released even when a
    # malformed line makes the parsing below raise.
    with open(path, mode='r') as node2vec_result_file:
        # First line contains: number of nodes, embedding dimension.
        # The default keeps an empty file from raising here.
        next(node2vec_result_file, None)
        for line in node2vec_result_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank / trailing empty lines instead of failing
                # on int('').
                continue
            arr = stripped.split(' ')
            serial_number = int(arr.pop(0))
            node_list[
                serialNumber_listIndex_dict[serial_number]].embedded_vector = arr

    # Nodes without a usable embedding fall back to an all-zero vector.
    count_node_without_node2vecResult = 0
    for node in node_list:
        if len(node.embedded_vector) != 64:
            count_node_without_node2vecResult += 1
            node.embedded_vector = [0] * 64
    print(f'没有node2vec结果的节点数:{count_node_without_node2vecResult}')
def _try_delete_random_interaction(G, interactions, list_lncRNA, list_protein,
                                   dict_serialNumber_listIndex_lncRNA,
                                   dict_serialNumber_listIndex_protein):
    """Pick one random interaction and delete it if connectivity allows.

    The edge is removed from a copy of ``G``. The deletion is accepted when
    the copy still has exactly one connected component, or when it has two
    and the lncRNA (checked first) or the protein has exactly one entry in
    its ``interaction_list``; in that case the node is removed from the
    graph as well.

    Returns:
        ``(graph, deleted)``: the graph to keep using (the modified copy on
        success, the unchanged ``G`` otherwise) and whether an interaction
        was removed from ``interactions``.
    """
    index_interaction = random.randrange(len(interactions))
    interaction = interactions[index_interaction]

    # Locate the lncRNA and protein objects that belong to this interaction.
    lncRNA = list_lncRNA[dict_serialNumber_listIndex_lncRNA[
        interaction.lncRNA.serial_number]]
    protein = list_protein[dict_serialNumber_listIndex_protein[
        interaction.protein.serial_number]]

    # Check on a copy whether removing this edge disconnects the graph.
    G_temp = G.copy()
    G_temp.remove_edge(lncRNA.serial_number, protein.serial_number)
    num_connectedComponent_G_temp = nx.number_connected_components(G_temp)

    if num_connectedComponent_G_temp == 1:
        # The graph is still a single connected component.
        isolated_serial_number = None
    elif num_connectedComponent_G_temp == 2 and len(
            lncRNA.interaction_list) == 1:
        # Two components, the small one being just the now-isolated lncRNA.
        isolated_serial_number = lncRNA.serial_number
    elif num_connectedComponent_G_temp == 2 and len(
            protein.interaction_list) == 1:
        # Two components, the small one being just the now-isolated protein.
        isolated_serial_number = protein.serial_number
    else:
        # Removing this edge would split the graph; keep it.
        return G, False

    delete_interaction_from_lncRNA_protein(lncRNA, protein)
    if isolated_serial_number is not None:
        G_temp.remove_node(isolated_serial_number)
    del interactions[index_interaction]
    return G_temp, True


def reduce_dataset_mentainConnected(G, ratio, list_interaction,
                                    list_negativeInteraction, list_lncRNA,
                                    list_protein):
    """Shrink both interaction lists to ``ratio`` of their size, in place.

    Positive and negative interactions are deleted alternately (negative
    first), each time choosing a random candidate and deleting it only if
    the working graph stays connected, apart from a lncRNA/protein that is
    left without any interaction and is then dropped from the graph.

    Both lists are shuffled and mutated in place. The function returns
    nothing.

    NOTE(review): ``G`` itself is never modified (work happens on copies
    bound to the local name), so the caller's graph does not reflect the
    deletions - confirm this is intended.
    NOTE(review): counters only advance on a successful deletion, so the
    loop does not terminate if no remaining candidate can be removed.
    NOTE(review): ``remove_edge`` is called for negative interactions too,
    so ``G`` presumably contains an edge for every entry of both lists -
    verify against the caller.

    Args:
        G: networkx graph whose nodes are serial numbers.
        ratio: Fraction of each list to keep.
        list_interaction: Positive interactions (mutated in place).
        list_negativeInteraction: Negative interactions (mutated in place).
        list_lncRNA: lncRNA objects referenced by the interactions.
        list_protein: Protein objects referenced by the interactions.
    """
    print(f'reduce the dataset to its {ratio}')

    # Work out how many entries have to be deleted from each list.
    num_delete_interaction = int(
        len(list_interaction) - (len(list_interaction) * ratio))
    num_delete_negativeInteraction = int(
        len(list_negativeInteraction) -
        (len(list_negativeInteraction) * ratio))

    # serial_number -> list index, for fast lncRNA / protein lookup.
    dict_serialNumber_listIndex_lncRNA = nodeSerialNumber_listIndex_dict_generation(
        list_lncRNA)
    dict_serialNumber_listIndex_protein = nodeSerialNumber_listIndex_dict_generation(
        list_protein)

    # Shuffle.
    random.shuffle(list_interaction)
    random.shuffle(list_negativeInteraction)

    num_deleted_positive = 0
    num_deleted_negative = 0
    delete_positive = True
    count = 0
    while num_deleted_positive + num_deleted_negative < num_delete_interaction + num_delete_negativeInteraction:
        # Alternate between the positive and the negative list.
        delete_positive = not delete_positive
        if delete_positive and num_deleted_positive < num_delete_interaction:
            G, deleted = _try_delete_random_interaction(
                G, list_interaction, list_lncRNA, list_protein,
                dict_serialNumber_listIndex_lncRNA,
                dict_serialNumber_listIndex_protein)
            if deleted:
                num_deleted_positive += 1
            # Reclaim the discarded graph copy.
            gc.collect()
        elif not delete_positive and num_deleted_negative < num_delete_negativeInteraction:
            G, deleted = _try_delete_random_interaction(
                G, list_negativeInteraction, list_lncRNA, list_protein,
                dict_serialNumber_listIndex_lncRNA,
                dict_serialNumber_listIndex_protein)
            if deleted:
                num_deleted_negative += 1
            # Reclaim the discarded graph copy.
            gc.collect()

        if count % 100 == 0:
            print(
                f'number of positive samples need to be deleted: {num_delete_interaction-num_deleted_positive},number of negative samples need to be deleted: {num_delete_negativeInteraction-num_deleted_negative}'
            )
        count += 1
    print('reduce process done')