def DisSim_by_IDN():
    """Run the IDN similarity computation on the bundled disease-gene file.

    The rows of ``./Dataset/disease-gene.txt`` are loaded with
    ``FileUtil.readFile2List``, handed to ``IDN.calculateDisSim`` and the
    value it returns is written to ``./Result/IDN_gene.txt`` with
    ``FileUtil.writeDic2File``.
    """
    associations = FileUtil.readFile2List("./Dataset/disease-gene.txt")
    similarity = IDN.calculateDisSim(associations)
    FileUtil.writeDic2File(similarity, "./Result/IDN_gene.txt")
def calculateDisSim(disease_symptom, output_file):
    """Compute pairwise cosine similarity between weighted disease vectors.

    Every disease is described by a vector over all symptoms. The entry for
    an association row is
    ``float(line[2]) * log2(n_diseases / n_diseases_linked_to_that_symptom)``.
    Each unordered disease pair with a non-zero cosine value is stored under
    a key made of the two disease names joined by a tab, and the resulting
    dict is written with ``FileUtil.writeDic2File``.

    :param disease_symptom: association rows; the code reads ``line[0]`` as
        the symptom, ``line[1]`` as the disease and ``line[2]`` as a numeric
        weight
    :param output_file: path passed to ``FileUtil.writeDic2File``
    :return: None
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    begin_time = time.perf_counter()

    disease2symptom = common.list2DictSet(disease_symptom, key=2, value=1)
    symptom2disease = common.list2DictSet(disease_symptom, key=1, value=2)
    diseases = list(disease2symptom.keys())
    symptoms = list(symptom2disease.keys())
    print("there are {} diseases and {} symptoms in disease-symptom".format(len(diseases), len(symptoms)))

    # Position lookups: a dict avoids a linear list.index() scan per row.
    symptom_pos = {symptom: pos for pos, symptom in enumerate(symptoms)}
    disease_pos = {disease: pos for pos, disease in enumerate(diseases)}
    disease_total = float(len(diseases))

    weight = np.zeros((len(symptoms), len(diseases)))
    for line in disease_symptom:
        row_index = symptom_pos[line[0]]
        col_index = disease_pos[line[1]]
        weight[row_index][col_index] = float(line[2]) * math.log2(
            disease_total / len(symptom2disease[line[0]]))
    # After the transpose, row i is the feature vector of diseases[i].
    weight = weight.transpose()

    CosineDFV_sim = {}
    for i in range(0, len(diseases)):
        temp_time1 = time.perf_counter()
        for j in range(i + 1, len(diseases)):
            cosine_value = common.cosinValue(weight[i], weight[j])
            if cosine_value != 0:
                CosineDFV_sim["{}\t{}".format(diseases[i], diseases[j])] = cosine_value
        temp_time2 = time.perf_counter()
        print("{} -> {} costs {}s".format(i, diseases[i], temp_time2 - temp_time1))

    FileUtil.writeDic2File(CosineDFV_sim, output_file)
    end_time = time.perf_counter()
    print("CosineDFV costs {}s".format(end_time - begin_time))
def calculateDisSim(walks, output_file, save_node_vectors=False):
    """Learn node embeddings from walks and write pairwise cosine similarity.

    The walks are converted to lists of strings and used to train a
    ``Word2Vec`` model. The vectors are saved with
    ``save_word2vec_format``, read back from that file, and every unordered
    pair of nodes with a non-zero ``common.cosinValue`` is written to
    ``output_file`` through ``FileUtil.writeDic2File`` (key: the two node
    labels joined by a tab).

    :param walks: iterable of walks; each walk is an iterable of node labels
    :param output_file: path passed to ``FileUtil.writeDic2File``
    :param save_node_vectors: when true the vectors are saved to
        ``./node_vectors.txt``; otherwise a temporary file is used
    :return: None
    """
    import os  # local import: only needed to close the mkstemp descriptor

    print("learn representations...")
    walks = [list(map(str, walk)) for walk in walks]
    model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1, workers=8, iter=1)

    if save_node_vectors:
        temp_walk_fname = "./node_vectors.txt"
    else:
        # mkstemp() returns an already-open OS-level descriptor; close it
        # right away, because the file is re-opened by name below and the
        # descriptor would otherwise leak.
        fd, temp_walk_fname = tempfile.mkstemp()
        os.close(fd)
    print(temp_walk_fname)
    model.wv.save_word2vec_format(temp_walk_fname)

    node_vectors = defaultdict(list)
    with open(temp_walk_fname, 'r') as f:
        # The first line is skipped, as in the original code (the word2vec
        # text format starts with a header line).
        next(f, None)
        for raw_line in f:
            fields = raw_line.strip().split(' ')
            # key: node label, value: its embedding vector
            node_vectors[fields[0]] = [float(value) for value in fields[1:]]

    dis_sim = {}
    disease = list(node_vectors.keys())
    for x, first in enumerate(disease):
        for second in disease[x + 1:]:
            sim = common.cosinValue(node_vectors[first], node_vectors[second])
            if sim != 0:
                dis_sim["{}\t{}".format(first, second)] = sim
    FileUtil.writeDic2File(dis_sim, output_file)
def calculateDisSim(dm_edge, mg_edge, gg_edge, output_file, walk_length=1000, path_type=1, save_vectors=False):
    '''
    Build a disease / miRNA / gene network, generate random walks that start
    at diseases, embed the walks with the external ``metapath2vec++``
    program and write pairwise cosine similarities of the disease vectors.

    :param dm_edge: 2-D list representing the disease-miRNA network
    :param mg_edge: 2-D list representing the miRNA-gene network
    :param gg_edge: 2-D list representing the gene-gene network
    :param output_file: str, path where the result is saved
    :param walk_length: int, number of steps taken from a single node in the
        random walk. Defaults to 1000
    :param path_type: int, selects the specific path followed by the random
        walk. Defaults to 1
    :param save_vectors: boolean, whether the disease vectors are saved.
        Defaults to False (the disease features are not saved)
    :return:
    '''
    # Label diseases and miRNAs of the disease-miRNA network.
    # NOTE(review): presumably labelNode returns a name -> label mapping whose
    # labels start with the given prefix ("i" / "f") - confirm in NetUtil.
    dm_d, dm_m = NetUtil.getNodes2HeterNet(dm_edge)
    d_name_id = NetUtil.labelNode(dm_d, "i")
    m_name_id = NetUtil.labelNode(dm_m, "f")
    print("有{}个disease加标签".format(len(d_name_id.keys())))
    print("有{}个miRNA加标签".format(len(m_name_id.keys())))
    # Genes kept are those present in both the miRNA-gene and the gene-gene
    # network; each gets the prefix 'a'.
    gg_g = NetUtil.getNodes2HomoNet(gg_edge)
    mg_m, mg_g = NetUtil.getNodes2HeterNet(mg_edge)
    g_name = mg_g & gg_g
    g_name = ['a' + str(i) for i in g_name]
    print("有{}个gene加标签".format(len(g_name)))
    # Adjacency lists disease -> miRNAs (dm) and miRNA -> diseases (md),
    # keyed by label.
    dm = defaultdict(list)
    md = defaultdict(list)
    for line in dm_edge:
        if line[1].upper() in dm_m:
            dm[d_name_id[line[0].upper().strip()]].append(
                m_name_id[line[1].upper()])
            md[m_name_id[line[1].upper()]].append(
                d_name_id[line[0].upper().strip()])
    # Undirected gene-gene adjacency restricted to the kept genes.
    # NOTE(review): g_name is a list here, so each `in g_name` test is a
    # linear scan.
    gg = defaultdict(list)
    for line in gg_edge:
        g1 = 'a' + str(line[0])
        g2 = 'a' + str(line[1])
        if (g1 in g_name) & (g2 in g_name):
            gg[g1].append(g2)
            gg[g2].append(g1)
    # Genes whose adjacency list has exactly two entries are removed from gg.
    # They are collected first because a dict cannot be resized while it is
    # being iterated. Their occurrences in other genes' lists are not removed.
    gene_del = []
    for gene in gg.keys():
        if len(gg[gene]) == 2:
            gene_del.append(gene)
    for gene in gene_del:
        del gg[gene]
    g_name = list(gg.keys())
    # Adjacency lists miRNA -> genes (mg) and gene -> miRNAs (gm).
    # NOTE(review): test_mg_m is never filled in this function, so the first
    # number printed below is always 0.
    test_mg_m = set()
    mg = defaultdict(list)
    gm = defaultdict(list)
    for line in mg_edge:
        g = 'a' + str(line[1])
        if (line[0].upper() in dm_m) & (g in g_name):
            mg[m_name_id[line[0].upper().strip()]].append(g)
            gm[g].append(m_name_id[line[0].upper().strip()])
    print("mg中有{}个miRNA, 标签后{}个".format(len(test_mg_m), len(mg.keys())))
    # Keep only diseases that have at least one miRNA linked to a kept gene.
    print("mg中miRNA标签后{}个".format(len(mg.keys())))
    disease_list = []
    for d in dm.keys():
        ms = dm[d]
        for m in ms:
            # mg is a defaultdict, so looking up an miRNA without genes
            # inserts an empty list for it; the mg key count printed below
            # can therefore be larger than the one printed above.
            if set(mg[m]) & set(g_name):
                disease_list.append(d)
    disease_list = list(set(disease_list))
    print("mg中miRNA标签后{}个".format(len(mg.keys())))
    # print("there are {} diseases, {} genes in random walk.".format(len(disease_list ), len(g_name )))
    print("there {} miRNA in mg, {} miRNA in dm".format(
        len(mg.keys()), len(md.keys())))
    # Random walks: for every start disease, walk_length walks are built,
    # each by chaining up to 50 calls to random_walk_disease.
    # NOTE(review): the loop variable `disease` is reassigned to the last node
    # of each segment, so later walks start where the previous segment ended.
    # NOTE(review): the bare `except:` retries on any exception (jj counts the
    # failures per start disease; reaching walk_length failures stops the
    # retries) - confirm which exception random_walk_disease is expected to
    # raise.
    total_walk = []
    ii = 0
    for disease in disease_list:
        jj = 0
        for i in range(walk_length):
            # print(str(ii) + ' ' + str(i))
            temp = [disease]
            for k in range(50):
                j = 1
                while ((j == 1) & (jj != walk_length)):
                    try:
                        temp2 = random_walk_disease(disease, dm, mg, gg, gm, md, path_type)
                        temp.extend(temp2[1:])
                        disease = temp2[-1]
                    except:
                        j = 1
                        jj += 1
                    else:
                        j = 0
                if jj == walk_length:
                    break
            total_walk.append(temp)
        ii += 1
    # random_walk_path = "./Result/inputomim2.txt"
    # with open(random_walk_path, 'w') as f:
    #     for lines in total_walk:
    #         for line in lines:
    #             f.write(line + ' ')
    #         f.write('\n')
    print("learn representations...")
    # Write one walk per line, nodes separated by a space.
    # NOTE(review): the descriptor returned by mkstemp() is discarded and
    # never closed (here and for the vector file below).
    _, temp_walk_fname = tempfile.mkstemp()
    print(temp_walk_fname)
    with open(temp_walk_fname, 'w') as f:
        for walk in total_walk:
            for line in walk:
                f.write(line + ' ')
            f.write('\n')
    # Run the external embedding program on the walk file.
    _, temp_node_vec_fname = tempfile.mkstemp()
    statement = "Common/metapath2vec++ -train {} -output {} -pp 1 -size 128 -window 7 -negative 5 -threads 32".format(
        temp_walk_fname, temp_node_vec_fname)
    print(statement)
    os.system(statement)
    print("\ncalculate disease similarity...")
    # The vectors are read from "<output name>.txt"; only lines whose first
    # field starts with 'i' are kept.
    node_vectors_path = temp_node_vec_fname + ".txt"
    node_vectors = defaultdict(list)
    with open(node_vectors_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip().split(' ')
            if line[0].startswith('i'):
                vec = line[1:]
                vec = [float(i) for i in vec]
                node_vectors[
                    line[0]] = vec  # key: disease label, value: its vector
    if save_vectors:
        FileUtil.writeDicSet2File(node_vectors, "./Result/mpDisNet_node_vectors.txt")
    print()
    # Map labels back to the original disease names for the output keys.
    new_label_disease = {value: key for key, value in d_name_id.items()}
    dis_sim = {}
    for x in range(0, len(disease_list)):
        for y in range(x + 1, len(disease_list)):
            # node_vectors is a defaultdict: a disease without a learned
            # vector yields an empty list here.
            sim = common.cosinValue(node_vectors[disease_list[x]], node_vectors[disease_list[y]])
            if sim != 0:
                dis_sim["{}\t{}".format(
                    new_label_disease[disease_list[x]], new_label_disease[disease_list[y]])] = sim
    FileUtil.writeDic2File(dis_sim, output_file)
def calculateDisSim(DAG, output_file, disease_genes=None):
    """Compute information-content based similarity for diseases in a DAG.

    An information content (IC) value is assigned to diseases, either from
    disease-gene associations or from the graph structure. The similarity of
    a disease pair is the largest value in the mapping returned by
    ``getSectionoFromDic(commonAncestors, IC)``. Positive similarities are
    passed through ``common.normalizeDict`` and written with
    ``FileUtil.writeDic2File`` (key: the two disease ids joined by a tab).

    :param DAG: 2-D list representing a directed acyclic graph
    :param output_file: str, path where the result is stored
    :param disease_genes: 2-D list of disease-gene associations; when given,
        IC is ``-log2(n_genes_of_disease / n_genes)`` and only diseases that
        also occur in the associations are compared
    :return: None
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    begin_time = time.perf_counter()

    diseases = NetUtil.getNodes2HomoNet(DAG)
    disease_DAG = nx.DiGraph()
    disease_DAG.add_edges_from(DAG)

    # Plain dict: the original defaultdict() had no default factory and so
    # behaved like one.
    IC = {}
    if disease_genes:
        diseases_asso, genes = NetUtil.getNodes2HeterNet(disease_genes)
        disease2genes = common.list2DictSet(disease_genes, key=1, value=2)
        gene_total = len(genes)
        for di in diseases:
            if di in diseases_asso:
                IC[di] = - math.log2(float(len(disease2genes[di])) / gene_total)
            else:
                IC[di] = 0
        diseases = diseases & diseases_asso
        print("there are {} diseases for similarity based on DAG and associations.".format(len(diseases)))
    else:
        disease_total = len(diseases)
        for di in diseases:
            # Nodes that have a path to `di` in disease_DAG. Diseases with
            # none get no IC entry.
            upstream = nx.ancestors(disease_DAG, di)
            if upstream:
                IC[di] = - math.log2(float(len(upstream)) / disease_total)
        print("there are {} diseases for similarity based on DAG.".format(len(diseases)))

    print("begin to calculate disease similarity......")
    diseases = list(diseases)
    simi_matrix = np.zeros((len(diseases), len(diseases)))
    for i in range(0, len(diseases)):
        sys.stdout.flush()
        temp_time1 = time.perf_counter()
        di_A = diseases[i]
        for j in range(i + 1, len(diseases)):
            di_B = diseases[j]
            commonAncestors = getCommonAncesters(disease_DAG, di_A, di_B)
            sectionOfDoId2Gene = getSectionoFromDic(commonAncestors, IC)
            if sectionOfDoId2Gene:
                # Only the largest value is needed, so take the maximum
                # directly and skip the full sort.
                simi_matrix[i][j] = max(sectionOfDoId2Gene.values())
        temp_time2 = time.perf_counter()
        sys.stdout.write('\r{} -> {}, {}s'.format(i, diseases[i], (temp_time2 - temp_time1)))
    print()

    Resnik_simi = {}
    for i in range(0, len(diseases)):
        di_A = diseases[i]
        for j in range(i + 1, len(diseases)):
            di_B = diseases[j]
            if simi_matrix[i][j] > 0:
                Resnik_simi["{}\t{}".format(di_A, di_B)] = simi_matrix[i][j]
    Resnik_simi = common.normalizeDict(Resnik_simi)
    FileUtil.writeDic2File(Resnik_simi, output_file)

    end_time = time.perf_counter()
    print("ResnikSim costs {}s.".format(end_time - begin_time))
def calculateDisSim(DAG, output_file, selected_diseases=None):
    """Compute pairwise disease similarity from per-disease DV values.

    For each disease a mapping ``node -> DV`` is built over the disease and
    the nodes returned by ``nx.descendants`` for it (values come from
    ``getNodeDVByIT``). A disease without such nodes gets ``{disease: 1}``.
    The similarity of a pair is the sum of both DV values over the nodes
    shared by the two mappings, divided by the sum of each disease's own DV.
    Pairs without shared nodes are omitted. The result is written with
    ``FileUtil.writeDic2File`` (key: the two disease ids joined by a tab).

    :param DAG: 2-D list representing a directed acyclic graph
    :param output_file: str, path where the result is stored
    :param selected_diseases: set of diseases whose similarity should be
        computed; defaults to None, meaning every disease in the DAG
    :return: None
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    begin_time = time.perf_counter()

    diseases = NetUtil.getNodes2HomoNet(DAG)
    print("there are {} diseases in DAG.".format(len(diseases)))
    disease_DAG = nx.DiGraph()
    disease_DAG.add_edges_from(DAG)

    DV_diseases_dict = {}
    for di in diseases:
        reachable = nx.descendants(disease_DAG, di)
        if reachable:
            # The disease itself is part of its own sub-graph.
            reachable.add(di)
            sub_graph = disease_DAG.subgraph(list(reachable))
            DV_diseases_dict[di] = {
                node: getNodeDVByIT(sub_graph, node)
                for node in sub_graph.nodes
            }
        else:
            DV_diseases_dict[di] = {di: 1}

    if selected_diseases:
        diseases = list(selected_diseases & diseases)
    else:
        diseases = list(diseases)
    print("{} diseases are used to calculate similarity.".format(
        len(diseases)))

    XuanSim_sim = {}
    for i in range(0, len(diseases)):
        DV_disease_A = DV_diseases_dict[diseases[i]]
        # Loop-invariant for the inner loop: build the key set once.
        nodes_A = set(DV_disease_A.keys())
        for j in range(i + 1, len(diseases)):
            DV_disease_B = DV_diseases_dict[diseases[j]]
            common_diseases = nodes_A & set(DV_disease_B.keys())
            if common_diseases:
                # Explicit accumulation (not sum()) keeps the floating-point
                # result identical to the original implementation.
                common_DV = 0
                for node in common_diseases:
                    common_DV += DV_disease_A[node] + DV_disease_B[node]
                XuanSim_sim["{}\t{}".format(diseases[i], diseases[j])] = common_DV / (
                    DV_disease_A[diseases[i]] + DV_disease_B[diseases[j]])
        if i % 100 == 0:
            print("{}->{}".format(i, len(diseases)))

    FileUtil.writeDic2File(XuanSim_sim, output_file)
    end_time = time.perf_counter()
    print("XuanSim costs {}s.".format(end_time - begin_time))
def calculateDisSim(phenotypes_info, tree_id2synonyms, output_file):
    '''
    Compute phenotype similarity from MeSH and OMIM data.

    Intermediate matrices are saved to ``./Result/actual_count.txt``,
    ``./Result/hierachy_count.txt`` and ``./Result/weight_count.txt``; the
    final non-zero pairwise similarities, sorted in descending order, are
    written with ``FileUtil.writeDic2File``.

    :param phenotypes_info: phenotype information from the OMIM database,
        dict; key: omim id, value: description of that phenotype
    :param tree_id2synonyms: nodes of the MeSH tree and their phenotype
        synonyms, dict; key: tree id, value: phenotype synonyms
    :param output_file: path where the phenotype similarity is stored
    :return:
    '''
    # ---- actual count: occurrences of each tree id's synonyms per record ----
    print("-----actual acount-------")
    omim_ids = list(phenotypes_info.keys())
    tree_ids = list(tree_id2synonyms.keys())
    actual_count = np.zeros((len(omim_ids), len(tree_ids)))
    for row, omim_id in enumerate(omim_ids):
        text = phenotypes_info[omim_id]
        for col, tree_id in enumerate(tree_ids):
            for term in tree_id2synonyms[tree_id]:
                actual_count[row][col] += text.count(term)
        sys.stdout.write("\r{}->{}".format(row, omim_id))
    np.savetxt("./Result/actual_count.txt", actual_count, delimiter="\t", fmt="%d")

    # ---- hierarchy count ----
    print("\n-----hirarchy_count-------")
    # hiera_count is the same array object as actual_count (no copy), so the
    # in-place updates below are also visible through actual_count later on.
    hiera_count = actual_count
    for row, omim_id in enumerate(omim_ids):
        is_calculate = OrderedDict()
        tree_id_count = OrderedDict()
        for col, tree_id in enumerate(tree_ids):
            tree_id_count[tree_id] = hiera_count[row][col]
            is_calculate[tree_id] = False
        for tree_id in tree_ids:
            if is_calculate[tree_id] == False:
                calculate_counter(tree_id, tree_ids, tree_id_count, is_calculate)
        for col, tree_id in enumerate(tree_id_count.keys()):
            if is_calculate[tree_id] == True:
                hiera_count[row][col] = tree_id_count[tree_id]
        sys.stdout.write("\r{}->{}".format(row, omim_id))
    np.savetxt("./Result/hierachy_count.txt", hiera_count, delimiter="\t", fmt="%f")

    # ---- global weight: log2(number of records / records with the tree id) ----
    print("\n-----global weight-------")
    gwc_global = OrderedDict((tree_id, 0) for tree_id in tree_ids)
    mostCount = []
    for row in range(len(omim_ids)):
        row_max = 0
        for col, tree_id in enumerate(tree_ids):
            occurrences = float(actual_count[row][col])
            if occurrences > row_max:
                row_max = occurrences
            if occurrences > 0:
                gwc_global[tree_id] += 1
        mostCount.append(row_max)
    for tree_id in gwc_global.keys():
        record_total = gwc_global[tree_id]
        if record_total > 0:
            gwc_global[tree_id] = math.log2(len(omim_ids) / record_total)
        else:
            gwc_global[tree_id] = 0.0

    # ---- local weight ----
    print("-----local weight-------")
    global_weights = np.array(list(gwc_global.values()))
    weight_count = np.zeros((len(omim_ids), len(tree_ids)))
    for row in range(len(omim_ids)):
        row_max = mostCount[row]
        row_counts = [float(hiera_count[row][col]) for col in range(len(tree_ids))]
        weighted = (global_weights * np.array(row_counts)).tolist()
        # Positive scores are rescaled against the record's largest count;
        # other scores are kept as they are.
        rescaled = []
        for score in weighted:
            if score > 0:
                rescaled.append(0.5 + 0.5 * (score / row_max))
            else:
                rescaled.append(score)
        for col in range(len(tree_ids)):
            weight_count[row][col] = rescaled[col]
    np.savetxt("./Result/weight_count.txt", weight_count, delimiter="\t", fmt="%f")

    # ---- phenotype similarity ----
    print("-----phetypes similarity-------")
    # Disease similarity is the cosine similarity of the weight rows.
    similarity_socre = cosine_similarity(weight_count)
    similarity_result = {}
    for i, first_id in enumerate(omim_ids):
        for j in range(i + 1, len(omim_ids)):
            value = similarity_socre[i][j]
            if value != 0:
                similarity_result["{}\t{}".format(first_id, omim_ids[j])] = value
    # Sort the similarities in descending order.
    ranked = sorted(similarity_result.items(), key=lambda item: item[1], reverse=True)
    similarity_result = dict(ranked)
    FileUtil.writeDic2File(similarity_result, output_file)