def DisSim_by_mpDisNet():
    """Run mpDisNet disease similarity, then evaluate it against the MeSH benchmark."""
    output_file = "./Result/mpDisNet_sim.txt"
    dm_edges = FileUtil.readFile2List("./Dataset/mpDisNet/omim_dm.txt")
    mg_edges = FileUtil.readFile2List("./Dataset/mpDisNet/omim_mg.txt")
    gg_edges = FileUtil.readFile2List("./Dataset/mpDisNet/PPI.txt")
    mpDisNet.calculateDisSim(dm_edges, mg_edges, gg_edges, output_file,
                             path_type=2)
    # ------------------------ evaluation ---------------------------------------
    benchmark = FileUtil.readFile2List("./Evaluation/benchmark_MeSH_RADAR.txt")
    sim_result = FileUtil.readFile2List(output_file)
    benchmark_evaluation.evaluate_by_benchmark(benchmark, sim_result, times=10)
def cal_path_sim(disease, edges, save_path_sim=False):
    """Pairwise disease similarity from shared neighbours in a bipartite network.

    Builds the disease-pathway adjacency matrix A, computes W = A.A^T (shared
    neighbour counts), and scores each pair as 2*W[i][j] / (W[i][i] + W[j][j]).

    :param disease: list of disease names (row order of the matrix)
    :param edges: 2-D list of (disease, pathway) edges
    :param save_path_sim: if True, also write the sorted pair scores to ./path_sim.txt
    :return: symmetric numpy similarity matrix, len(disease) x len(disease)
    """
    print("begin calculate similarity based on path...")
    pathway = list(NetUtil.getColNodes(edges, col=1))
    ajaMatrix = np.zeros((len(disease), len(pathway)))
    for line in edges:
        ajaMatrix[disease.index(line[0])][pathway.index(line[1])] = 1.0
    W = np.dot(ajaMatrix, ajaMatrix.T)
    print("construct similarity matrix...")
    pathSim = {}
    sim_matrix = np.zeros((len(disease), len(disease)))
    for i in range(len(disease)):
        for j in range(i + 1, len(disease)):
            denom = W[i][i] + W[j][j]
            if denom == 0:
                # both diseases have no edges at all; original code divided by
                # zero here — leave similarity at 0 instead
                continue
            sim = 2 * W[i][j] / denom  # hoisted: was computed three times
            pathSim["{}\t{}".format(disease[i], disease[j])] = sim
            sim_matrix[i][j] = sim
            sim_matrix[j][i] = sim
    if save_path_sim:
        print("sort the path similarity and save...")
        res = sorted(pathSim.items(), key=lambda x: x[1], reverse=True)
        FileUtil.writeSortedDic2File(res, "./path_sim.txt")
    return sim_matrix
def con_single_layer_net(edges, filter_value=0.1):
    """Fuse neighbour-based and path-based similarity into one layer network.

    Pairs whose fused score (equal-weight average) exceeds filter_value become
    weighted edges; the sorted edge dict is also written to ./layer_net.txt.
    """
    disease, pathway = NetUtil.getNodes2HeterNet(edges)
    disease = list(disease)
    print("1st col -> {}\t 2nd col -> {}".format(len(disease), len(pathway)))
    nei_sim = cal_nei_sim(disease, edges, save_nei_sim=True)
    path_sim = cal_path_sim(disease, edges, save_path_sim=True)
    layer_net = []
    layer_net_result = {}
    n = len(disease)
    for a in range(n):
        for b in range(a + 1, n):
            fused = 0.5 * nei_sim[a][b] + 0.5 * path_sim[a][b]
            if fused > filter_value:
                layer_net.append([disease[a], disease[b], fused])
                layer_net_result["{}\t{}".format(disease[a],
                                                 disease[b])] = fused
    print("complete a single layer similarity network...")
    layer_net_result = common.sortDict(layer_net_result)
    FileUtil.writeSortedDic2File(layer_net_result, "./layer_net.txt")
    return layer_net
def DisSim_by_IDN():
    """Compute disease similarity with IDN from disease-gene associations."""
    lines = FileUtil.readFile2List("./Dataset/disease-gene.txt")
    dis_sim = IDN.calculateDisSim(lines)
    FileUtil.writeDic2File(dis_sim, "./Result/IDN_gene.txt")
def DisSim_by_NetSim():
    """Run NetSim over SIDD disease-gene sets and the HumanNet gene network."""
    disease2gene = FileUtil.readFile2DictSet("./Dataset/disease-gene_SIDD.txt",
                                             header=True)
    gene_gene = FileUtil.readFile2List("./Dataset/HumanNet_symbol.txt")
    NetSim.calculateDisSim(disease2gene, gene_gene, "./Result/NetSim_sim_DO.txt")
def calculateDisSim(disease_microbe, output_file):
    '''
    TF-IDF-weighted cosine similarity between diseases based on shared microbes.
    :param disease_microbe: 2-D list, the disease-microbe network; assumes each
        row is (microbe, disease, ..., direction) with >= 4 columns — the 4th
        column carries "increase"/"decrease"
    :param output_file: str, path where the sorted result is saved
    :return:
    '''
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    begin_time = time.perf_counter()
    microbe2disease = defaultdict(set)
    disease2microbe = defaultdict(set)
    for line in disease_microbe:
        microbe2disease[line[0]].add(line[1])
        disease2microbe[line[1]].add(line[0])
    print("there are {} diseases and {} microbes in disease-microbe.".format(
        len(disease2microbe.keys()), len(microbe2disease.keys())))
    diseases = list(disease2microbe.keys())
    microbes = list(microbe2disease.keys())
    # weight: per-(disease, microbe) association counts; E: association sign
    weight = np.zeros((len(diseases), len(microbes)))
    E = np.ones((len(diseases), len(microbes)))
    for line in disease_microbe:
        indexRow = diseases.index(line[1])
        indexCol = microbes.index(line[0])
        weight[indexRow][indexCol] += 1
        if line[3] == "increase":
            E[indexRow][indexCol] = 1
        elif line[3] == "decrease":
            E[indexRow][indexCol] = -1
    # scale counts by sign and by the IDF of each microbe across diseases
    for indexRow in range(0, len(diseases)):
        for indexCol in range(0, len(microbes)):
            weight[indexRow][indexCol] *= E[indexRow][indexCol] * math.log2(
                float(len(diseases)) / len(microbe2disease[microbes[indexCol]]))
    # ------------------------------------------------------------------
    MicrobeSim = {}
    for i in range(0, len(diseases)):
        for j in range(i + 1, len(diseases)):
            cosine_value = common.cosinValue(weight[i], weight[j])
            if cosine_value != 0:
                MicrobeSim["{}\t{}".format(diseases[i],
                                           diseases[j])] = cosine_value
    MicrobeSim = common.sortDict(MicrobeSim)
    FileUtil.writeSortedDic2File(MicrobeSim, output_file)
    end_time = time.perf_counter()
    print("MicrobeSim costs {}s".format(end_time - begin_time))
def DisSim_by_ModuleSim():
    """Run ModuleSim over disease-gene sets and a PPI interactome."""
    dis2gene = FileUtil.readFile2DictSet("./Dataset/disease-gene.txt")
    gene_net = ModuleSim.read_interactome("./Dataset/PPI.txt", False, False)
    print("number of vertices: {}, number of edges: {}".format(
        gene_net.vcount(), gene_net.ecount()))
    sims = ModuleSim.similarity_cal_spavgn(dis2gene, gene_net)
    FileUtil.write_sims(sims, "./Result/ModuleSim_sim.txt")
def read_db_config(filename='c:/temp/jquant/config.ini', section='mysql'):
    """
    Read database configuration file and return a dictionary object
    :param filename: name of the configuration file
    :param section: section of database configuration
    :return: a dictionary of database parameters, or None if the file is missing
    """
    if not FileUtil.file_exist(filename):
        # BUG FIX: previously formatted filter() (the builtin, called with no
        # arguments -> TypeError) instead of the missing filename; also fixed
        # the "exit"/"exist" typo.
        print("configuration file {} does not exist.".format(filename))
        return
    # create parser and read ini configuration file
    parser = ConfigParser()
    parser.read(filename)
    # get section, default to mysql
    db = {}
    if parser.has_section(section):
        for key, value in parser.items(section):
            db[key] = value
    else:
        raise Exception('{0} not found in the {1} file'.format(
            section, filename))
    return db
def DisSim_by_CosineDFV():
    """Run CosineDFV over disease-symptom records."""
    records = FileUtil.readFile2List("./Dataset/disease-symptom.txt",
                                     header=True)
    CosineDFV.calculateDisSim(records, "./Result/CosineDFV_sim.txt")
def DisSim_by_MicrobeSim():
    """Run MicrobeSim over disease-microbe records."""
    records = FileUtil.readFile2List("./Dataset/disease-microbe.txt",
                                     header=True)
    MicrobeSim.calculateDisSim(records, "./Result/MicrobeSim_sim.txt")
def DisSim_by_Resink():
    """Run Resnik similarity over the DO DAG, then evaluate on the DOID benchmark."""
    output_file = "./Result/Resink_sim.txt"
    DO_DAG = FileUtil.readFile2List("./Dataset/DO_DAG.txt", header=True)
    # NOTE(review): disease_genes is loaded but never handed to calculateDisSim
    # — confirm whether the association-based IC variant was intended here.
    disease_genes = FileUtil.readFile2List("./Dataset/disease-gene_SIDD.txt")
    ResinkSim.calculateDisSim(DO_DAG, output_file)
    # -------------------------------------------------------------------
    benchmark = FileUtil.readFile2List("./Evaluation/benchmark_DOID.txt")
    sim_result = FileUtil.readFile2List(output_file)
    benchmark_evaluation.evaluate_by_benchmark(benchmark, sim_result, times=10)
def DisSim_by_FunSim():
    """Run FunSim over disease-gene sets and weighted HumanNet, then evaluate."""
    output_file = "./Result/FunSim_sim.txt"
    disease2genes = FileUtil.readFile2DictSet("./Dataset/disease-gene_SIDD.txt")
    weighted_PPI = FileUtil.readFile2List(
        "./Dataset/HumanNet_symbol_weighted.txt")
    FunSim.calculateDisSim(disease2genes, weighted_PPI, output_file)
    # ------------------------ evaluation ---------------------------------------
    benchmark = FileUtil.readFile2List("./Evaluation/benchmark_DOID.txt")
    sim_result = FileUtil.readFile2List(output_file)
    benchmark_evaluation.evaluate_by_benchmark(benchmark, sim_result, times=10)
def calculateDisSim(disease_symptom, output_file):
    """
    Cosine similarity between disease feature vectors of TF-IDF symptom weights.
    :param disease_symptom: 2-D list; each row is (symptom, disease, score)
    :param output_file: str, path where the result is saved
    :return:
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    begin_time = time.perf_counter()
    disease2symptom = common.list2DictSet(disease_symptom, key=2, value=1)
    symptom2disease = common.list2DictSet(disease_symptom, key=1, value=2)
    diseases = list(disease2symptom.keys())
    symptoms = list(symptom2disease.keys())
    print("there are {} diseases and {} symptoms in disease-symptom".format(
        len(diseases), len(symptoms)))
    # --------------------------------------------------------------------------------------
    # feature weight = reported score (TF) * IDF of the symptom over diseases
    weight = np.zeros((len(symptoms), len(diseases)))
    for line in disease_symptom:
        row_index = symptoms.index(line[0])
        col_index = diseases.index(line[1])
        weight[row_index][col_index] = float(line[2]) * math.log2(
            float(len(diseases)) / len(symptom2disease[line[0]]))
    weight = weight.transpose()  # rows become diseases
    # ----------------------------------------------------------------------
    CosineDFV_sim = {}
    for i in range(0, len(diseases)):
        temp_time1 = time.perf_counter()
        for j in range(i + 1, len(diseases)):
            cosine_value = common.cosinValue(weight[i], weight[j])
            if cosine_value != 0:
                CosineDFV_sim["{}\t{}".format(diseases[i],
                                              diseases[j])] = cosine_value
        temp_time2 = time.perf_counter()
        print("{} -> {} costs {}s".format(i, diseases[i],
                                          temp_time2 - temp_time1))
    FileUtil.writeDic2File(CosineDFV_sim, output_file)
    end_time = time.perf_counter()
    print("CosineDFV costs {}s".format(end_time - begin_time))
def DisSim_by_XuanSim():
    """Run XuanSim on the DO DAG, restricted to gene-annotated diseases, then evaluate."""
    output_file = "./Result/XuanSim_sim.txt"
    DO_DAG = FileUtil.readFile2List("./Dataset/DO_DAG.txt", header=True)
    diseases2genes = FileUtil.readFile2DictSet(
        "./Dataset/disease-gene_SIDD.txt")
    XuanSim.calculateDisSim(DO_DAG, output_file,
                            selected_diseases=set(diseases2genes.keys()))
    benchmark = FileUtil.readFile2List("./Evaluation/benchmark_DOID.txt")
    sim_result = FileUtil.readFile2List(output_file)
    benchmark_evaluation.evaluate_by_benchmark(benchmark, sim_result, times=10)
def random_walk_multi_layers(multi_layers_net,
                             walk_iters=100,
                             walk_legth=160,
                             save_random_walk=False):
    '''
    Random walks over a multi-layer network; for each node, walking happens in
    the layer where that node has its heaviest incident edge.
    :param multi_layers_net: list; each entry is the edge list of one layer,
        and all layers share the same node set
    :param walk_iters: int, number of walks started from each node
    :param walk_legth: int, number of steps in a single walk (parameter name
        kept as-is for caller compatibility)
    :param save_random_walk: boolean, whether to save walks to ./random_walk.txt
    :return: list of walks
    '''
    multi_layers = defaultdict()
    for i in range(len(multi_layers_net)):
        G = read_graph(multi_layers_net[i], weighted=True)
        print("{} layer -> {} nodes and {} edges.".format(
            i, len(nx.nodes(G)), len(nx.edges(G))))
        multi_layers[i] = G
    nodes = multi_layers[0].nodes()
    walks = []
    for index, node in enumerate(nodes):
        # time.clock() was removed in Python 3.8; use perf_counter() instead
        time1 = time.perf_counter()
        # pick the layer where this node's strongest edge lives
        max_weights = list()
        for key, G in multi_layers.items():
            nei_weight = set()
            for nei_node in G.neighbors(node):
                nei_weight.add(float(G[node][nei_node]['weight']))
            max_weights.append(max(nei_weight))
        select_layer = max_weights.index(max(max_weights))
        for walk_iter in range(walk_iters):
            # NOTE(review): this random_walk(G, node, length) signature differs
            # from the edge-list random_walk defined elsewhere — confirm which
            # helper is imported here.
            walk = random_walk(multi_layers[select_layer], node, walk_legth)
            walks.append(walk)
        time2 = time.perf_counter()
        print("{} * {} -> {} layer: cost {}s".format(index, node,
                                                     select_layer + 1,
                                                     time2 - time1))
    if save_random_walk:
        FileUtil.write2DemList2File(walks, "./random_walk.txt")
    return walks
def calculateDisSim(walks, output_file, save_node_vectors=False):
    """Embed random walks with Word2Vec and write pairwise cosine similarities.

    :param walks: list of walks (each a list of node labels)
    :param output_file: str, path where the similarity dict is saved
    :param save_node_vectors: if True, keep the vectors in ./node_vectors.txt
        instead of a throwaway temp file
    """
    print("learn representations...")
    walks = [list(map(str, walk)) for walk in walks]
    model = Word2Vec(walks,
                     size=128,
                     window=10,
                     min_count=0,
                     sg=1,
                     workers=8,
                     iter=1)
    if save_node_vectors:
        temp_walk_fname = "./node_vectors.txt"
    else:
        # BUG FIX: mkstemp() returns an OPEN file descriptor; close it so the
        # fd is not leaked (the original discarded it with "_")
        fd, temp_walk_fname = tempfile.mkstemp()
        os.close(fd)
    print(temp_walk_fname)
    model.wv.save_word2vec_format(temp_walk_fname)
    node_vectors = defaultdict(list)
    with open(temp_walk_fname, 'r') as f:
        lines = f.readlines()
    # first line is the word2vec "count dim" header — skip it
    for index in range(1, len(lines)):
        parts = lines[index].strip().split(' ')
        # key: node label, value: its embedding vector
        node_vectors[parts[0]] = [float(v) for v in parts[1:]]
    dis_sim = {}
    disease = list(node_vectors.keys())
    for x in range(0, len(disease)):
        for y in range(x + 1, len(disease)):
            sim = common.cosinValue(node_vectors[disease[x]],
                                    node_vectors[disease[y]])
            if sim != 0:
                dis_sim["{}\t{}".format(disease[x], disease[y])] = sim
    FileUtil.writeDic2File(dis_sim, output_file)
def random_walk(edges, num_walks=100, walk_length=160):
    """node2vec-style random walks (p=q=1) over a weighted undirected graph.

    :param edges: 2-D list of weighted edges
    :param num_walks: int, walks started per node
    :param walk_length: int, steps per walk
    :return: list of walks (also written to ./node2vec_walks.txt)
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    time1 = time.perf_counter()
    nx_G = read_graph(edges, weighted=True, directed=False)
    print("{} nodes, {} edges.".format(len(nx.nodes(nx_G)),
                                       len(nx.edges(nx_G))))
    time2 = time.perf_counter()
    print("It cost {}s to read edges.".format(time2 - time1))
    G = Graph(nx_G, p=1, q=1)
    print("generate transition matrix......")
    G.preprocess_transition_probs()
    time3 = time.perf_counter()
    print("It cost {}s to generate transition matrix.".format(time3 - time2))
    print("begin to random walk......")
    walks = G.simulate_walks(num_walks=num_walks, walk_length=walk_length)
    time4 = time.perf_counter()
    print("It cost {}s to random walk.".format(time4 - time3))
    FileUtil.write2DemList2File(walks, "./node2vec_walks.txt")
    return walks
def read_price_file():
    """Load the Yahoo TSLA price CSV from the configured source folder.

    Returns a pandas DataFrame, or None when the file is missing.
    """
    for key, value in SystemEnv.g_price_file.items():
        print('{}={}'.format(key, value))
    price_file = os.path.join(SystemEnv.g_price_file['sourcefolder'],
                              "Yahoo_TSLA.csv")
    if not FileUtil.file_exist(price_file):
        print("Price File {} does not exist.".format(price_file))
        return
    return pd.read_csv(price_file)
def read_config(filename='config.ini'):
    """
    Read the application configuration file and populate the module-level
    globals g_mysql_connection, g_price_file and g_tick_list.
    :param filename: name of the configuration file
    :return: None (results are stored in the globals)
    """
    global g_mysql_connection
    global g_price_file
    global g_tick_list

    def configSectionMap(p_section):
        # Convert one ini section into a plain dict; options that fail to
        # parse become None instead of aborting the whole read.
        dict1 = {}
        for option in parser.options(p_section):
            try:
                dict1[option] = parser.get(p_section, option)
                if dict1[option] == -1:
                    print("skip: %s" % option)
            except Exception:
                print("exception on %s!" % option)
                dict1[option] = None
        return dict1

    if not FileUtil.file_exist(filename):
        # typo fix: "exit" -> "exist"
        print("configuration file {} does not exist.".format(filename))
        return
    # create parser and read ini configuration file
    parser = ConfigParser()
    parser.read(filename)
    # (removed an unused "sections = parser.sections()" local)
    for section in parser.sections():
        if section == ConfigSection.E_MYSQL.value:
            g_mysql_connection = configSectionMap(section)
        elif section == ConfigSection.E_PRICE_FILE.value:
            g_price_file = configSectionMap(section)
        elif section == ConfigSection.E_TICKER.value:
            g_tick_list = _listfstr(configSectionMap(section))
def line2upper(input_file, output_file):
    """Upper-case every line of input_file and write the result to output_file."""
    upper_lines = [row.upper() for row in FileUtil.readFile2List(input_file)]
    FileUtil.writeList2File(upper_lines, output_file)
def calculateDisSim(DAG, output_file, disease_genes=None):
    '''
    Resnik semantic similarity over a disease DAG: each pair scores the
    highest information content (IC) among its common ancestors.
    :param DAG: 2-D list, directed edges of an acyclic disease graph
    :param output_file: str, path where the normalized result is saved
    :param disease_genes: 2-D list, optional disease-gene associations; when
        given, IC comes from annotation frequency instead of DAG descendants
    :return:
    '''
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    begin_time = time.perf_counter()
    diseases = NetUtil.getNodes2HomoNet(DAG)
    disease_DAG = nx.DiGraph()
    disease_DAG.add_edges_from(DAG)
    # ----------------------------------------------------------------------------
    IC = defaultdict()
    if disease_genes:
        # annotation-based IC: -log2(|genes of d| / |all genes|)
        diseases_asso, genes = NetUtil.getNodes2HeterNet(disease_genes)
        disease2genes = common.list2DictSet(disease_genes, key=1, value=2)
        for di in diseases:
            if di in diseases_asso:
                IC[di] = -math.log2(float(len(disease2genes[di])) / len(genes))
            else:
                IC[di] = 0
        diseases = diseases & diseases_asso
        print("there are {} diseases for similarity based on DAG and associations.".format(len(diseases)))
    else:
        # structure-based IC: -log2(|descendants| / |diseases|)
        # NOTE(review): nx.ancestors() is used here to collect what the code
        # calls descendants — correct only if edges point child -> parent;
        # confirm the DAG edge direction. Leaf nodes get no IC entry at all.
        for di in diseases:
            descendants = nx.ancestors(disease_DAG, di)
            if descendants:
                IC[di] = -math.log2(float(len(descendants)) / len(diseases))
        print("there are {} diseases for similarity based on DAG.".format(len(diseases)))
    # --------------------------------------------------------------------------------
    print("begin to calculate disease similarity......")
    diseases = list(diseases)
    simi_matrix = np.zeros((len(diseases), len(diseases)))
    for i in range(0, len(diseases)):
        sys.stdout.flush()
        temp_time1 = time.perf_counter()
        di_A = diseases[i]
        for j in range(i + 1, len(diseases)):
            di_B = diseases[j]
            commonAncestors = getCommonAncesters(disease_DAG, di_A, di_B)
            sectionOfDoId2Gene = getSectionoFromDic(commonAncestors, IC)
            # the best (max-IC) common ancestor defines the pair's score
            newDic = sorted(sectionOfDoId2Gene.items(),
                            key=lambda x: x[1],
                            reverse=True)
            if newDic:
                simi_matrix[i][j] = newDic[0][1]
        temp_time2 = time.perf_counter()
        sys.stdout.write('\r{} -> {}, {}s'.format(i, diseases[i],
                                                  (temp_time2 - temp_time1)))
    print()
    # ---------------------------------------------------------------------------------------------
    Resnik_simi = {}
    for i in range(0, len(diseases)):
        di_A = diseases[i]
        for j in range(i + 1, len(diseases)):
            di_B = diseases[j]
            if simi_matrix[i][j] > 0:
                Resnik_simi["{}\t{}".format(di_A, di_B)] = simi_matrix[i][j]
    Resnik_simi = common.normalizeDict(Resnik_simi)
    FileUtil.writeDic2File(Resnik_simi, output_file)
    end_time = time.perf_counter()
    print("ResnikSim costs {}s.".format(end_time - begin_time))
def calculateDisSim(DAG, output_file, selected_diseases=None):
    '''
    Xuan-style DV similarity over a disease DAG: each disease gets a DV dict
    over its ancestor subgraph; pairs are scored by the shared DV mass.
    :param DAG: 2-D list, directed edges of an acyclic disease graph
    :param output_file: str, path where the result is saved
    :param selected_diseases: set, diseases to score; default None means all
    :return:
    '''
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    begin_time = time.perf_counter()
    diseases = NetUtil.getNodes2HomoNet(DAG)
    print("there are {} diseases in DAG.".format(len(diseases)))
    disease_DAG = nx.DiGraph()
    disease_DAG.add_edges_from(DAG)
    # ----------------------------------------------------------
    # Per-disease DV dict over its ancestor closure (node itself included).
    # NOTE(review): nx.descendants() collects what the code calls ancestors —
    # correct only for child -> parent edge direction; confirm.
    DV_diseases_dict = {}
    for di in diseases:
        ancestors = nx.descendants(disease_DAG, di)
        if ancestors:
            DV_disease_dict = {}
            ancestors.add(di)
            sub_graph = disease_DAG.subgraph(list(ancestors))
            for node in sub_graph.nodes:
                DV_disease_dict[node] = getNodeDVByIT(sub_graph, node)
            DV_diseases_dict[di] = DV_disease_dict
        else:
            # isolated root: only itself, with DV contribution 1
            DV_diseases_dict[di] = {di: 1}
    # -----------------------------------------------------------------------------------
    if selected_diseases:
        diseases = list(selected_diseases & diseases)
    else:
        diseases = list(diseases)
    print("{} diseases are used to calculate similarity.".format(
        len(diseases)))
    XuanSim_sim = {}
    for i in range(0, len(diseases)):
        DV_disease_A = DV_diseases_dict[diseases[i]]
        for j in range(i + 1, len(diseases)):
            DV_disease_B = DV_diseases_dict[diseases[j]]
            common_diseases = set(DV_disease_A.keys()) & set(
                DV_disease_B.keys())
            if common_diseases:
                common_DV = 0
                for di in common_diseases:
                    common_DV += DV_disease_A[di] + DV_disease_B[di]
                XuanSim_sim["{}\t{}".format(
                    diseases[i], diseases[j])] = common_DV / (
                        DV_disease_A[diseases[i]] + DV_disease_B[diseases[j]])
        if i % 100 == 0:
            print("{}->{}".format(i, len(diseases)))
    FileUtil.writeDic2File(XuanSim_sim, output_file)
    end_time = time.perf_counter()
    print("XuanSim costs {}s.".format(end_time - begin_time))
def calculateDisSim(dm_edge, mg_edge, gg_edge, output_file, walk_length=1000,
                    path_type=1, save_vectors=False):
    '''
    Meta-path random walks over a disease-miRNA-gene heterogeneous network,
    embedded with the external metapath2vec++ binary, then pairwise cosine
    disease similarity.
    :param dm_edge: 2-D list, the disease-miRNA network
    :param mg_edge: 2-D list, the miRNA-gene network
    :param gg_edge: 2-D list, the gene-gene network
    :param output_file: str, path where the result is saved
    :param walk_length: int, number of walk steps per node. Default 1000
    :param path_type: int, which meta-path the walk follows. Default 1
    :param save_vectors: boolean, whether to save the disease vectors.
        Default False (vectors are not kept)
    :return:
    '''
    # Label diseases with an "i" prefix and miRNAs with an "f" prefix so node
    # types can be told apart inside the walks.
    dm_d, dm_m = NetUtil.getNodes2HeterNet(dm_edge)
    d_name_id = NetUtil.labelNode(dm_d, "i")
    m_name_id = NetUtil.labelNode(dm_m, "f")
    print("有{}个disease加标签".format(len(d_name_id.keys())))
    print("有{}个miRNA加标签".format(len(m_name_id.keys())))
    # ------------------------------------------------------------
    # Keep only genes that appear in BOTH the miRNA-gene and gene-gene
    # networks; gene labels get an "a" prefix.
    gg_g = NetUtil.getNodes2HomoNet(gg_edge)
    mg_m, mg_g = NetUtil.getNodes2HeterNet(mg_edge)
    g_name = mg_g & gg_g
    g_name = ['a' + str(i) for i in g_name]
    print("有{}个gene加标签".format(len(g_name)))
    # -----------------------------------------------------------------------------------
    # Disease <-> miRNA adjacency lists over labelled ids (both directions).
    dm = defaultdict(list)
    md = defaultdict(list)
    for line in dm_edge:
        if line[1].upper() in dm_m:
            dm[d_name_id[line[0].upper().strip()]].append(
                m_name_id[line[1].upper()])
            md[m_name_id[line[1].upper()]].append(
                d_name_id[line[0].upper().strip()])
    # ----------------------------------------------------------------------------------------
    # Gene-gene adjacency; genes whose adjacency list length is exactly 2 are
    # then deleted (presumably pruning weakly-connected genes — TODO confirm
    # the intent of the == 2 threshold).
    gg = defaultdict(list)
    for line in gg_edge:
        g1 = 'a' + str(line[0])
        g2 = 'a' + str(line[1])
        if (g1 in g_name) & (g2 in g_name):
            gg[g1].append(g2)
            gg[g2].append(g1)
    gene_del = []
    for gene in gg.keys():
        if len(gg[gene]) == 2:
            gene_del.append(gene)
    for gene in gene_del:
        del gg[gene]
    g_name = list(gg.keys())
    # ---------------------------------------------------------------
    # miRNA <-> gene adjacency (both directions).
    # NOTE(review): test_mg_m is never filled, so the print below always
    # reports 0 for it.
    test_mg_m = set()
    mg = defaultdict(list)
    gm = defaultdict(list)
    for line in mg_edge:
        g = 'a' + str(line[1])
        if (line[0].upper() in dm_m) & (g in g_name):
            mg[m_name_id[line[0].upper().strip()]].append(g)
            gm[g].append(m_name_id[line[0].upper().strip()])
    print("mg中有{}个miRNA, 标签后{}个".format(len(test_mg_m), len(mg.keys())))
    # -------------------------------------------------------------------------------------
    print("mg中miRNA标签后{}个".format(len(mg.keys())))
    # Keep only diseases that can reach at least one retained gene via a miRNA.
    disease_list = []
    for d in dm.keys():
        ms = dm[d]
        for m in ms:
            if set(mg[m]) & set(g_name):
                disease_list.append(d)
    disease_list = list(set(disease_list))
    print("mg中miRNA标签后{}个".format(len(mg.keys())))
    # print("there are {} diseases, {} genes in random walk.".format(len(disease_list ), len(g_name )))
    print("there {} miRNA in mg, {} miRNA in dm".format(
        len(mg.keys()), len(md.keys())))
    # Meta-path constrained walks: each segment comes from
    # random_walk_disease(); on failure the bare except retries, and jj counts
    # failures until walk_length of them stop walking for this disease.
    total_walk = []
    ii = 0
    for disease in disease_list:
        jj = 0
        for i in range(walk_length):
            # print(str(ii) + ' ' + str(i))
            temp = [disease]
            for k in range(50):
                j = 1
                while ((j == 1) & (jj != walk_length)):
                    try:
                        temp2 = random_walk_disease(disease, dm, mg, gg, gm,
                                                    md, path_type)
                        temp.extend(temp2[1:])
                        # continue the next segment from the walk's endpoint
                        disease = temp2[-1]
                    except:
                        # NOTE(review): bare except silently retries on ANY
                        # error raised by random_walk_disease
                        j = 1
                        jj += 1
                    else:
                        j = 0
                if jj == walk_length:
                    break
            total_walk.append(temp)
        ii += 1
    # random_walk_path = "./Result/inputomim2.txt"
    # with open(random_walk_path, 'w') as f:
    #     for lines in total_walk:
    #         for line in lines:
    #             f.write(line + ' ')
    #         f.write('\n')
    print("learn representations...")
    # Dump the walk corpus to a temp file and embed it with the external
    # metapath2vec++ executable via os.system.
    _, temp_walk_fname = tempfile.mkstemp()
    print(temp_walk_fname)
    with open(temp_walk_fname, 'w') as f:
        for walk in total_walk:
            for line in walk:
                f.write(line + ' ')
            f.write('\n')
    _, temp_node_vec_fname = tempfile.mkstemp()
    statement = "Common/metapath2vec++ -train {} -output {} -pp 1 -size 128 -window 7 -negative 5 -threads 32".format(
        temp_walk_fname, temp_node_vec_fname)
    print(statement)
    os.system(statement)
    print("\ncalculate disease similarity...")
    # Read back only disease vectors (labels starting with the "i" prefix).
    node_vectors_path = temp_node_vec_fname + ".txt"
    node_vectors = defaultdict(list)
    with open(node_vectors_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip().split(' ')
            if line[0].startswith('i'):
                vec = line[1:]
                vec = [float(i) for i in vec]
                node_vectors[
                    line[0]] = vec  # key: labelled disease id, value: its vector
    if save_vectors:
        FileUtil.writeDicSet2File(node_vectors,
                                  "./Result/mpDisNet_node_vectors.txt")
    print()
    # Map labelled ids back to original disease names and score all pairs.
    new_label_disease = {value: key for key, value in d_name_id.items()}
    dis_sim = {}
    for x in range(0, len(disease_list)):
        for y in range(x + 1, len(disease_list)):
            sim = common.cosinValue(node_vectors[disease_list[x]],
                                    node_vectors[disease_list[y]])
            if sim != 0:
                dis_sim["{}\t{}".format(
                    new_label_disease[disease_list[x]],
                    new_label_disease[disease_list[y]])] = sim
    FileUtil.writeDic2File(dis_sim, output_file)
def calculateDisSim(phenotypes_info, tree_id2synonyms, output_file):
    '''
    Compute phenotype similarity from MeSH and OMIM data.
    :param phenotypes_info: dict, OMIM phenotype info; key: omim id,
        value: free-text description of that phenotype
    :param tree_id2synonyms: dict, MeSH tree nodes with phenotype synonyms;
        key: tree id, value: iterable of synonym strings
    :param output_file: path where the phenotype similarity is saved
    :return:
    '''
    # ------------------------- actual count -------------------------
    # Count raw occurrences of every MeSH synonym in every OMIM description.
    print("-----actual acount-------")
    omim_ids = list(phenotypes_info.keys())
    tree_ids = list(tree_id2synonyms.keys())
    actual_count = np.zeros((len(omim_ids), len(tree_ids)))
    for index_omim, omim_id in enumerate(omim_ids):
        description = phenotypes_info[omim_id]
        for index_tree, tree_id in enumerate(tree_ids):
            synonyms = tree_id2synonyms[tree_id]
            for synonym in synonyms:
                actual_count[index_omim][index_tree] += description.count(
                    synonym)
        sys.stdout.write("\r{}->{}".format(index_omim, omim_id))
    np.savetxt("./Result/actual_count.txt", actual_count, delimiter="\t",
               fmt="%d")
    # ------------------------- hierarchy count -------------------------
    # Propagate counts up the MeSH tree via calculate_counter().
    print("\n-----hirarchy_count-------")
    # NOTE(review): this is an ALIAS, not a copy — the loop below mutates
    # actual_count through hiera_count, so the "global weight" step further
    # down reads hierarchy-adjusted counts, not the raw ones saved above.
    # Confirm whether np.copy(actual_count) was intended.
    hiera_count = actual_count
    for index_omim, omim_id in enumerate(omim_ids):
        is_calculate = OrderedDict()
        tree_id_count = OrderedDict()
        for index_tree, tree_id in enumerate(tree_ids):
            tree_id_count[tree_id] = hiera_count[index_omim][index_tree]
            is_calculate[tree_id] = False
        for tree_id in tree_ids:
            if is_calculate[tree_id] == False:
                calculate_counter(tree_id, tree_ids, tree_id_count,
                                  is_calculate)
        for index_tree, tree_id in enumerate(tree_id_count.keys()):
            if is_calculate[tree_id] == True:
                hiera_count[index_omim][index_tree] = tree_id_count[tree_id]
        sys.stdout.write("\r{}->{}".format(index_omim, omim_id))
    np.savetxt("./Result/hierachy_count.txt", hiera_count, delimiter="\t",
               fmt="%f")
    # ------------------------- global weight -------------------------
    # IDF-style weight per tree node: log2(#phenotypes / #phenotypes where it
    # occurs); also record each phenotype's highest term count (mostCount).
    print("\n-----global weight-------")
    gwc_global = OrderedDict()
    for tree_id in tree_ids:
        gwc_global[tree_id] = 0
    mostCount = []
    for index_omim, omim_id in enumerate(omim_ids):
        most_occur = 0
        for index_tree, tree_id in enumerate(tree_ids):
            meshNum = float(actual_count[index_omim][index_tree])
            if meshNum > most_occur:
                most_occur = meshNum
            if meshNum > 0:
                gwc_global[tree_id] += 1
        mostCount.append(most_occur)
    for key in gwc_global.keys():
        recordNum = gwc_global[key]
        if recordNum > 0:
            gwc_global[key] = math.log2(len(omim_ids) / recordNum)
        else:
            gwc_global[key] = 0.0
    # ------------------------- local weight -------------------------
    # Combine hierarchy counts with the global weights, normalized by each
    # phenotype's most frequent term (mf), into the final feature matrix.
    print("-----local weight-------")
    gwc = list(gwc_global.values())
    weight_count = np.zeros((len(omim_ids), len(tree_ids)))
    for index_omim, omim_id in enumerate(omim_ids):
        mf = mostCount[index_omim]
        cal_list = []
        for index_tree, tree_id in enumerate(tree_ids):
            cal_list.append(float(hiera_count[index_omim][index_tree]))
        gwc_cal = np.array(gwc) * np.array(cal_list)
        gwc_list = gwc_cal.tolist()
        cal_result = []
        for score in gwc_list:
            if score > 0:
                cal_result.append(0.5 + 0.5 * (score / mf))
            else:
                cal_result.append(score)
        for index_tree, tree_id in enumerate(tree_ids):
            weight_count[index_omim][index_tree] = cal_result[index_tree]
    np.savetxt("./Result/weight_count.txt", weight_count, delimiter="\t",
               fmt="%f")
    # ------------------------- phenotype similarity -------------------------
    # Cosine similarity between the weighted phenotype feature vectors.
    print("-----phetypes similarity-------")
    similarity_socre = cosine_similarity(weight_count)
    similarity_result = {}
    for i in range(len(omim_ids)):
        for j in range(i+1, len(omim_ids)):
            if similarity_socre[i][j] != 0:
                similarity_result["{}\t{}".format(
                    omim_ids[i], omim_ids[j])] = similarity_socre[i][j]
    # sort the similarities in descending order before saving
    similarity_result = dict(sorted(similarity_result.items(),
                                    key=lambda x: x[1], reverse=True))
    FileUtil.writeDic2File(similarity_result, output_file)
else: for i in range(0, top_number): simi_line = disease_pairs2[i] # print(simi_line) for j in range(0, len(disease_pairs1)): if simi_line[0] in disease_pairs1[j] and simi_line[ 1] in disease_pairs1[j]: top_match_number += 1 break if i % 10000 == 0: print("top {} match {}.".format(i, top_match_number)) if __name__ == "__main__": file_path1 = "./benchmark_MeSH_RADAR.txt" file_path3 = "./benchmark_DOID.txt" file_path2 = "../Result/ModuleSim_sim.txt" file_path4 = "../Result/standard_Resnik_result.txt" file_path5 = "../Result/mpDisNet_sim.txt" file_path6 = "../Result/FunSim_sim.txt" # # get_basic_info( file_path1, file_path3) # get_top_number_match(file_path1, file_path2, top_number= 110000) # --------------------------------------读取以MeSH id为标签的标准集--------------------------------------- BenChmark_MeSH1 = FileUtil.readFile2List(file_path3) simi_Result = FileUtil.readFile2List(file_path4) evaluate_by_benchmark(BenChmark_MeSH1, simi_Result, times=10)
def calculateDisSim(seed_list, net, output_path):
    '''
    NetSim disease similarity: RWR (random walk with restart) over a PPI
    network seeded by each disease's genes, then symmetric FR normalization.
    :param seed_list: dict; key: str disease, value: set of its genes
    :param net: 2-D list, edges of a PPI network
    :param output_path: str, path where the sorted result is saved
    :return:
    '''
    nodes = NetUtil.getNodes2HomoNet(net)
    print("there are {} diseases.".format(len(seed_list)))
    FRValueMatrix = np.zeros((len(seed_list), len(seed_list)))
    rowOfFR = 0
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement
    time1 = time.perf_counter()
    for disease, genesOfDisease in seed_list.items():
        leavaList = getCommonNodes(nodes, genesOfDisease)
        wk = walker.Walker(net)
        if len(leavaList) > 0:
            # run RWR, then get the stationary proportion of all nodes
            temp_time1 = time.perf_counter()
            nodesPercent = wk.run_exp(leavaList, 0.7, 1)
            temp_time = time.perf_counter()
            print("{} - {} -> genes = {}, it cost {}s".format(
                rowOfFR, disease, len(leavaList), temp_time - temp_time1))
            # calculate the FR_GeneSet's value against every other disease
            colOfFR = 0
            for disease2, genesOfDisease2 in seed_list.items():
                FR = 0
                for gene in genesOfDisease2:
                    if gene in nodes:
                        FR += float(nodesPercent[gene])
                    elif gene in genesOfDisease:
                        FR += 1
                    # genes in neither set contribute nothing
                FRValueMatrix[rowOfFR][colOfFR] = FR
                colOfFR += 1
        rowOfFR += 1
    print("begin to calculate NetSim value")
    # symmetrize: NetSim(a,b) = (FR(a,b) + FR(b,a)) / (|genes_a| + |genes_b|)
    NetSimMatrix = np.zeros((len(seed_list), len(seed_list)))
    NetSimList = list(seed_list.keys())
    rowOfFP = 0
    for disease, genesOfDisease in seed_list.items():
        colOfFP = 0
        diseseGeneNum = len(genesOfDisease)
        for disease2, genesOfDisease2 in seed_list.items():
            # value comparison, not identity ("is not" in the original relied
            # on dict keys being the same objects)
            if disease != disease2:
                disease2GeneNum = len(genesOfDisease2)
                NetSimMatrix[rowOfFP][colOfFP] = (
                    FRValueMatrix[rowOfFP][colOfFP] +
                    FRValueMatrix[colOfFP][rowOfFP]) / (diseseGeneNum +
                                                        disease2GeneNum)
            colOfFP += 1
        rowOfFP += 1
    print("write the 'disease-diesae-value' to a file")
    simiResult = {}
    row, col = np.shape(NetSimMatrix)
    for i in range(0, row):
        for j in range(i + 1, col):
            if NetSimMatrix[i][j] > 0:
                simiResult['{}\t{}'.format(NetSimList[i],
                                           NetSimList[j])] = NetSimMatrix[i][j]
    sortedSimiResult = sorted(simiResult.items(),
                              key=lambda x: x[1],
                              reverse=True)
    FileUtil.writeSortedDic2File(sortedSimiResult, output_path)
    print("end")
    time2 = time.perf_counter()
    print("NetSim total cost {}s.".format(time2 - time1))
def cal_nei_sim(disease, edges, save_nei_sim=False):
    """Structural similarity from 1-hop and 2-hop degree sequences (DTW-based).

    For each disease, the sorted degree sequences of its first- and
    second-level neighbourhoods are compared pairwise with dtw_distance_fast
    (in a multiprocessing pool); distances are combined with decaying weights
    and mapped to a similarity via exp(-distance).

    :param disease: list of disease node names
    :param edges: 2-D list of edges of the heterogeneous network
    :param save_nei_sim: if True, also write sorted scores to ./nei_Sim.txt
    :return: symmetric numpy similarity matrix, len(disease) x len(disease)
    """
    print("begin to calculate similarity based on neighbours...")
    G = nx.Graph()
    G.add_edges_from(edges)  # build one graph from all the edge types
    print(
        "step 1: epsilon -> 2, calculate first degree sequence and second degree sequence..."
    )
    DegreeSequence1 = []
    DegreeSequence2 = []
    for di in disease:
        one_hop = list(G.neighbors(di))  # first-level neighbours
        # sorted degrees of the first-level neighbours
        DegreeSequence1.append(sorted(nx.degree(G, n) for n in one_hop))
        two_hop = set()
        for n in one_hop:
            two_hop.update(G.neighbors(n))  # neighbours of neighbours
        # drop the node itself from its 2-hop set; discard() (instead of
        # remove()) avoids a KeyError for a node with no neighbours
        two_hop.discard(di)
        # sorted degrees of the second-level neighbours
        DegreeSequence2.append(sorted(nx.degree(G, n) for n in two_hop))
    cores = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(cores)
    print("step 2: compute neighbour_sim in parallel with {} cpus...".format(
        cores))
    # pairwise DTW distances between first-level degree sequences
    resultsOne = [
        pool.apply_async(dtw_distance_fast,
                         (DegreeSequence1[i], DegreeSequence1[j]))
        for i in range(0, len(DegreeSequence1))
        for j in range(i + 1, len(DegreeSequence1))
    ]
    # unpack the flat async-result list into the upper triangle of a matrix
    arrOne = np.zeros((len(DegreeSequence1), len(DegreeSequence1)))
    i = 0
    j = 1
    for r in resultsOne:
        if j == len(DegreeSequence1):
            i += 1
            j = i + 1
        arrOne[i][j] = float(r.get())
        j += 1
    # pairwise DTW distances between second-level degree sequences
    resultsTwo = [
        pool.apply_async(dtw_distance_fast,
                         (DegreeSequence2[i], DegreeSequence2[j]))
        for i in range(0, len(DegreeSequence2))
        for j in range(i + 1, len(DegreeSequence2))
    ]
    arrTwo = np.zeros((len(DegreeSequence2), len(DegreeSequence2)))
    i = 0
    j = 1
    for r in resultsTwo:
        if j == len(DegreeSequence2):
            i += 1
            j = i + 1
        arrTwo[i][j] = float(r.get())
        j += 1
    # BUG FIX: the pool was never shut down, leaking worker processes
    pool.close()
    pool.join()
    # ----------------------------------------------------------------------------
    print("step 3: construct similarity matrix...")
    alpha = 0.5  # decaying weight factor in (0, 1); 2-hop counts less
    NeiSim = {}
    sim_matrix = np.zeros((len(disease), len(disease)))
    for i in range(0, len(disease)):
        for j in range(i + 1, len(disease)):
            distance = math.pow(alpha, 1) * arrOne[i][j] + math.pow(
                alpha, 2) * arrTwo[i][j]
            sim = math.exp(-distance)
            NeiSim["{}\t{}".format(disease[i], disease[j])] = sim
            sim_matrix[i][j] = sim
            sim_matrix[j][i] = sim
    if save_nei_sim:
        print("sort the path similarity and save...")
        res = sorted(NeiSim.items(), key=lambda x: x[1], reverse=True)
        FileUtil.writeSortedDic2File(res, "./nei_Sim.txt")
    return sim_matrix