Example #1
0
def DisSim_by_IDN():
    """Run the IDN similarity computation on the disease-gene dataset.

    Loads ``./Dataset/disease-gene.txt`` with ``FileUtil.readFile2List``,
    passes the loaded rows to ``IDN.calculateDisSim`` and stores the returned
    mapping in ``./Result/IDN_gene.txt`` with ``FileUtil.writeDic2File``.
    """
    source_path = "./Dataset/disease-gene.txt"
    target_path = "./Result/IDN_gene.txt"

    associations = FileUtil.readFile2List(source_path)
    FileUtil.writeDic2File(IDN.calculateDisSim(associations), target_path)
Example #2
0
def calculateDisSim(disease_symptom, output_file):
    '''
    Compute pairwise cosine similarity between diseases from weighted
    symptom vectors and write the non-zero pairs to ``output_file``.

    Every row of ``disease_symptom`` is read positionally: ``line[0]`` selects
    the symptom axis, ``line[1]`` the disease axis and ``line[2]`` is converted
    with ``float``. The weight stored for a (symptom, disease) cell is
    ``float(line[2]) * log2(n_diseases / n_diseases_with_that_symptom)``.

    :param disease_symptom: 2-D list of (symptom, disease, score) rows
    :param output_file: str, path handed to ``FileUtil.writeDic2File``
    :return: None
    '''
    # time.perf_counter() replaces time.clock(), which no longer exists in
    # Python 3.8+.
    begin_time = time.perf_counter()
    disease2symptom = common.list2DictSet(disease_symptom, key=2, value=1)
    symptom2disease = common.list2DictSet(disease_symptom, key=1, value=2)

    diseases = list(disease2symptom.keys())
    symptoms = list(symptom2disease.keys())
    print("there are {} diseases and {} symptoms in disease-symptom".format(len(diseases), len(symptoms)))

    # --------------------------------------------------------------------------------------
    # Position lookups are built once; calling list.index() for every input
    # row would rescan the whole list each time.
    symptom_pos = {symptom: pos for pos, symptom in enumerate(symptoms)}
    disease_pos = {disease: pos for pos, disease in enumerate(diseases)}
    disease_total = float(len(diseases))

    weight = np.zeros((len(symptoms), len(diseases)))
    for line in disease_symptom:
        symptom = line[0]
        weight[symptom_pos[symptom], disease_pos[line[1]]] = float(line[2]) * math.log2(
            disease_total / len(symptom2disease[symptom]))

    # After the transpose each row is the symptom-weight vector of one disease.
    weight = weight.transpose()
    # ----------------------------------------------------------------------

    CosineDFV_sim = {}
    disease_count = len(diseases)
    for i in range(disease_count):
        temp_time1 = time.perf_counter()
        for j in range(i + 1, disease_count):
            cosine_value = common.cosinValue(weight[i], weight[j])
            if cosine_value != 0:
                CosineDFV_sim["{}\t{}".format(diseases[i], diseases[j])] = cosine_value

        temp_time2 = time.perf_counter()
        print("{} -> {} costs {}s".format(i, diseases[i], temp_time2 - temp_time1))

    FileUtil.writeDic2File(CosineDFV_sim, output_file)
    end_time = time.perf_counter()

    print("CosineDFV costs {}s".format(end_time - begin_time))
Example #3
0
def calculateDisSim(walks, output_file, save_node_vectors=False):
    '''
    Learn node embeddings from random walks with Word2Vec and write the
    pairwise cosine similarity of all embedded nodes to ``output_file``.

    The trained vectors are exported in word2vec text format and read back
    from that file; pairs whose similarity is exactly 0 are not written.

    :param walks: iterable of walks; every node of a walk is converted to str
    :param output_file: str, path handed to ``FileUtil.writeDic2File``
    :param save_node_vectors: bool, when True the vectors are kept in
        ``./node_vectors.txt``; otherwise a temporary file is used and
        removed once it has been read. Default False
    :return: None
    '''
    import os  # local import: only needed for temp-file handle/cleanup

    print("learn representations...")
    walks = [list(map(str, walk)) for walk in walks]
    # NOTE(review): `size` and `iter` are the keyword names used by
    # gensim < 4.0 -- confirm against the installed gensim version.
    model = Word2Vec(walks,
                     size=128,
                     window=10,
                     min_count=0,
                     sg=1,
                     workers=8,
                     iter=1)

    if save_node_vectors:
        temp_walk_fname = "./node_vectors.txt"
    else:
        # mkstemp() hands back an already-open descriptor; close it right
        # away so it does not leak (the file is reopened by name below).
        fd, temp_walk_fname = tempfile.mkstemp()
        os.close(fd)

    print(temp_walk_fname)
    node_vectors = {}
    try:
        model.wv.save_word2vec_format(temp_walk_fname)

        with open(temp_walk_fname, 'r') as f:
            next(f, None)  # the first line is skipped, as in the index-1 loop it replaces
            for raw_line in f:
                fields = raw_line.strip().split(' ')
                # key: node label, value: its embedding vector
                node_vectors[fields[0]] = [float(v) for v in fields[1:]]
    finally:
        if not save_node_vectors:
            try:
                os.remove(temp_walk_fname)
            except OSError:
                pass  # best-effort cleanup; never mask the real error

    dis_sim = {}
    disease = list(node_vectors.keys())
    for x in range(len(disease)):
        vec_x = node_vectors[disease[x]]
        for y in range(x + 1, len(disease)):
            sim = common.cosinValue(vec_x, node_vectors[disease[y]])
            if sim != 0:
                dis_sim["{}\t{}".format(disease[x], disease[y])] = sim

    FileUtil.writeDic2File(dis_sim, output_file)
Example #4
0
def calculateDisSim(dm_edge,
                    mg_edge,
                    gg_edge,
                    output_file,
                    walk_length=1000,
                    path_type=1,
                    save_vectors=False):
    '''
    Compute disease similarity from random walks over a disease-miRNA-gene
    network and write the result to ``output_file``.

    Steps: label the nodes, build adjacency lists, generate walks with
    ``random_walk_disease``, embed the walks by running the external
    ``Common/metapath2vec++`` program, then take the cosine similarity of
    the disease vectors (labels starting with ``'i'``).

    :param dm_edge: 2-D list representing the disease-miRNA network
    :param mg_edge: 2-D list representing the miRNA-gene network
    :param gg_edge: 2-D list representing the gene-gene network
    :param output_file: str, path where the result is saved
    :param walk_length: int, used both as the number of walks started per
        disease and as the cap on failed walk steps per disease.
        Default 1000
    :param path_type: int, the specific path of the random walk; passed
        through to ``random_walk_disease``. Default 1
    :param save_vectors: bool, whether to save the disease vectors to
        ``./Result/mpDisNet_node_vectors.txt``. Default False (not saved)
    :return: None
    '''

    dm_d, dm_m = NetUtil.getNodes2HeterNet(dm_edge)
    d_name_id = NetUtil.labelNode(dm_d, "i")
    m_name_id = NetUtil.labelNode(dm_m, "f")

    print("有{}个disease加标签".format(len(d_name_id.keys())))
    print("有{}个miRNA加标签".format(len(m_name_id.keys())))
    # ------------------------------------------------------------

    gg_g = NetUtil.getNodes2HomoNet(gg_edge)
    _, mg_g = NetUtil.getNodes2HeterNet(mg_edge)
    # Genes present in both the miRNA-gene and gene-gene networks, prefixed
    # with 'a' to form their labels.
    g_name = mg_g & gg_g
    g_name = ['a' + str(i) for i in g_name]

    print("有{}个gene加标签".format(len(g_name)))

    # -----------------------------------------------------------------------------------
    # disease -> miRNA and miRNA -> disease adjacency lists (by label)
    dm = defaultdict(list)
    md = defaultdict(list)

    for line in dm_edge:
        if line[1].upper() in dm_m:
            d_label = d_name_id[line[0].upper().strip()]
            m_label = m_name_id[line[1].upper()]
            dm[d_label].append(m_label)
            md[m_label].append(d_label)

    # ----------------------------------------------------------------------------------------
    # gene <-> gene adjacency; a set gives O(1) membership tests
    gene_labels = set(g_name)
    gg = defaultdict(list)
    for line in gg_edge:
        g1 = 'a' + str(line[0])
        g2 = 'a' + str(line[1])
        if g1 in gene_labels and g2 in gene_labels:
            gg[g1].append(g2)
            gg[g2].append(g1)

    # Genes whose neighbour list has exactly two entries are dropped.
    gene_del = [gene for gene in gg if len(gg[gene]) == 2]
    for gene in gene_del:
        del gg[gene]

    g_name = list(gg.keys())
    gene_labels = set(g_name)
    # ---------------------------------------------------------------
    # NOTE(review): test_mg_m is never filled, so the first number printed
    # below is always 0 -- confirm what it was meant to count.
    test_mg_m = set()
    mg = defaultdict(list)
    gm = defaultdict(list)

    for line in mg_edge:
        g = 'a' + str(line[1])
        if line[0].upper() in dm_m and g in gene_labels:
            m_label = m_name_id[line[0].upper().strip()]
            mg[m_label].append(g)
            gm[g].append(m_label)

    print("mg中有{}个miRNA, 标签后{}个".format(len(test_mg_m), len(mg.keys())))

    # -------------------------------------------------------------------------------------

    print("mg中miRNA标签后{}个".format(len(mg.keys())))
    disease_list = []
    for d in dm.keys():
        for m in dm[d]:
            # mg is a defaultdict: looking up a miRNA that has no gene adds
            # an empty entry, so len(mg) can grow here (compare the prints
            # before and after this loop).
            if set(mg[m]) & gene_labels:
                disease_list.append(d)
    disease_list = list(set(disease_list))

    print("mg中miRNA标签后{}个".format(len(mg.keys())))
    print("there {} miRNA in mg, {} miRNA in dm".format(
        len(mg.keys()), len(md.keys())))

    total_walk = []
    for disease in disease_list:
        jj = 0  # failed walk steps for this disease, capped at walk_length
        for i in range(walk_length):
            temp = [disease]
            for k in range(50):
                j = 1  # 1 = this segment still has to be produced
                while j == 1 and jj != walk_length:
                    try:
                        temp2 = random_walk_disease(disease, dm, mg, gg, gm,
                                                    md, path_type)
                        temp.extend(temp2[1:])
                        # NOTE(review): `disease` is rebound to the last node
                        # of the segment, so the next segment -- and the next
                        # walk -- starts from there, not from the original
                        # disease. Confirm this is intended.
                        disease = temp2[-1]
                    except Exception:
                        # A failed step is retried; Exception (not a bare
                        # except) so Ctrl-C / SystemExit still stop the run.
                        j = 1
                        jj += 1
                    else:
                        j = 0
                if jj == walk_length:
                    break
            total_walk.append(temp)

    print("learn representations...")

    # mkstemp() returns an open descriptor; close it so it does not leak
    # (the file is reopened by name just below).
    walk_fd, temp_walk_fname = tempfile.mkstemp()
    os.close(walk_fd)

    print(temp_walk_fname)
    with open(temp_walk_fname, 'w') as f:
        for walk in total_walk:
            # one walk per line, every node followed by a single space
            f.write(''.join(node + ' ' for node in walk))
            f.write('\n')

    vec_fd, temp_node_vec_fname = tempfile.mkstemp()
    os.close(vec_fd)

    statement = "Common/metapath2vec++ -train {} -output {} -pp 1 -size 128 -window 7 -negative 5 -threads 32".format(
        temp_walk_fname, temp_node_vec_fname)

    print(statement)
    os.system(statement)

    print("\ncalculate disease similarity...")
    # The vectors are read from "<output>.txt".
    node_vectors_path = temp_node_vec_fname + ".txt"
    # defaultdict on purpose: a disease without a vector yields [] below.
    node_vectors = defaultdict(list)
    with open(node_vectors_path, 'r') as f:
        for line in f:
            line = line.strip().split(' ')
            if line[0].startswith('i'):
                # key: disease label, value: the corresponding vector
                node_vectors[line[0]] = [float(i) for i in line[1:]]

    if save_vectors:
        FileUtil.writeDicSet2File(node_vectors,
                                  "./Result/mpDisNet_node_vectors.txt")
        print()

    # label -> original disease name, for the output keys
    new_label_disease = {value: key for key, value in d_name_id.items()}
    dis_sim = {}
    for x in range(0, len(disease_list)):
        for y in range(x + 1, len(disease_list)):
            sim = common.cosinValue(node_vectors[disease_list[x]],
                                    node_vectors[disease_list[y]])
            if sim != 0:
                dis_sim["{}\t{}".format(
                    new_label_disease[disease_list[x]],
                    new_label_disease[disease_list[y]])] = sim

    FileUtil.writeDic2File(dis_sim, output_file)
Example #5
0
def calculateDisSim(DAG, output_file, disease_genes = None):
    '''
    Compute Resnik-style disease similarity: the score of a pair is the
    largest information content (IC) among the pair's common ancestors, as
    returned by ``getCommonAncesters`` / ``getSectionoFromDic``. Positive
    scores are normalised with ``common.normalizeDict`` and written out.

    :param DAG: 2-D list representing a directed acyclic graph
    :param output_file: str, path where the result is stored
    :param disease_genes: 2-D list of disease-gene associations. When given,
        IC comes from the number of genes per disease and only diseases
        present in both inputs are compared; otherwise IC comes from the
        size of ``nx.ancestors`` of each node in the DAG. Default None
    :return: None
    '''
    # time.perf_counter() replaces time.clock(), which no longer exists in
    # Python 3.8+.
    begin_time = time.perf_counter()
    diseases = NetUtil.getNodes2HomoNet(DAG)
    disease_DAG = nx.DiGraph()
    disease_DAG.add_edges_from(DAG)

    # ----------------------------------------------------------------------------
    IC = {}
    if disease_genes:

        diseases_asso, genes = NetUtil.getNodes2HeterNet(disease_genes)
        disease2genes = common.list2DictSet(disease_genes, key= 1, value= 2)
        gene_total = len(genes)

        for di in diseases:
            if di in diseases_asso:
                IC[di] = - math.log2(float(len(disease2genes[di])) / gene_total)
            else:
                # DAG node without any gene association
                IC[di] = 0

        diseases = diseases & diseases_asso
        print("there are {} diseases for similarity based on DAG and associations.".format(len(diseases)))
    else:

        disease_total = len(diseases)
        for di in diseases:
            # Nodes with no nx.ancestors() get no IC entry at all.
            reaching_nodes = nx.ancestors(disease_DAG, di)
            if reaching_nodes:
                IC[di] = - math.log2(float(len(reaching_nodes)) / disease_total)

        print("there are {} diseases for similarity based on DAG.".format(len(diseases)))
    # --------------------------------------------------------------------------------

    print("begin to calculate disease similarity......")

    diseases = list(diseases)
    size = len(diseases)
    simi_matrix = np.zeros((size, size))

    for i in range(size):

        sys.stdout.flush()
        temp_time1 = time.perf_counter()
        di_A = diseases[i]
        for j in range(i + 1, size):
            di_B = diseases[j]
            commonAncestors = getCommonAncesters(disease_DAG, di_A, di_B)
            sectionOfDoId2Gene = getSectionoFromDic(commonAncestors, IC)
            # Only the largest IC is needed, so take max() rather than
            # sorting every item.
            ic_values = list(sectionOfDoId2Gene.values())
            if ic_values:
                simi_matrix[i][j] = max(ic_values)

        temp_time2 = time.perf_counter()
        sys.stdout.write('\r{} -> {}, {}s'.format(i, diseases[i], (temp_time2 - temp_time1)))

    print()
    # ---------------------------------------------------------------------------------------------
    # Keep only the strictly positive upper-triangle entries.
    Resnik_simi = {}
    for i in range(size):
        di_A = diseases[i]
        for j in range(i + 1, size):
            if simi_matrix[i][j] > 0:
                Resnik_simi["{}\t{}".format( di_A,  diseases[j])] = simi_matrix[i][j]

    Resnik_simi = common.normalizeDict(Resnik_simi)
    FileUtil.writeDic2File(Resnik_simi, output_file)

    end_time = time.perf_counter()
    print("ResnikSim costs {}s.".format(end_time - begin_time))
Example #6
0
def calculateDisSim(DAG, output_file, selected_diseases=None):
    '''
    Compute disease similarity from per-disease "DV" values over the DAG.

    For every disease a dict of DV values is built over the sub-graph made
    of the disease itself plus everything ``nx.descendants`` returns for it
    (values come from ``getNodeDVByIT``); a disease with no such nodes gets
    ``{disease: 1}``. The score of a pair is the sum of both DV values over
    the shared nodes, divided by the sum of each disease's own DV value.

    :param DAG: 2-D list representing a directed acyclic graph
    :param output_file: str, path where the result is stored
    :param selected_diseases: set, the diseases whose similarity should be
        computed (intersected with the DAG nodes). Default None = all nodes
    :return: None
    '''
    # time.perf_counter() replaces time.clock(), which no longer exists in
    # Python 3.8+.
    begin_time = time.perf_counter()
    diseases = NetUtil.getNodes2HomoNet(DAG)
    print("there are {} diseases in DAG.".format(len(diseases)))

    disease_DAG = nx.DiGraph()
    disease_DAG.add_edges_from(DAG)
    # ----------------------------------------------------------

    DV_diseases_dict = {}
    for di in diseases:
        reachable = nx.descendants(disease_DAG, di)
        if reachable:
            reachable.add(di)
            sub_graph = disease_DAG.subgraph(list(reachable))
            DV_diseases_dict[di] = {
                node: getNodeDVByIT(sub_graph, node)
                for node in sub_graph.nodes
            }
        else:
            DV_diseases_dict[di] = {di: 1}

    # -----------------------------------------------------------------------------------
    if selected_diseases:
        diseases = list(selected_diseases & diseases)
    else:
        diseases = list(diseases)

    print("{} diseases are used to calculate similarity.".format(
        len(diseases)))
    XuanSim_sim = {}
    for i in range(0, len(diseases)):
        DV_disease_A = DV_diseases_dict[diseases[i]]
        # Invariant for the whole inner loop, so computed once per disease A.
        nodes_A = set(DV_disease_A.keys())
        own_DV_A = DV_disease_A[diseases[i]]
        for j in range(i + 1, len(diseases)):
            DV_disease_B = DV_diseases_dict[diseases[j]]
            common_diseases = nodes_A & set(DV_disease_B.keys())
            if common_diseases:
                # Explicit accumulation (not sum()) keeps the floating-point
                # addition order exactly as before.
                common_DV = 0
                for di in common_diseases:
                    common_DV += DV_disease_A[di] + DV_disease_B[di]
                XuanSim_sim["{}\t{}".format(
                    diseases[i],
                    diseases[j])] = common_DV / (own_DV_A +
                                                 DV_disease_B[diseases[j]])
        if i % 100 == 0:
            print("{}->{}".format(i, len(diseases)))
    FileUtil.writeDic2File(XuanSim_sim, output_file)
    end_time = time.perf_counter()

    print("XuanSim costs {}s.".format(end_time - begin_time))
Example #7
0
def calculateDisSim(phenotypes_info, tree_id2synonyms, output_file):
    '''
    Compute phenotype similarity from MeSH and OMIM data.

    Pipeline: count synonym occurrences per description, adjust the counts
    with ``calculate_counter``, apply a global (log2 inverse record
    frequency) and a local (scaled by the row's largest count) weight, then
    take the cosine similarity of the weighted rows. The intermediate
    matrices are saved under ``./Result/``.

    :param phenotypes_info: phenotype information from the OMIM database,
        dict; key: omim id, value: description of that phenotype
    :param tree_id2synonyms: MeSH tree nodes and their phenotype synonyms,
        dict; key: tree id, value: phenotype synonyms
    :param output_file: path where the phenotype similarity is stored
    :return: None
    '''

    # ------------------------------- actual count ----------------------------------
    print("-----actual acount-------")
    omim_ids = list(phenotypes_info.keys())
    tree_ids = list(tree_id2synonyms.keys())

    actual_count = np.zeros((len(omim_ids), len(tree_ids)))
    for index_omim, omim_id in enumerate(omim_ids):
        description = phenotypes_info[omim_id]
        for index_tree, tree_id in enumerate(tree_ids):
            # total occurrences of any synonym of this tree node
            actual_count[index_omim, index_tree] = sum(
                description.count(synonym)
                for synonym in tree_id2synonyms[tree_id])

        sys.stdout.write("\r{}->{}".format(index_omim, omim_id))

    np.savetxt("./Result/actual_count.txt", actual_count, delimiter="\t", fmt="%d")

    # --------------------- hierarchy count -----------
    print("\n-----hirarchy_count-------")
    # NOTE(review): this binds a second name to the SAME array (no copy), so
    # the in-place updates below are also visible through `actual_count`,
    # and the global-weight step further down reads the adjusted values.
    # Kept as-is to preserve results -- confirm whether a copy was intended.
    hiera_count = actual_count
    for index_omim, omim_id in enumerate(omim_ids):
        tree_id_count = OrderedDict(zip(tree_ids, hiera_count[index_omim]))
        is_calculate = OrderedDict.fromkeys(tree_ids, False)

        for tree_id in tree_ids:
            if not is_calculate[tree_id]:
                calculate_counter(tree_id, tree_ids, tree_id_count, is_calculate)

        # Write back only the entries that calculate_counter flagged.
        for index_tree, tree_id in enumerate(tree_id_count.keys()):
            if is_calculate[tree_id]:
                hiera_count[index_omim][index_tree] = tree_id_count[tree_id]

        sys.stdout.write("\r{}->{}".format(index_omim, omim_id))

    np.savetxt("./Result/hierachy_count.txt", hiera_count, delimiter="\t", fmt="%f")

    # ---------------------------------- global weight ------------------------------------
    print("\n-----global weight-------")
    # Number of records with a positive count, per tree id.
    gwc_global = OrderedDict((tree_id, 0) for tree_id in tree_ids)

    mostCount = []  # per record: its largest count, floored at 0
    for index_omim in range(len(omim_ids)):
        row_counts = actual_count[index_omim].tolist()
        mostCount.append(max([0] + row_counts))
        for tree_id, meshNum in zip(tree_ids, row_counts):
            if meshNum > 0:
                gwc_global[tree_id] += 1

    record_total = len(omim_ids)
    gwc = np.array([
        math.log2(record_total / recordNum) if recordNum > 0 else 0.0
        for recordNum in gwc_global.values()
    ])

    # ------------------------------ local weight ------------------------------
    print("-----local weight-------")
    weight_count = np.zeros((len(omim_ids), len(tree_ids)))
    for index_omim in range(len(omim_ids)):
        mf = mostCount[index_omim]
        scores = (gwc * hiera_count[index_omim]).tolist()
        # Positive scores are rescaled; others are stored unchanged.
        weight_count[index_omim] = [
            0.5 + 0.5 * (score / mf) if score > 0 else score
            for score in scores
        ]

    np.savetxt("./Result/weight_count.txt", weight_count, delimiter="\t", fmt="%f")

    # --------------------------------- phenotype similarity ---------------------------------
    print("-----phetypes similarity-------")
    # Disease similarity is the cosine similarity of the weighted rows.
    similarity_socre = cosine_similarity(weight_count)
    similarity_result = {}
    for i in range(len(omim_ids)):
        for j in range(i+1, len(omim_ids)):
            if similarity_socre[i][j] != 0:
                similarity_result["{}\t{}".format(omim_ids[i], omim_ids[j])] = similarity_socre[i][j]

    # Sort the pairs by similarity, highest first.
    similarity_result = dict(sorted(similarity_result.items(),key = lambda x:x[1], reverse=True))
    FileUtil.writeDic2File(similarity_result, output_file)