Exemple #1
0
def DisSim_by_mpDisNet():
    """Run the mpDisNet pipeline and evaluate its output against a benchmark.

    Loads the three edge lists under ``./Dataset/mpDisNet/``, lets
    ``mpDisNet.calculateDisSim`` write its similarities to
    ``./Result/mpDisNet_sim.txt`` (with ``path_type=2``), then reloads that
    file and hands it, together with the ``benchmark_MeSH_RADAR.txt`` list,
    to ``benchmark_evaluation.evaluate_by_benchmark``.
    """
    edge_files = (
        "./Dataset/mpDisNet/omim_dm.txt",
        "./Dataset/mpDisNet/omim_mg.txt",
        "./Dataset/mpDisNet/PPI.txt",
    )
    # Files are read in the order listed above.
    disease_miRNA, miRNA_gene, gene_gene = [
        FileUtil.readFile2List(path) for path in edge_files
    ]

    result_path = "./Result/mpDisNet_sim.txt"
    mpDisNet.calculateDisSim(
        disease_miRNA, miRNA_gene, gene_gene, result_path, path_type=2)

    # ------------------------ evaluation ------------------------------------
    benchmark_rows = FileUtil.readFile2List(
        "./Evaluation/benchmark_MeSH_RADAR.txt")
    predicted_rows = FileUtil.readFile2List(result_path)
    benchmark_evaluation.evaluate_by_benchmark(
        benchmark_rows, predicted_rows, times=10)
Exemple #2
0
def cal_path_sim(disease, edges, save_path_sim=False):
    """Compute a pairwise similarity matrix from shared second-column nodes.

    A binary matrix ``A`` is built with one row per entry of ``disease`` and
    one column per distinct second-column node of ``edges``.  With
    ``W = A . A^T`` every pair ``(i, j)`` is scored as
    ``2 * W[i][j] / (W[i][i] + W[j][j])``.

    :param disease: list of first-column node names; fixes the row order
    :param edges: rows whose items 0 and 1 are the two endpoints of an edge
    :param save_path_sim: when true, also write the pairs, sorted by
        descending score, to ``./path_sim.txt``
    :return: symmetric ``len(disease) x len(disease)`` numpy array whose
        diagonal is left at zero
    :raises KeyError: if an edge names a node missing from ``disease`` or
        from the collected second-column nodes
    """
    print("begin calculate similarity based on path...")
    pathway = list(NetUtil.getColNodes(edges, col=1))

    # Position tables replace a list.index() scan per edge.  setdefault keeps
    # the first position of a repeated name, which is what index() returned.
    row_of = {}
    for position, name in enumerate(disease):
        row_of.setdefault(name, position)
    col_of = {}
    for position, name in enumerate(pathway):
        col_of.setdefault(name, position)

    ajaMatrix = np.zeros((len(disease), len(pathway)))
    for line in edges:
        ajaMatrix[row_of[line[0]]][col_of[line[1]]] = float(1)

    W = np.dot(ajaMatrix, ajaMatrix.T)

    print("construct similarity matrix...")
    pathSim = {}
    sim_matrix = np.zeros((len(disease), len(disease)))
    for i in range(len(disease)):
        for j in range(i + 1, len(disease)):
            # Computed once and reused for the dict and both matrix cells.
            score = 2 * W[i][j] / (W[i][i] + W[j][j])
            pathSim["{}\t{}".format(disease[i], disease[j])] = score
            sim_matrix[i][j] = score
            sim_matrix[j][i] = score

    if save_path_sim:
        print("sort the path similarity and save...")
        res = sorted(pathSim.items(), key=lambda x: x[1], reverse=True)
        FileUtil.writeSortedDic2File(res, "./path_sim.txt")

    return sim_matrix
Exemple #3
0
def con_single_layer_net(edges, filter_value=0.1):
    """Fuse neighbour- and path-based similarities into one weighted edge list.

    The first-column nodes of ``edges`` are paired up.  Each pair is scored
    as the equal-weight mean of the ``cal_nei_sim`` and ``cal_path_sim``
    matrices and kept only when the score is strictly greater than
    ``filter_value``.  The kept pairs are also sorted with
    ``common.sortDict`` and written to ``./layer_net.txt``.

    :param edges: edge rows of the two-column network
    :param filter_value: threshold a fused score has to exceed to be kept
    :return: list of ``[node_a, node_b, fused_score]`` rows
    """
    first_col, second_col = NetUtil.getNodes2HeterNet(edges)
    names = list(first_col)

    print("1st col -> {}\t 2nd col -> {}".format(len(names), len(second_col)))

    neighbour_scores = cal_nei_sim(names, edges, save_nei_sim=True)
    path_scores = cal_path_sim(names, edges, save_path_sim=True)

    kept_rows = []
    kept_scores = {}
    for a, name_a in enumerate(names):
        for b in range(a + 1, len(names)):
            fused = 0.5 * neighbour_scores[a][b] + 0.5 * path_scores[a][b]
            if fused > filter_value:
                name_b = names[b]
                kept_rows.append([name_a, name_b, fused])
                kept_scores["{}\t{}".format(name_a, name_b)] = fused

    print("complete a single layer similarity network...")
    FileUtil.writeSortedDic2File(common.sortDict(kept_scores),
                                 "./layer_net.txt")

    return kept_rows
Exemple #4
0
def DisSim_by_IDN():
    """Compute IDN disease similarities and write them to disk.

    Reads ``./Dataset/disease-gene.txt``, passes the rows to
    ``IDN.calculateDisSim`` and writes the returned mapping to
    ``./Result/IDN_gene.txt``.
    """
    gene_rows = FileUtil.readFile2List("./Dataset/disease-gene.txt")
    similarities = IDN.calculateDisSim(gene_rows)
    FileUtil.writeDic2File(similarities, "./Result/IDN_gene.txt")
Exemple #5
0
def DisSim_by_NetSim():
    """Run NetSim on the SIDD disease-gene table and the HumanNet edge list.

    The disease-gene file is read with ``header=True`` into a dict of sets,
    the network file into a list, and ``NetSim.calculateDisSim`` is asked to
    write its result to ``./Result/NetSim_sim_DO.txt``.
    """
    disease2gene = FileUtil.readFile2DictSet(
        "./Dataset/disease-gene_SIDD.txt", header=True)
    gene_gene = FileUtil.readFile2List("./Dataset/HumanNet_symbol.txt")

    NetSim.calculateDisSim(disease2gene, gene_gene,
                           "./Result/NetSim_sim_DO.txt")
Exemple #6
0
def calculateDisSim(disease_microbe, output_file):
    '''
    Score disease pairs by the cosine of their signed, IDF-weighted microbe
    profiles and write the sorted result to ``output_file``.

    Each input row is read as: item 0 = microbe, item 1 = disease,
    item 3 = direction of change ("increase" or "decrease").  A cell of the
    disease x microbe matrix holds the number of rows for that pair,
    multiplied by ``log2(#diseases / #diseases linked to the microbe)`` and
    by -1 when the last row seen for the pair says "decrease".

    :param disease_microbe: 2-D list representing the disease-microbe network
    :param output_file: str, path the result is saved to
    :return: None
    '''
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    begin_time = time.perf_counter()
    microbe2disease = defaultdict(set)
    disease2microbe = defaultdict(set)

    for line in disease_microbe:
        microbe2disease[line[0]].add(line[1])
        disease2microbe[line[1]].add(line[0])

    print("there are {} diseases and {} microbes in disease-microbe.".format(
        len(disease2microbe), len(microbe2disease)))

    diseases = list(disease2microbe.keys())
    microbes = list(microbe2disease.keys())
    # Dict keys are unique, so these tables return exactly what
    # list.index() did, without a linear scan per input row.
    row_of = {name: position for position, name in enumerate(diseases)}
    col_of = {name: position for position, name in enumerate(microbes)}

    weight = np.zeros((len(diseases), len(microbes)))
    E = np.ones((len(diseases), len(microbes)))

    for line in disease_microbe:
        indexRow = row_of[line[1]]
        indexCol = col_of[line[0]]

        weight[indexRow][indexCol] += 1
        if line[3] == "increase":
            E[indexRow][indexCol] = 1
        elif line[3] == "decrease":
            E[indexRow][indexCol] = -1

    # The inverse-document-frequency factor depends on the microbe (column)
    # only, so it is computed once per column and broadcast over the rows.
    idf = np.array([
        math.log2(float(len(diseases)) / len(microbe2disease[microbe]))
        for microbe in microbes
    ])
    weight *= E * idf

    # ------------------------------------------------------------------
    MicrobeSim = {}
    for i in range(len(diseases)):
        for j in range(i + 1, len(diseases)):
            cosine_value = common.cosinValue(weight[i], weight[j])
            if cosine_value != 0:
                MicrobeSim["{}\t{}".format(diseases[i],
                                           diseases[j])] = cosine_value

    MicrobeSim = common.sortDict(MicrobeSim)
    FileUtil.writeSortedDic2File(MicrobeSim, output_file)
    end_time = time.perf_counter()

    print("MicrobeSim costs {}s".format(end_time - begin_time))
Exemple #7
0
def DisSim_by_ModuleSim():
    """Run ModuleSim on the disease-gene table and the PPI interactome.

    Loads ``./Dataset/disease-gene.txt`` as a dict of sets, reads
    ``./Dataset/PPI.txt`` through ``ModuleSim.read_interactome`` (both flag
    arguments ``False``), prints the size of the resulting graph and writes
    the similarities from ``ModuleSim.similarity_cal_spavgn`` to
    ``./Result/ModuleSim_sim.txt``.
    """
    dis2gene = FileUtil.readFile2DictSet("./Dataset/disease-gene.txt")
    gene_net = ModuleSim.read_interactome("./Dataset/PPI.txt", False, False)

    vertex_count = gene_net.vcount()
    edge_count = gene_net.ecount()
    print("number of vertices: {}, number of edges: {}".format(
        vertex_count, edge_count))

    FileUtil.write_sims(ModuleSim.similarity_cal_spavgn(dis2gene, gene_net),
                        "./Result/ModuleSim_sim.txt")
Exemple #8
0
def read_db_config(filename='c:/temp/jquant/config.ini', section='mysql'):
    """ Read database configuration file and return a dictionary object
    :param filename: name of the configuration file
    :param section: section of database configuration
    :return: a dictionary of database parameters, or None when ``filename``
        does not exist (a message is printed in that case)
    :raises Exception: when the file has no section named ``section``
    """
    if not FileUtil.file_exist(filename):
        # The original formatted ``filter()`` here, which raises TypeError
        # instead of reporting the missing file.
        print("configuration file {} does not exist.".format(filename))
        return None

    # create parser and read ini configuration file
    parser = ConfigParser()
    parser.read(filename)

    if not parser.has_section(section):
        raise Exception('{0} not found in the {1} file'.format(
            section, filename))

    # e.g. {'host': 'localhost', 'database': 'quant', 'user': '******',
    #       'password': '******'}
    return dict(parser.items(section))
Exemple #9
0
def DisSim_by_CosineDFV():
    """Run CosineDFV on the disease-symptom table.

    Reads ``./Dataset/disease-symptom.txt`` with ``header=True`` and lets
    ``CosineDFV.calculateDisSim`` write to ``./Result/CosineDFV_sim.txt``.
    """
    symptom_rows = FileUtil.readFile2List("./Dataset/disease-symptom.txt",
                                          header=True)
    CosineDFV.calculateDisSim(symptom_rows, "./Result/CosineDFV_sim.txt")
Exemple #10
0
def DisSim_by_MicrobeSim():
    """Run MicrobeSim on the disease-microbe table.

    Reads ``./Dataset/disease-microbe.txt`` with ``header=True`` and lets
    ``MicrobeSim.calculateDisSim`` write to ``./Result/MicrobeSim_sim.txt``.
    """
    microbe_rows = FileUtil.readFile2List("./Dataset/disease-microbe.txt",
                                          header=True)
    MicrobeSim.calculateDisSim(microbe_rows, "./Result/MicrobeSim_sim.txt")
Exemple #11
0
def DisSim_by_Resink():
    """Run ResinkSim on the DO DAG and evaluate it against the DOID benchmark.

    ``ResinkSim.calculateDisSim`` writes to ``./Result/Resink_sim.txt``; that
    file is then reloaded and passed with the benchmark rows to
    ``benchmark_evaluation.evaluate_by_benchmark``.
    """
    result_path = "./Result/Resink_sim.txt"

    dag_rows = FileUtil.readFile2List("./Dataset/DO_DAG.txt", header=True)
    # NOTE(review): the association table is loaded but never passed to
    # ResinkSim.calculateDisSim -- confirm whether that is intended.
    disease_genes = FileUtil.readFile2List("./Dataset/disease-gene_SIDD.txt")

    ResinkSim.calculateDisSim(dag_rows, result_path)

    # -------------------------------------------------------------------
    benchmark_rows = FileUtil.readFile2List("./Evaluation/benchmark_DOID.txt")
    predicted_rows = FileUtil.readFile2List(result_path)
    benchmark_evaluation.evaluate_by_benchmark(
        benchmark_rows, predicted_rows, times=10)
Exemple #12
0
def DisSim_by_FunSim():
    """Run FunSim and evaluate its output against the DOID benchmark.

    The SIDD disease-gene table is loaded as a dict of sets and the weighted
    HumanNet file as a list.  ``FunSim.calculateDisSim`` writes to
    ``./Result/FunSim_sim.txt``, which is then reloaded and passed with the
    benchmark rows to ``benchmark_evaluation.evaluate_by_benchmark``.
    """
    result_path = "./Result/FunSim_sim.txt"

    disease2genes = FileUtil.readFile2DictSet(
        "./Dataset/disease-gene_SIDD.txt")
    weighted_PPI = FileUtil.readFile2List(
        "./Dataset/HumanNet_symbol_weighted.txt")
    FunSim.calculateDisSim(disease2genes, weighted_PPI, result_path)

    # ------------------------ evaluation ------------------------------------
    benchmark_rows = FileUtil.readFile2List("./Evaluation/benchmark_DOID.txt")
    predicted_rows = FileUtil.readFile2List(result_path)
    benchmark_evaluation.evaluate_by_benchmark(
        benchmark_rows, predicted_rows, times=10)
Exemple #13
0
def calculateDisSim(disease_symptom, output_file):
    '''
    Score disease pairs by the cosine of their IDF-weighted symptom vectors
    and write the non-zero scores to ``output_file``.

    Each input row is read as: item 0 = symptom, item 1 = disease,
    item 2 = a numeric weight.  A matrix cell is that weight times
    ``log2(#diseases / #diseases linked to the symptom)``.

    :param disease_symptom: 2-D list of disease-symptom rows
    :param output_file: str, path the result is saved to
    :return: None
    '''
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    begin_time = time.perf_counter()
    # NOTE(review): key/value look like 1-based column numbers -- confirm
    # against common.list2DictSet.
    disease2symptom = common.list2DictSet(disease_symptom, key=2, value=1)
    symptom2disease = common.list2DictSet(disease_symptom, key=1, value=2)

    diseases = list(disease2symptom.keys())
    symptoms = list(symptom2disease.keys())
    print("there are {} diseases and {} symptoms in disease-symptom".format(len(diseases), len(symptoms)))

    # ----------------------------------------------------------------------
    # Dict keys are unique, so these tables return what list.index() did
    # without a linear scan per input row.
    row_of = {name: position for position, name in enumerate(symptoms)}
    col_of = {name: position for position, name in enumerate(diseases)}

    weight = np.zeros((len(symptoms), len(diseases)))
    for line in disease_symptom:
        row_index = row_of[line[0]]
        col_index = col_of[line[1]]

        weight[row_index][col_index] = float(line[2]) * math.log2(
            float(len(diseases)) / len(symptom2disease[line[0]]))

    # One row per disease from here on.
    weight = weight.transpose()
    # ----------------------------------------------------------------------

    CosineDFV_sim = {}
    for i in range(len(diseases)):
        temp_time1 = time.perf_counter()
        for j in range(i + 1, len(diseases)):
            cosine_value = common.cosinValue(weight[i], weight[j])
            if cosine_value != 0:
                CosineDFV_sim["{}\t{}".format(diseases[i], diseases[j])] = cosine_value

        temp_time2 = time.perf_counter()
        print("{} -> {} costs {}s".format(i, diseases[i], temp_time2 - temp_time1))

    FileUtil.writeDic2File(CosineDFV_sim, output_file)
    end_time = time.perf_counter()

    print("CosineDFV costs {}s".format(end_time - begin_time))
Exemple #14
0
def DisSim_by_XuanSim():
    """Run XuanSim on the DO DAG and evaluate it against the DOID benchmark.

    The comparison is restricted to the diseases that are keys of the SIDD
    disease-gene table.  ``XuanSim.calculateDisSim`` writes to
    ``./Result/XuanSim_sim.txt``, which is then reloaded and passed with the
    benchmark rows to ``benchmark_evaluation.evaluate_by_benchmark``.
    """
    result_path = "./Result/XuanSim_sim.txt"

    dag_rows = FileUtil.readFile2List("./Dataset/DO_DAG.txt", header=True)
    diseases2genes = FileUtil.readFile2DictSet(
        "./Dataset/disease-gene_SIDD.txt")
    wanted = set(diseases2genes.keys())
    XuanSim.calculateDisSim(dag_rows, result_path, selected_diseases=wanted)

    benchmark_rows = FileUtil.readFile2List("./Evaluation/benchmark_DOID.txt")
    predicted_rows = FileUtil.readFile2List(result_path)
    benchmark_evaluation.evaluate_by_benchmark(
        benchmark_rows, predicted_rows, times=10)
Exemple #15
0
def random_walk_multi_layers(multi_layers_net,
                             walk_iters=100,
                             walk_legth=160,
                             save_random_walk=False):
    '''
    Generate random walks over a multi-layer network.

    For every node of the first layer, the layer in which that node has the
    heaviest incident edge is selected and ``walk_iters`` walks of length
    ``walk_legth`` are started from the node in that layer.

    :param multi_layers_net: list, a multi-layer network; every layer is a
        small network and all layers share the same nodes
    :param walk_iters: int, number of walks started from each node
    :param walk_legth: int, number of steps of one walk
    :param save_random_walk: boolean, whether to save the walks to
        ``./random_walk.txt``
    :return: list of walks
    '''
    multi_layers = {}
    for i in range(len(multi_layers_net)):
        G = read_graph(multi_layers_net[i], weighted=True)
        print("{} layer -> {} nodes and {} edges.".format(
            i, len(nx.nodes(G)), len(nx.edges(G))))
        multi_layers[i] = G

    nodes = multi_layers[0].nodes()
    walks = []
    for index, node in enumerate(nodes):
        # time.clock() was removed in Python 3.8; perf_counter() replaces it.
        time1 = time.perf_counter()
        # Heaviest incident edge of ``node`` in each layer.  As before,
        # max() raises ValueError if the node has no neighbour in a layer.
        max_weights = [
            max(float(G[node][nei_node]['weight'])
                for nei_node in G.neighbors(node))
            for G in multi_layers.values()
        ]
        # Ties go to the lowest layer index.
        select_layer = max_weights.index(max(max_weights))
        for walk_iter in range(walk_iters):
            walk = random_walk(multi_layers[select_layer], node, walk_legth)
            walks.append(walk)
        time2 = time.perf_counter()
        print("{} * {} -> {} layer: cost {}s".format(index, node,
                                                     select_layer + 1,
                                                     time2 - time1))

    if save_random_walk:
        FileUtil.write2DemList2File(walks, "./random_walk.txt")

    return walks
Exemple #16
0
def calculateDisSim(walks, output_file, save_node_vectors=False):
    """Embed walk nodes with Word2Vec and write pairwise cosine similarities.

    The trained vectors are saved in word2vec text format, read back, and
    every pair of nodes with a non-zero ``common.cosinValue`` is written to
    ``output_file``.

    :param walks: iterable of walks; every node is converted with ``str``
    :param output_file: path the similarities are written to
    :param save_node_vectors: when true the vectors are stored in
        ``./node_vectors.txt``; otherwise in a temporary file, which is left
        on disk and whose path is printed
    :return: None
    """
    import os  # local import: only needed to close the mkstemp() descriptor

    print("learn representations...")
    walks = [list(map(str, walk)) for walk in walks]
    # NOTE(review): ``size`` and ``iter`` look like the gensim < 4 keyword
    # names -- confirm the installed version before renaming them.
    model = Word2Vec(walks,
                     size=128,
                     window=10,
                     min_count=0,
                     sg=1,
                     workers=8,
                     iter=1)

    if save_node_vectors:
        temp_walk_fname = "./node_vectors.txt"
    else:
        # mkstemp() returns an already-open descriptor.  It was previously
        # discarded and therefore leaked; only the path is needed here.
        temp_fd, temp_walk_fname = tempfile.mkstemp()
        os.close(temp_fd)

    print(temp_walk_fname)
    model.wv.save_word2vec_format(temp_walk_fname)

    node_vectors = defaultdict(list)
    with open(temp_walk_fname, 'r') as f:
        next(f, None)  # the first line is skipped, as before
        for raw_line in f:
            fields = raw_line.strip().split(' ')
            # key: node label, value: its vector
            node_vectors[fields[0]] = [float(value) for value in fields[1:]]

    dis_sim = {}
    disease = list(node_vectors.keys())
    for x in range(len(disease)):
        for y in range(x + 1, len(disease)):
            sim = common.cosinValue(node_vectors[disease[x]],
                                    node_vectors[disease[y]])
            if sim != 0:
                dis_sim["{}\t{}".format(disease[x], disease[y])] = sim

    FileUtil.writeDic2File(dis_sim, output_file)
Exemple #17
0
def random_walk(edges, num_walks=100, walk_length=160):
    """Simulate walks (p=1, q=1) over the graph built from ``edges``.

    The walks are also written to ``./node2vec_walks.txt``; the duration of
    each stage is printed.

    :param edges: edge list passed to ``read_graph`` (weighted, undirected)
    :param num_walks: forwarded to ``Graph.simulate_walks``
    :param walk_length: forwarded to ``Graph.simulate_walks``
    :return: the walks returned by ``Graph.simulate_walks``
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    time1 = time.perf_counter()
    nx_G = read_graph(edges, weighted=True, directed=False)
    print("{} nodes, {} edges.".format(len(nx.nodes(nx_G)),
                                       len(nx.edges(nx_G))))

    time2 = time.perf_counter()
    print("It cost {}s to read edges.".format(time2 - time1))

    G = Graph(nx_G, p=1, q=1)
    print("generate transition matrix......")
    G.preprocess_transition_probs()
    time3 = time.perf_counter()
    print("It cost {}s to generate transition matrix.".format(time3 - time2))

    print("begin to random walk......")
    walks = G.simulate_walks(num_walks=num_walks, walk_length=walk_length)
    time4 = time.perf_counter()
    print("It cost {}s to random walk.".format(time4 - time3))

    FileUtil.write2DemList2File(walks, "./node2vec_walks.txt")

    return walks
Exemple #18
0
def read_price_file():
    """Load ``Yahoo_TSLA.csv`` from the configured price source folder.

    Every entry of ``SystemEnv.g_price_file`` is printed first.  The CSV is
    looked up under its ``'sourcefolder'`` entry.

    :return: the ``pandas`` DataFrame read from the CSV, or ``None`` (after
        printing a message) when the file does not exist
    """
    for option, setting in SystemEnv.g_price_file.items():
        print('{}={}'.format(option, setting))

    source_folder = SystemEnv.g_price_file['sourcefolder']
    csv_path = os.path.join(source_folder, "Yahoo_TSLA.csv")

    if FileUtil.file_exist(csv_path):
        return pd.read_csv(csv_path)

    print("Price File {} does not exist.".format(csv_path))
    return
Exemple #19
0
def read_config(filename='config.ini'):
    """ Read the configuration file and publish its sections as module globals

    Sections are matched against the ``ConfigSection`` values:
    ``E_MYSQL`` fills ``g_mysql_connection``, ``E_PRICE_FILE`` fills
    ``g_price_file`` and ``E_TICKER`` fills ``g_tick_list`` (after passing
    through ``_listfstr``).  Other sections are ignored.  When ``filename``
    does not exist a message is printed and nothing is changed.

    :param filename: name of the configuration file
    :return: None
    """
    global g_mysql_connection
    global g_price_file
    global g_tick_list

    if not FileUtil.file_exist(filename):
        print("configuration file {} does not exist.".format(filename))
        return

    # create parser and read ini configuration file
    parser = ConfigParser()
    parser.read(filename)

    def configSectionMap(p_section):
        """Return the options of ``p_section`` as a dict.

        An option whose value cannot be read is reported and stored as None.
        """
        dict1 = {}
        for option in parser.options(p_section):
            try:
                dict1[option] = parser.get(p_section, option)
            except Exception:
                # Best effort: one unreadable option must not abort the load.
                print("exception on %s!" % option)
                dict1[option] = None
        return dict1

    for section in parser.sections():
        if section == ConfigSection.E_MYSQL.value:
            g_mysql_connection = configSectionMap(section)
        elif section == ConfigSection.E_PRICE_FILE.value:
            g_price_file = configSectionMap(section)
        elif section == ConfigSection.E_TICKER.value:
            g_tick_list = _listfstr(configSectionMap(section))
Exemple #20
0
def line2upper(input_file, output_file):
    """Write an upper-cased copy of the entries read from ``input_file``.

    :param input_file: path read with ``FileUtil.readFile2List``
    :param output_file: path written with ``FileUtil.writeList2File``
    """
    upper_cased = []
    for entry in FileUtil.readFile2List(input_file):
        # NOTE(review): assumes each entry has ``.upper()`` -- confirm what
        # readFile2List returns.
        upper_cased.append(entry.upper())
    FileUtil.writeList2File(upper_cased, output_file)
Exemple #21
0
def calculateDisSim(DAG, output_file, disease_genes=None):
    '''
    Write normalised, information-content based similarities for the disease
    pairs of a DAG.

    The score of a pair is the largest information content (IC) among the
    nodes returned by ``getCommonAncesters`` for that pair.  Pairs without a
    positive score are left out.  The scores go through
    ``common.normalizeDict`` before being written.

    :param DAG: 2-D list representing a directed acyclic graph
    :param output_file: str, path the result is stored at
    :param disease_genes: 2-D list of disease-gene associations.  When given,
        IC comes from each disease's share of all associated genes and only
        diseases present in both inputs are compared.  Otherwise IC comes
        from the size of ``nx.ancestors`` of each node.
    :return: None
    '''
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    begin_time = time.perf_counter()
    diseases = NetUtil.getNodes2HomoNet(DAG)
    disease_DAG = nx.DiGraph()
    disease_DAG.add_edges_from(DAG)

    # ------------------------------------------------------------------------
    IC = defaultdict()
    if disease_genes:

        diseases_asso, genes = NetUtil.getNodes2HeterNet(disease_genes)
        disease2genes = common.list2DictSet(disease_genes, key=1, value=2)

        for di in diseases:
            if di in diseases_asso:
                IC[di] = - math.log2(float(len(disease2genes[di])) / len(genes))
            else:
                IC[di] = 0

        diseases = diseases & diseases_asso
        print("there are {} diseases for similarity based on DAG and associations.".format(len(diseases)))
    else:

        for di in diseases:
            # NOTE(review): the set comes from nx.ancestors although the
            # original variable was called "descendants"; which one it is
            # depends on the edge direction of the DAG file -- confirm.
            descendants = nx.ancestors(disease_DAG, di)
            if descendants:
                IC[di] = - math.log2(float(len(descendants)) / len(diseases))

        print("there are {} diseases for similarity based on DAG.".format(len(diseases)))
    # ------------------------------------------------------------------------

    print("begin to calculate disease similarity......")

    diseases = list(diseases)
    simi_matrix = np.zeros((len(diseases), len(diseases)))

    for i in range(len(diseases)):

        sys.stdout.flush()
        temp_time1 = time.perf_counter()
        di_A = diseases[i]
        for j in range(i + 1, len(diseases)):
            di_B = diseases[j]
            commonAncestors = getCommonAncesters(disease_DAG, di_A, di_B)
            sectionOfDoId2Gene = getSectionoFromDic(commonAncestors, IC)
            # Only the largest IC is needed, so max() replaces the full sort.
            if sectionOfDoId2Gene:
                simi_matrix[i][j] = max(sectionOfDoId2Gene.values())

        temp_time2 = time.perf_counter()
        sys.stdout.write('\r{} -> {}, {}s'.format(i, diseases[i], (temp_time2 - temp_time1)))

    print()
    # ------------------------------------------------------------------------
    Resnik_simi = {}
    for i in range(len(diseases)):
        di_A = diseases[i]
        for j in range(i + 1, len(diseases)):
            di_B = diseases[j]
            if simi_matrix[i][j] > 0:
                Resnik_simi["{}\t{}".format(di_A, di_B)] = simi_matrix[i][j]

    Resnik_simi = common.normalizeDict(Resnik_simi)
    FileUtil.writeDic2File(Resnik_simi, output_file)

    end_time = time.perf_counter()
    print("ResnikSim costs {}s.".format(end_time - begin_time))
Exemple #22
0
def calculateDisSim(DAG, output_file, selected_diseases=None):
    '''
    Write a DAG-based similarity for disease pairs.

    Every disease gets a dict of per-node values ("DV") over itself plus the
    nodes returned by ``nx.descendants``, each value coming from
    ``getNodeDVByIT``.  A disease with no such nodes gets ``{disease: 1}``.
    The score of a pair is the sum of both diseases' values over the nodes
    they share, divided by the sum of the two diseases' own values.  Pairs
    sharing no node are left out.

    :param DAG: 2-D list representing a directed acyclic graph
    :param output_file: str, path the result is stored at
    :param selected_diseases: set of diseases to compare; the default None
        compares every disease of the DAG
    :return: None
    '''
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    begin_time = time.perf_counter()
    diseases = NetUtil.getNodes2HomoNet(DAG)
    print("there are {} diseases in DAG.".format(len(diseases)))

    disease_DAG = nx.DiGraph()
    disease_DAG.add_edges_from(DAG)
    # ----------------------------------------------------------

    DV_diseases_dict = {}
    for di in diseases:
        # NOTE(review): the original called this set "ancestors" although it
        # comes from nx.descendants; which one it is depends on the edge
        # direction of the DAG file -- confirm.
        reachable = nx.descendants(disease_DAG, di)
        if reachable:
            reachable.add(di)
            sub_graph = disease_DAG.subgraph(list(reachable))
            DV_diseases_dict[di] = {
                node: getNodeDVByIT(sub_graph, node)
                for node in sub_graph.nodes
            }
        else:
            DV_diseases_dict[di] = {di: 1}

    # ------------------------------------------------------------------------
    if selected_diseases:
        diseases = list(selected_diseases & diseases)
    else:
        diseases = list(diseases)

    print("{} diseases are used to calculate similarity.".format(
        len(diseases)))
    XuanSim_sim = {}
    for i in range(len(diseases)):
        DV_disease_A = DV_diseases_dict[diseases[i]]
        for j in range(i + 1, len(diseases)):
            DV_disease_B = DV_diseases_dict[diseases[j]]
            common_diseases = set(DV_disease_A.keys()) & set(
                DV_disease_B.keys())
            if common_diseases:
                common_DV = 0
                for di in common_diseases:
                    common_DV += DV_disease_A[di] + DV_disease_B[di]
                XuanSim_sim["{}\t{}".format(
                    diseases[i],
                    diseases[j])] = common_DV / (DV_disease_A[diseases[i]] +
                                                 DV_disease_B[diseases[j]])
        if i % 100 == 0:
            print("{}->{}".format(i, len(diseases)))
    FileUtil.writeDic2File(XuanSim_sim, output_file)
    end_time = time.perf_counter()

    print("XuanSim costs {}s.".format(end_time - begin_time))
Exemple #23
0
def calculateDisSim(dm_edge,
                    mg_edge,
                    gg_edge,
                    output_file,
                    walk_length=1000,
                    path_type=1,
                    save_vectors=False):
    '''
    Derive disease similarities from embeddings of walks over a
    disease-miRNA-gene network.

    Nodes are relabelled (diseases and miRNAs through ``NetUtil.labelNode``
    with the prefixes "i" and "f", genes by prepending "a").  Walks produced
    by ``random_walk_disease`` are written to a temporary file and embedded
    by the external ``Common/metapath2vec++`` program.  The cosine of every
    pair of disease vectors is then written to ``output_file``.

    :param dm_edge: 2-D list representing the disease-miRNA network
    :param mg_edge: 2-D list representing the miRNA-gene network
    :param gg_edge: 2-D list representing the gene-gene network
    :param output_file: str, path the result is saved to
    :param walk_length: int, number of walks generated per disease; also the
        number of failed steps after which a disease is given up.
        Default 1000
    :param path_type: int, forwarded to ``random_walk_disease`` to select the
        walk pattern.  Default 1
    :param save_vectors: boolean, whether to save the disease vectors to
        ``./Result/mpDisNet_node_vectors.txt``.  Default False
    :return: None
    '''

    dm_d, dm_m = NetUtil.getNodes2HeterNet(dm_edge)
    d_name_id = NetUtil.labelNode(dm_d, "i")
    m_name_id = NetUtil.labelNode(dm_m, "f")

    print("有{}个disease加标签".format(len(d_name_id.keys())))
    print("有{}个miRNA加标签".format(len(m_name_id.keys())))
    # ------------------------------------------------------------

    gg_g = NetUtil.getNodes2HomoNet(gg_edge)
    mg_m, mg_g = NetUtil.getNodes2HeterNet(mg_edge)
    # Genes present in both the miRNA-gene and the gene-gene network.
    g_name = mg_g & gg_g
    g_name = ['a' + str(i) for i in g_name]
    # Set copy for the membership tests below (the list made them linear).
    g_name_set = set(g_name)

    print("有{}个gene加标签".format(len(g_name)))

    # ------------------------------------------------------------------------
    # disease -> miRNAs and miRNA -> diseases, in label space
    dm = defaultdict(list)
    md = defaultdict(list)

    for line in dm_edge:
        if line[1].upper() in dm_m:
            dm[d_name_id[line[0].upper().strip()]].append(
                m_name_id[line[1].upper()])
            md[m_name_id[line[1].upper()]].append(
                d_name_id[line[0].upper().strip()])

    # ------------------------------------------------------------------------
    # gene -> neighbouring genes, restricted to the shared genes
    gg = defaultdict(list)
    for line in gg_edge:
        g1 = 'a' + str(line[0])
        g2 = 'a' + str(line[1])
        if g1 in g_name_set and g2 in g_name_set:
            gg[g1].append(g2)
            gg[g2].append(g1)

    # Genes whose neighbour list has exactly two entries are dropped.
    gene_del = [gene for gene in gg.keys() if len(gg[gene]) == 2]
    for gene in gene_del:
        del gg[gene]

    g_name = list(gg.keys())
    g_name_set = set(g_name)
    # ---------------------------------------------------------------
    # Never filled, so the first count printed below is always 0.
    test_mg_m = set()
    mg = defaultdict(list)
    gm = defaultdict(list)

    for line in mg_edge:
        g = 'a' + str(line[1])
        if line[0].upper() in dm_m and g in g_name_set:
            mg[m_name_id[line[0].upper().strip()]].append(g)
            gm[g].append(m_name_id[line[0].upper().strip()])

    print("mg中有{}个miRNA, 标签后{}个".format(len(test_mg_m), len(mg.keys())))

    # ------------------------------------------------------------------------

    print("mg中miRNA标签后{}个".format(len(mg.keys())))
    # Diseases with at least one miRNA that reaches a retained gene.
    disease_list = []
    for d in dm.keys():
        ms = dm[d]
        for m in ms:
            # ``mg[m]`` is a defaultdict lookup: a miRNA without genes gets
            # an empty entry here, which changes the counts printed below.
            if g_name_set.intersection(mg[m]):
                disease_list.append(d)
    disease_list = list(set(disease_list))

    print("mg中miRNA标签后{}个".format(len(mg.keys())))
    print("there {} miRNA in mg, {} miRNA in dm".format(
        len(mg.keys()), len(md.keys())))

    total_walk = []
    for disease in disease_list:
        jj = 0  # failed steps so far for this start disease
        for i in range(walk_length):
            # NOTE: ``disease`` is rebound to the end of every segment below,
            # so each walk after the first starts where the previous ended.
            temp = [disease]
            for k in range(50):
                j = 1
                # Retry until a segment succeeds or the failure budget for
                # this start disease is used up.
                while ((j == 1) & (jj != walk_length)):
                    try:
                        temp2 = random_walk_disease(disease, dm, mg, gg, gm,
                                                    md, path_type)
                        temp.extend(temp2[1:])
                        disease = temp2[-1]
                    except Exception:
                        # Was a bare ``except:``, which also swallowed
                        # KeyboardInterrupt and SystemExit.
                        j = 1
                        jj += 1
                    else:
                        j = 0
                if jj == walk_length:
                    break
            total_walk.append(temp)

    print("learn representations...")

    # mkstemp() returns an already-open descriptor.  It was previously
    # discarded and therefore leaked; only the path is needed here.
    walk_fd, temp_walk_fname = tempfile.mkstemp()
    os.close(walk_fd)

    print(temp_walk_fname)
    with open(temp_walk_fname, 'w') as f:
        for walk in total_walk:
            for line in walk:
                f.write(line + ' ')
            f.write('\n')

    vec_fd, temp_node_vec_fname = tempfile.mkstemp()
    os.close(vec_fd)

    statement = "Common/metapath2vec++ -train {} -output {} -pp 1 -size 128 -window 7 -negative 5 -threads 32".format(
        temp_walk_fname, temp_node_vec_fname)

    print(statement)
    os.system(statement)

    print("\ncalculate disease similarity...")
    # The vectors are read from the output path with ".txt" appended.
    node_vectors_path = temp_node_vec_fname + ".txt"
    node_vectors = defaultdict(list)
    with open(node_vectors_path, 'r') as f:
        for line in f:
            line = line.strip().split(' ')
            if line[0].startswith('i'):
                # key: disease label, value: its vector
                node_vectors[line[0]] = [float(i) for i in line[1:]]

    if save_vectors:
        FileUtil.writeDicSet2File(node_vectors,
                                  "./Result/mpDisNet_node_vectors.txt")
        print()

    # label -> original disease name
    new_label_disease = {value: key for key, value in d_name_id.items()}
    dis_sim = {}
    for x in range(len(disease_list)):
        for y in range(x + 1, len(disease_list)):
            sim = common.cosinValue(node_vectors[disease_list[x]],
                                    node_vectors[disease_list[y]])
            if sim != 0:
                dis_sim["{}\t{}".format(
                    new_label_disease[disease_list[x]],
                    new_label_disease[disease_list[y]])] = sim

    FileUtil.writeDic2File(dis_sim, output_file)
Exemple #24
0
def calculateDisSim(phenotypes_info, tree_id2synonyms, output_file):
    '''
    Compute phenotype similarity from MeSH and OMIM data.

    Stages: count synonym occurrences per (OMIM record, tree id), adjust the
    counts through ``calculate_counter``, weight them globally and locally,
    and write the cosine similarity of every record pair to ``output_file``.
    Intermediate matrices are saved under ``./Result/``.

    :param phenotypes_info: dict of phenotype information from the OMIM
        database; key: OMIM id, value: description of that phenotype
    :param tree_id2synonyms: dict of MeSH tree nodes and their phenotype
        synonyms; key: tree id, value: phenotype synonyms
    :param output_file: path the phenotype similarities are stored at
    :return:
    '''

    # ------------------------------- compute the actual count ----------------------------------
    print("-----actual acount-------")
    omim_ids = list(phenotypes_info.keys())
    tree_ids = list(tree_id2synonyms.keys())

    # actual_count[record][tree id] = total occurrences of the tree id's
    # synonyms in the record's description
    actual_count = np.zeros((len(omim_ids), len(tree_ids)))
    for index_omim, omim_id in enumerate(omim_ids):
        description = phenotypes_info[omim_id]
        for index_tree, tree_id in enumerate(tree_ids):
            synonyms = tree_id2synonyms[tree_id]
            for synonym in synonyms:
                actual_count[index_omim][index_tree] += description.count(synonym)

        sys.stdout.write("\r{}->{}".format(index_omim, omim_id))

    np.savetxt("./Result/actual_count.txt", actual_count, delimiter="\t", fmt="%d")

    # --------------------- compute the hierarchy count -----------
    print("\n-----hirarchy_count-------")
    # NOTE(review): this binds a second name to the same array, it does not
    # copy it.  Every write to hiera_count below also changes actual_count,
    # so the global-weight stage reads the adjusted counts -- confirm that
    # this is intended before replacing it with a copy.
    hiera_count = actual_count
    for index_omim, omim_id in enumerate(omim_ids):
        is_calculate = OrderedDict()
        tree_id_count = OrderedDict()
        for index_tree, tree_id in enumerate(tree_ids):
            tree_id_count[tree_id] = hiera_count[index_omim][index_tree]
            is_calculate[tree_id] = False

        # calculate_counter is expected to update tree_id_count and
        # is_calculate in place (its result is not used here).
        for tree_id in tree_ids:
            if is_calculate[tree_id] == False:
                calculate_counter(tree_id, tree_ids, tree_id_count, is_calculate)

        for index_tree, tree_id in enumerate(tree_id_count.keys()):
            if is_calculate[tree_id] == True:
                hiera_count[index_omim][index_tree] = tree_id_count[tree_id]

        sys.stdout.write("\r{}->{}".format(index_omim, omim_id))

    np.savetxt("./Result/hierachy_count.txt", hiera_count, delimiter="\t", fmt="%f")

    # ---------------------------------- compute the global weight ------------------------------------
    print("\n-----global weight-------")
    # gwc_global[tree id] first counts the records with a positive count for
    # that tree id, then is replaced by log2(#records / that count).
    gwc_global = OrderedDict()
    for tree_id in tree_ids:
        gwc_global[tree_id] = 0

    # mostCount[record] = largest count found in that record's row
    mostCount = []
    for index_omim, omim_id in enumerate(omim_ids):
        most_occur = 0
        for index_tree, tree_id in enumerate(tree_ids):
            meshNum = float(actual_count[index_omim][index_tree])
            if meshNum > most_occur:
                most_occur = meshNum
            if meshNum > 0:
                gwc_global[tree_id] += 1

        mostCount.append(most_occur)

    for key in gwc_global.keys():
        recordNum = gwc_global[key]
        if recordNum > 0:
            gwc_global[key] = math.log2(len(omim_ids) / recordNum)
        else:
            gwc_global[key] = 0.0

    # ------------------------------ compute the local weight ------------------------------
    print("-----local weight-------")
    gwc = list(gwc_global.values())
    weight_count = np.zeros((len(omim_ids), len(tree_ids)))
    for index_omim, omim_id in enumerate(omim_ids):
        mf = mostCount[index_omim]
        cal_list = []
        for index_tree, tree_id in enumerate(tree_ids):
            cal_list.append(float(hiera_count[index_omim][index_tree]))
        # global weight times adjusted count, per tree id
        gwc_cal = np.array(gwc) * np.array(cal_list)
        gwc_list = gwc_cal.tolist()
        cal_result = []
        for score in gwc_list:
            # positive scores are rescaled to 0.5 + 0.5 * score / mf;
            # everything else is kept unchanged
            if score > 0:
                cal_result.append(0.5 + 0.5 * (score / mf))
            else:
                cal_result.append(score)
        for index_tree, tree_id in enumerate(tree_ids):
            weight_count[index_omim][index_tree] = cal_result[index_tree]

    np.savetxt("./Result/weight_count.txt", weight_count, delimiter="\t", fmt="%f")

    # --------------------------------- compute the phenotype similarity ---------------------------------
    print("-----phetypes similarity-------")
    # compute the disease similarity with the cosine measure
    similarity_socre = cosine_similarity(weight_count)
    similarity_result = {}
    for i in range(len(omim_ids)):
        for j in range(i+1, len(omim_ids)):
            if similarity_socre[i][j] != 0:
                similarity_result["{}\t{}".format(omim_ids[i], omim_ids[j])] = similarity_socre[i][j]

    # sort the disease similarities in descending order
    similarity_result = dict(sorted(similarity_result.items(),key = lambda x:x[1], reverse=True))
    FileUtil.writeDic2File(similarity_result, output_file)
    else:
        for i in range(0, top_number):
            simi_line = disease_pairs2[i]
            # print(simi_line)
            for j in range(0, len(disease_pairs1)):
                if simi_line[0] in disease_pairs1[j] and simi_line[
                        1] in disease_pairs1[j]:
                    top_match_number += 1
                    break
            if i % 10000 == 0:
                print("top {} match {}.".format(i, top_match_number))


# Script entry point: evaluate one stored similarity result against a benchmark.
if __name__ == "__main__":
    # Benchmark files
    file_path1 = "./benchmark_MeSH_RADAR.txt"
    file_path3 = "./benchmark_DOID.txt"

    # Stored similarity results (only file_path4 is used below)
    file_path2 = "../Result/ModuleSim_sim.txt"
    file_path4 = "../Result/standard_Resnik_result.txt"
    file_path5 = "../Result/mpDisNet_sim.txt"
    file_path6 = "../Result/FunSim_sim.txt"

    # # get_basic_info( file_path1,  file_path3)
    # get_top_number_match(file_path1,  file_path2, top_number= 110000)

    # ------------------ read the benchmark set labelled with MeSH ids ------------------
    # NOTE(review): despite the heading above and the variable name, the file
    # read here is file_path3, i.e. benchmark_DOID.txt.
    BenChmark_MeSH1 = FileUtil.readFile2List(file_path3)

    simi_Result = FileUtil.readFile2List(file_path4)
    evaluate_by_benchmark(BenChmark_MeSH1, simi_Result, times=10)
Exemple #26
0
def calculateDisSim(seed_list, net, output_path):
    """Compute pairwise NetSim disease similarities and write them to a file.

    For every disease a random walk with restart (RWR) is run from the
    disease genes that are present in the network. The resulting per-node
    scores are summed over the gene set of every disease (the "FR" value),
    and two diseases are scored as
    ``(FR[a][b] + FR[b][a]) / (len(genes_a) + len(genes_b))``.
    Pairs with a score greater than 0 are written, sorted in descending
    order, through ``FileUtil.writeSortedDic2File``.

    :param seed_list: dict mapping a disease (str) to its genes (set)
    :param net: two-dimensional list describing a PPI network
    :param output_path: str, path of the file the result is saved to
    :return: None
    """
    nodes = NetUtil.getNodes2HomoNet(net)

    print("there are {} diseases.".format(len(seed_list)))

    diseaseCount = len(seed_list)
    FRValueMatrix = np.zeros((diseaseCount, diseaseCount))
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    time1 = time.perf_counter()
    for rowOfFR, (disease, genesOfDisease) in enumerate(seed_list.items()):

        leavaList = getCommonNodes(nodes, genesOfDisease)
        wk = walker.Walker(net)

        if len(leavaList) == 0:
            # No seed gene of this disease is in the network: the row of
            # FRValueMatrix stays all zero.
            continue

        # run RWR (random walk with restart), then get the proportion of all nodes
        temp_time1 = time.perf_counter()
        nodesPercent = wk.run_exp(leavaList, 0.7, 1)
        temp_time = time.perf_counter()
        print("{} - {} -> genes = {}, it cost {}s".format(rowOfFR, disease, len(leavaList), temp_time - temp_time1))

        # calculate the FR_GeneSet value of this disease against every disease
        for colOfFR, genesOfDisease2 in enumerate(seed_list.values()):
            FR = 0
            for gene in genesOfDisease2:
                if gene in nodes:
                    FR += float(nodesPercent[gene])
                elif gene in genesOfDisease:
                    # Gene is outside the network but shared with the
                    # source disease: count it as a full match.
                    FR += 1
            FRValueMatrix[rowOfFR][colOfFR] = FR

    print("begin to calculate NetSim value")
    # calculate the NetSim value of a pair of diseases
    NetSimMatrix = np.zeros((diseaseCount, diseaseCount))
    NetSimList = list(seed_list.keys())
    geneCounts = [len(genes) for genes in seed_list.values()]
    for rowOfFP in range(diseaseCount):
        for colOfFP in range(diseaseCount):
            # Skip the diagonal by position rather than by string identity.
            if rowOfFP != colOfFP:
                NetSimMatrix[rowOfFP][colOfFP] = (FRValueMatrix[rowOfFP][colOfFP] +
                            FRValueMatrix[colOfFP][rowOfFP]) / (geneCounts[rowOfFP] + geneCounts[colOfFP])

    print("write the 'disease-diesae-value' to a file")
    simiResult = {}
    row, col = np.shape(NetSimMatrix)
    for i in range(0, row):
        # Only the upper triangle is exported, so each pair appears once.
        for j in range(i + 1, col):
            if NetSimMatrix[i][j] > 0:
                simiResult['{}\t{}'.format(NetSimList[i], NetSimList[j])] = NetSimMatrix[i][j]

    sortedSimiResult = sorted(simiResult.items(), key=lambda x: x[1], reverse=True)
    FileUtil.writeSortedDic2File(sortedSimiResult,  output_path)
    print("end")
    time2 = time.perf_counter()
    print("NetSim total cost {}s.".format(time2-time1))
Exemple #27
0
def _pairwise_dtw_distances(pool, sequences):
    """Return an upper-triangular matrix of DTW distances between sequences.

    Every index pair (i, j) with i < j is submitted to ``pool`` as a call to
    ``dtw_distance_fast``; entry [i][j] holds that result converted to float
    and every other entry stays 0.
    """
    count = len(sequences)
    pairs = [(i, j) for i in range(count) for j in range(i + 1, count)]
    pending = [
        pool.apply_async(dtw_distance_fast, (sequences[i], sequences[j]))
        for i, j in pairs
    ]

    distances = np.zeros((count, count))
    # Each async result is matched with the explicit index pair it was
    # submitted for, so no manual row/column bookkeeping is needed.
    for (i, j), task in zip(pairs, pending):
        distances[i][j] = float(task.get())
    return distances


def cal_nei_sim(disease, edges, save_nei_sim=False):
    """Compute disease similarity from the degrees of 1- and 2-hop neighbours.

    An undirected graph is built from ``edges``. For every disease the sorted
    degree sequence of its first-level neighbours and of its second-level
    neighbours (neighbours of neighbours, excluding the disease itself) is
    collected. Each pair of diseases is compared with ``dtw_distance_fast``
    on both levels, the two distances are combined as
    ``alpha * d1 + alpha**2 * d2`` with ``alpha = 0.5``, and the similarity
    is ``exp(-distance)``.

    :param disease: indexable sequence of disease nodes; each is looked up
        in the graph built from ``edges``
    :param edges: edge list passed to ``networkx.Graph.add_edges_from``
    :param save_nei_sim: when True, the pair similarities are sorted in
        descending order and written to "./nei_Sim.txt"
    :return: symmetric numpy matrix of similarities, indexed like
        ``disease``; the diagonal is left at 0
    """
    print("begin to calculate similarity based on neighbours...")

    G = nx.Graph()
    G.add_edges_from(edges)  # build one undirected graph from all edges

    print(
        "step 1: epsilon -> 2, calculate first degree sequence and second degree sequence..."
    )
    DegreeSequence1 = []
    DegreeSequence2 = []

    for di in disease:
        # First-level neighbours and their sorted degrees.
        firstNeighbours = list(G.neighbors(di))
        DegreeSequence1.append(
            sorted(nx.degree(G, node) for node in firstNeighbours))

        # Second-level neighbours: neighbours of the first level, deduplicated.
        secondNeighbours = set()
        for node in firstNeighbours:
            secondNeighbours.update(G.neighbors(node))
        # The disease is reached back through its own neighbours; drop it.
        secondNeighbours.discard(di)
        DegreeSequence2.append(
            sorted(nx.degree(G, node) for node in secondNeighbours))

    cores = multiprocessing.cpu_count()  # number of CPUs on this machine
    # The pool is a context manager so its worker processes are always
    # released, also when a task raises; all results are fetched inside it.
    with multiprocessing.Pool(cores) as pool:
        print("step 2: compute neighbour_sim in parallel with {} cpus...".format(
            cores))
        arrOne = _pairwise_dtw_distances(pool, DegreeSequence1)
        arrTwo = _pairwise_dtw_distances(pool, DegreeSequence2)

    # ----------------------------------------------------------------------------
    print("step 3: construct similarity matrix...")
    alpha = 0.5  # a decaying weight factor α in the range between 0 and 1
    NeiSim = {}
    sim_matrix = np.zeros((len(disease), len(disease)))
    for i in range(0, len(disease)):
        for j in range(i + 1, len(disease)):
            distance = math.pow(alpha, 1) * arrOne[i][j] + math.pow(
                alpha, 2) * arrTwo[i][j]
            similarity = math.exp(-distance)
            NeiSim["{}\t{}".format(disease[i], disease[j])] = similarity
            sim_matrix[i][j] = similarity
            sim_matrix[j][i] = similarity

    if save_nei_sim:
        print("sort the path similarity and save...")
        res = sorted(NeiSim.items(), key=lambda x: x[1], reverse=True)
        FileUtil.writeSortedDic2File(res, "./nei_Sim.txt")

    return sim_matrix