Beispiel #1
0
    def changeIndex():
        """Replace the row index of every training DataFrame with the
        labels read from ``args.index_path`` (one label per line)."""
        if not os.path.exists(args.index_path):
            printT("don't find index file, fail to change index.")
            return
        # Load replacement labels, stripping surrounding whitespace.
        with open(args.index_path, 'r') as fh:
            labels = [row.strip() for row in fh]

        # Apply the same label list to each data type's frame.
        for type_i in range(data.type_num):
            data.trainDatas_df[type_i].index = labels
Beispiel #2
0
def del_non_value():
    """Drop labels that are entirely NaN in every edge matrix, then write
    the pruned matrices back to disk as ``<i>_cor_new.csv``."""
    common_nan = None
    for type_i in range(data.type_num):
        frame = data.edges_df_all[type_i]
        # NOTE(review): boolean-indexing a frame with a per-column
        # all-NaN mask; assumes a square matrix with index == columns
        # (as for a correlation matrix) — confirm upstream shape.
        all_nan = frame[frame.isnull().all()].index
        common_nan = all_nan if common_nan is None else common_nan & all_nan

    for type_i in range(data.type_num):
        pruned = data.edges_df_all[type_i].drop(index=common_nan,
                                                columns=common_nan)
        data.edges_df_all[type_i] = pruned
        out_file = args.result_dir + str(type_i) + '_cor_new.csv'
        printT("  output file", out_file)
        pruned.to_csv(out_file)
    printT(" remain shape", data.edges_df_all[0].shape)
Beispiel #3
0
def cutOffByMin(minNum):
    """Keep only the features whose mean value exceeds ``minNum`` in
    every data type; all frames are cut down to the shared feature set."""
    keep = None
    for type_i in range(data.type_num):
        row_means = data.trainDatas_df[type_i].mean(1)  # per-feature mean
        passing = row_means[row_means > minNum].index
        # Intersect across data types so every frame keeps the same rows.
        keep = passing if keep is None else keep & passing

    for type_i in range(data.type_num):
        data.trainDatas_df[type_i] = data.trainDatas_df[type_i].loc[keep]
    printT('  remain features count:', data.trainDatas_df[0].shape[0])
Beispiel #4
0
def run(cor_cut, limit_up, limit_down, model_4trend, path_model_cor, name_list,
        model_edges_count):
    """Enumerate all 4-step trend codes, classify them as rising/falling,
    then fan ``each_data`` out over ``name_list`` on a 5-worker pool."""
    dir_int_str = {}
    trends_pos_int_list = []
    trends_neg_int_list = []
    trends81_int_list = []

    # Each of the four steps takes a direction in {-1, 0, +1}; encode the
    # 81 combinations as a signed 4-digit integer (base-10 place values).
    for a in range(3):
        for b in range(3):
            for c in range(3):
                for d in range(3):
                    a_, b_, c_, d_ = a - 1, b - 1, c - 1, d - 1
                    code = a_ * 1000 + b_ * 100 + c_ * 10 + d_
                    trends81_int_list.append(code)
                    dir_int_str[code] = "%2d%2d%2d%2d" % (a_, b_, c_, d_)

                    total = a_ + b_ + c_ + d_
                    # "Positive" codes rise first and mostly rise overall;
                    # "negative" codes are the mirror image.
                    if (total > 2) & (a_ == 1):
                        trends_pos_int_list.append(code)
                    elif (total < -2) & (a_ == -1):
                        trends_neg_int_list.append(code)
    printT(dir_int_str)
    printT(trends81_int_list)

    pool = multiprocessing.Pool(processes=5)
    pending = [
        pool.apply_async(each_data,
                         (name, cor_cut, limit_up, limit_down, model_4trend,
                          path_model_cor, dir_int_str, trends_pos_int_list,
                          trends_neg_int_list, model_edges_count))
        for name in name_list
    ]
    pool.close()
    pool.join()
Beispiel #5
0
def draw_boxplot(
    name,
    f_list,
    data_path_model,
    data_path_p_model,
    out_path_pic_model,
):
    """Box-plot the distribution of significant |correlations| for each of
    the 5 stages, side by side for the two entries of ``f_list``."""
    data_f = []
    for f in f_list:
        stage_values = []
        for stage in range(5):
            printT("read data ", data_path_model % (name, name, stage, f))
            with open(data_path_model % (name, name, stage, f), 'rb') as fr:
                cor_df = pickle.load(fr)[0]
            with open(data_path_p_model % (name, name, stage, f), 'rb') as fr:
                p_df = pickle.load(fr)[0]

            # Zero out non-significant entries, flip negatives positive,
            # then keep only the non-zero values for the box plot.
            cor_df[p_df > 0.01] = 0
            cor_df[cor_df < 0] = -cor_df
            flat = cor_df.values
            stage_values.append(flat.ravel()[np.flatnonzero(flat)])
        data_f.append(stage_values)

    plt.figure(figsize=(20, 10))

    p_list = ["Normal", "S1", "S2", "S3", "S4"]
    y = np.linspace(0, 1, 6)
    printT("draw ", name)

    # One subplot per f_list entry, same stage labels and y ticks.
    for plot_i in (0, 1):
        plt.subplot(1, 2, plot_i + 1)
        plt.boxplot(data_f[plot_i], labels=p_list)
        plt.yticks(y)
        plt.title(f_list[plot_i])

    plt.suptitle(name)

    printT("save ", name)
    plt.savefig(out_path_pic_model % name)
    # NOTE(review): `f` here is the final value left over from the loop
    # above (f_list[-1]) — kept to match the original log output.
    printT("finish ", name, f)
Beispiel #6
0
def cal_out_subgraph(connect_graph, data_name, index, trend_data_pre,
                     subgraphs_model, trend_pre, subgraph_nodelist_model_new,
                     subgraphs_max_edges_model,
                     subgraphs_max_model_edges_v_new):
    """Extract connected components from a reachability matrix and write
    the component lists plus the largest component's node/edge files.

    Parameters:
        connect_graph:  square matrix whose entry >= 1 marks two nodes in
                        the same component (assumed transitively closed,
                        so each row lists a node's whole component).
        data_name:      dataset name used in logs and output paths.
        index:          node labels for rows/columns of ``connect_graph``.
        trend_data_pre: edge table with 'Node_A' / 'Node_B' columns.
        remaining args: printf-style output path templates.
    """
    connect_graph_pd = pd.DataFrame(connect_graph, index=index, columns=index)

    remainIndex = connect_graph_pd.index

    # Greedy component collection: each not-yet-seen node contributes the
    # set of nodes reachable from it (its column's >=1 entries).
    subgraphs = []
    added_index = []
    for pre in remainIndex:
        if pre in added_index:
            continue
        pre_index = connect_graph_pd.loc[connect_graph_pd[pre] >= 1].index
        subgraphs.append(pre_index.tolist())
        added_index.extend(pre_index)

    printT(data_name, trend_pre, "subgraph count:", len(subgraphs))

    # Largest component first (project-defined comparator `cmp`).
    subgraphs.sort(key=functools.cmp_to_key(cmp))

    # BUGFIX: use a context manager so the handle is always closed --
    # previously the file was opened without a matching close().
    with open(subgraphs_model % (trend_pre, data_name), 'w') as output:
        for row in subgraphs:
            row_str = str(row).replace("[", "").replace("]", "")
            output.write(row_str + '\n')

    # Node list of the largest component.
    submax_pd = pd.DataFrame(subgraphs[0])
    submax_pd.to_csv(subgraph_nodelist_model_new % (data_name, trend_pre),
                     header=None,
                     index=False)

    # Edges with both endpoints inside the largest component.
    subnet = trend_data_pre[trend_data_pre['Node_A'].isin(subgraphs[0])
                            & trend_data_pre['Node_B'].isin(subgraphs[0])]
    printT(data_name, trend_pre, "subnet nodes count", len(subgraphs[0]))
    printT(data_name, trend_pre, "subnet edges count", subnet.shape[0])
    subnet[['Node_A', 'Node_B'
            ]].to_csv(subgraphs_max_edges_model % (trend_pre, data_name),
                      index=False,
                      sep="\t")

    subnet.to_csv(subgraphs_max_model_edges_v_new % (trend_pre, data_name),
                  index=False)
Beispiel #7
0
def draw_heatmap(
    name,
    f_list,
    data_path_model,
    out_path_pic_model,
):
    """Draw one clustered heatmap per entry of ``f_list`` for dataset
    ``name``, saving each to ``out_path_pic_model % (name, f)``.

    Rows are sorted by columns "4" then "0" before plotting; row
    clustering stays enabled while column order is fixed.
    """
    sort_by = ["4", "0"]
    for f in f_list:
        printT("read data ", data_path_model % (f, name))
        edges_pre = pd.read_csv(data_path_model % (f, name))
        # BUGFIX: sort_values returns a new frame; the result was
        # previously discarded, so the sort had no effect.
        edges_pre = edges_pre.sort_values(sort_by)

        printT("draw ", name)

        plt.figure()
        plt.suptitle(name + f)

        # NOTE(review): sns.clustermap creates its own figure, so the
        # figure/suptitle above may not appear in the saved image --
        # confirm the intended output.
        sns.clustermap(edges_pre[["0", "1", "2", "3", "4"]],
                       col_cluster=False,
                       cmap=sns.diverging_palette(270, 5, as_cmap=True))

        plt.savefig(out_path_pic_model % (name, f))

    printT("finish ", name)
Beispiel #8
0
def runAll():
    """Run the full preprocessing pipeline end to end: load the data,
    filter low-expression features, log-transform, compute the Spearman
    similarity matrices, and drop non-significant edges."""
    initFile(args)
    initCode()

    # start

    printT("1.0 read data")
    initData.createDataSet()

    # Drop features whose mean stays below the configured cutoff.
    printT("1.1 initData min by", args.cut_value)
    initData.cutOffByMin(minNum=args.cut_value)
    printT("1.2 change data to log")
    initData.toLog()

    printT("2 calculate the similarity matrix")
    calculate.getSprearmon(need_output=True)

    # Edges with p-value above the threshold were NaN'd upstream; this
    # removes labels that became entirely NaN.
    printT("3 delete pvalue larger than", args.remain_p_value)
    calculate.del_non_value()

    # # if want to compare co-expression with expression value
    # printT("cal wilcox")
    # cal_wilconxon.get_wilcox()

    printT("over.")
Beispiel #9
0
def run(namelist, path_model, sub_model, stat_model):
    """Tabulate node/edge statistics for each dataset's pos/neg trend
    networks and their largest / second-largest subgraphs; the summary
    table is written to ``stat_model``.

    Parameters:
        namelist:   dataset names to process.
        path_model: printf-style path for the full edge CSV, filled with
                    (trend, name).
        sub_model:  printf-style path for the subgraph list file, filled
                    with (trend, name).
        stat_model: output path for the statistics CSV.
    """
    df_stat = pd.DataFrame(None, columns=namelist)
    trendlist = ['pos', 'neg']

    # Per-dataset overlap counts between pos and neg subgraph nodes.
    # NOTE(review): df_result is filled below but never saved or returned
    # by this function -- confirm whether that is intentional.
    df_result = pd.DataFrame(None, columns=['C1', 'C2', 'C3', 'C4'])

    for name in namelist:
        index_maxsub_s = []  # largest-subgraph node lists, one per trend
        index_secsub_s = []  # second-largest-subgraph node lists, per trend
        printT()
        printT(name)
        for trend in trendlist:
            subgraph_path = sub_model % (trend, name)
            printT(subgraph_path, end=" ")
            # First two lines of the file hold the largest and second
            # largest subgraph node lists, comma-separated.
            with open(subgraph_path, 'r') as f:
                lines = f.readlines()
                points_list = lines[0].split(', ')
                points_list2 = lines[1].split(', ')
            points_list = [pre.replace("'", "").strip() for pre in points_list]
            points_list2 = [
                pre.replace("'", "").strip() for pre in points_list2
            ]
            printT(len(points_list))
            index_maxsub_s.append(points_list)
            index_secsub_s.append(points_list2)
            df_stat.at['sub_' + trend + '_points',
                       name] = len(list(set(points_list)))
            df_stat.at['sec_' + trend + '_points',
                       name] = len(list(set(points_list2)))

        node_count_s = []
        for i, trend in enumerate(trendlist):
            data_path = path_model % (trend, name)
            data = pd.read_csv(data_path)
            df_stat.at['all_' + trend + '_edges', name] = data.shape[0]
            df_stat.at['all_' + trend + '_points', name] = len(
                list(set(data['Node_A'].tolist() + data['Node_B'].tolist())))
            printT(data.shape, end="->")

            # Edges whose endpoints both lie in the second-largest subgraph.
            data_sec = data[data['Node_A'].isin(index_secsub_s[i])]
            data_sec = data_sec[data_sec['Node_B'].isin(index_secsub_s[i])]
            printT(data_sec.shape)
            df_stat.at['sec_' + trend + '_edges', name] = data_sec.shape[0]

            # Restrict the edge table to the largest subgraph.
            data = data[data['Node_A'].isin(index_maxsub_s[i])]
            data = data[data['Node_B'].isin(index_maxsub_s[i])]
            printT(data.shape, data_path)
            df_stat.at['sub_' + trend + '_edges', name] = data.shape[0]

            index = data['Node_A'].tolist() + data['Node_B'].tolist()
            df_stat.at['sub_' + trend + '_points',
                       name] = len(list(set(index)))
            # Node occurrence count == node degree within the subgraph.
            result = Counter(index)
            node_count_pre = pd.DataFrame(list(result.most_common()),
                                          columns=['Node', trend + '_num'])
            node_count_pre.set_index(["Node"], inplace=True)

            printT(i, "node count", node_count_pre.shape)
            node_count_s.append(node_count_pre)
        # Outer-join the pos and neg degree tables on node label.
        node_count_df = node_count_s[0].join(node_count_s[1], how='outer')
        printT("all node count", node_count_df.shape)

        node_count_df = node_count_df.fillna(0)
        # C1: nodes present in both trends; C2: degree > 5 in both;
        # C3/C4: nodes exclusive to the pos / neg trend respectively.
        c1 = node_count_df[(node_count_df[trendlist[0] + '_num'] != 0)
                           & (node_count_df[trendlist[1] + '_num'] != 0)]
        c2 = node_count_df[(node_count_df[trendlist[0] + '_num'] > 5)
                           & (node_count_df[trendlist[1] + '_num'] > 5)]
        c3_pos = node_count_df[(node_count_df[trendlist[0] + '_num'] >= 1)
                               & (node_count_df[trendlist[1] + '_num'] == 0)]
        c3_neg = node_count_df[(node_count_df[trendlist[0] + '_num'] == 0)
                               & (node_count_df[trendlist[1] + '_num'] >= 1)]

        df_result.loc[name] = [
            c1.shape[0], c2.shape[0], c3_pos.shape[0], c3_neg.shape[0]
        ]

    df_stat = df_stat.sort_index()
    print(df_stat)
    df_stat.to_csv(stat_model)
Beispiel #10
0
def each_data(data_name, cor_cut, limit_up, limit_down, model_4trend,
              path_model_cor, dir_int_str, trends_pos_int_list,
              trends_neg_int_list, model_edges_count):
    """Build 4-step correlation-trend edge tables for one dataset.

    Reads the 5 stage correlation matrices, thresholds them at
    ``cor_cut``, derives a consistent sign per edge, classifies the
    stage-to-stage changes into up/flat/down trend codes, and writes the
    edges matching the positive / negative codes to ``model_4trend``.
    """
    adj_sum = None
    na_num_list = []

    cors_ori_pd = []
    cal_sign_np = None
    for i in range(5):
        # init
        printT(path_model_cor % (data_name, i))
        cor_pre_pd = pd.read_csv(path_model_cor % (data_name, i), index_col=0)
        # BUGFIX: threshold BEFORE taking the upper triangle. Previously
        # np.triu copied the raw values first and the frame was rebuilt
        # from that copy, so the cor_cut filter was silently discarded.
        cor_pre_pd[np.abs(cor_pre_pd) < cor_cut] = 0
        cor_pre_np = np.triu(cor_pre_pd.values, 1)
        cor_pre_pd = pd.DataFrame(cor_pre_np,
                                  index=cor_pre_pd.index,
                                  columns=cor_pre_pd.columns)

        cor_pre_pd = cor_pre_pd.fillna(0)

        if adj_sum is None:
            adj_sum = cor_pre_pd
        else:
            adj_sum = adj_sum + abs(cor_pre_pd)
        na_num_list.append((cor_pre_pd != 0).sum().sum())

        # cal signal: encode each edge's sign as +1 (positive), 10
        # (negative), 0 (absent); summing the codes over stages lets the
        # "% 10 == 0" test below detect edges that are never positive.
        cors_ori_pd.append(cor_pre_pd)
        cors_sign_pre_np = np.where(cor_pre_pd > 0, 1, cor_pre_pd)
        # BUGFIX: the else-branch must carry the already-encoded array,
        # not the raw values, or positive edges lose their +1 code.
        cors_sign_pre_np = np.where(cors_sign_pre_np < 0, 10,
                                    cors_sign_pre_np)
        if cal_sign_np is None:
            cal_sign_np = cors_sign_pre_np
        else:
            cal_sign_np = cal_sign_np + cors_sign_pre_np
    na_num_list.append((adj_sum != 0).sum().sum())
    pd.DataFrame(na_num_list).to_csv(model_edges_count % data_name)

    # set abs or ori: flip edges that are never positive (some negative,
    # rest zero) so their magnitudes compare on a common sign.
    cal_sign_np = np.where((cal_sign_np % 10 == 0) & (cal_sign_np != 0), -1,
                           1)  # all<0 | all=0
    cors_abs = []
    printT("calculate sign", data_name)
    for i in range(5):
        cors_abs.append(cal_sign_np * cors_ori_pd[i])

    # cal trend up: discretize each stage-to-stage change into -1/0/+1.
    printT("calculate trend", data_name)
    trends = None
    trends_add = None
    for i in range(4):
        pre_distance = cors_abs[i + 1] - cors_abs[i]

        if i == 0:
            # First step: simple symmetric threshold at limit_up.
            pre_distance[pre_distance >= limit_up] = 1
            pre_distance[pre_distance <= -limit_up] = -1
            pre_distance[(pre_distance < limit_up)
                         & (pre_distance > -limit_up)] = 0
        else:
            """
            add>0 (-d,u)
            add=0 (-u,u)
            add<0 (-u,d)
            """
            # Hysteresis: continuing the running direction (trends_add)
            # only needs limit_down; reversing it needs limit_up.
            pre_distance[(pre_distance >= limit_up) & (trends_add >= 0)] = 1
            pre_distance[(pre_distance >= limit_down) & (trends_add < 0)] = 1

            pre_distance[(pre_distance <= -limit_down) & (trends_add > 0)] = -1
            pre_distance[(pre_distance <= -limit_up) & (trends_add <= 0)] = -1

            pre_distance[(pre_distance < limit_up)
                         & (pre_distance > -limit_down) & (trends_add > 0)] = 0
            pre_distance[(pre_distance < limit_up) & (pre_distance > -limit_up)
                         & (trends_add == 0)] = 0
            pre_distance[(pre_distance < limit_down)
                         & (pre_distance > -limit_up) & (trends_add < 0)] = 0

        if trends is None:
            trends = pre_distance
            trends_add = pre_distance
        else:
            # Append this step as the next base-10 digit of the trend code.
            trends = trends * 10 + pre_distance
            trends_add = trends_add + pre_distance

    # output

    index = trends.index
    columns = trends.columns

    for trend_pre, trends_int_list in zip(
        ["pos", "neg"], [trends_pos_int_list, trends_neg_int_list]):
        trends4_pre = pd.DataFrame(
            None,
            columns=["trend", "Node_A", "Node_B", "0", "1", "2", "3", "4"])

        printT("collect", data_name, trend_pre)
        for pre_trend in trends_int_list:
            # All (row, col) positions whose trend code matches.
            locs = np.where(trends == pre_trend)
            index_pre_name = [index[i] for i in locs[0]]
            columns_pre_name = [columns[i] for i in locs[1]]

            pd_pre = pd.DataFrame(None, columns=trends4_pre.columns)
            pd_pre["Node_A"] = index_pre_name
            pd_pre["Node_B"] = columns_pre_name
            for i in range(5):
                pd_pre[str(i)] = [
                    cors_ori_pd[i].at[a, b]
                    for a, b in zip(index_pre_name, columns_pre_name)
                ]
            pd_pre["trend"] = dir_int_str[pre_trend]
            trends4_pre = pd.concat([trends4_pre, pd_pre], axis=0)

        # BUGFIX: log message previously hard-coded "pos" for both trends.
        printT(data_name, trend_pre, trends4_pre.shape, "output",
               model_4trend % (trend_pre, data_name))
        trends4_pre.to_csv(model_4trend % (trend_pre, data_name))
Beispiel #11
0
 def readFile(path):
     """Load one tab-separated data table and append it to the global
     training-data list (``data.trainDatas_df``)."""
     frame = pd.read_csv(path, sep='\t')
     printT(' ' + path, frame.shape)
     data.trainDatas_df.append(frame)
Beispiel #12
0
def each_process_fold(sample_num, i, trainDataFrame, need_output):  # non-multi
    """Estimate the Spearman correlation matrix for data type ``i`` by
    averaging over 10 random column-subsample folds.

    Correlations are averaged arithmetically across folds; p-values are
    combined as a geometric mean.  Entries with a combined p-value above
    ``args.remain_p_value`` are masked to NaN.

    Parameters:
        sample_num:     number of columns (samples) drawn per fold.
        i:              data-type index (used in logs and output names).
        trainDataFrame: features x samples DataFrame to correlate.
        need_output:    if True, pickle the fold-averaged matrices.

    Returns:
        DataFrame of fold-averaged correlations with non-significant
        entries set to NaN.
    """
    import pickle
    printT("  calculate spearmon %d, " % i, trainDataFrame.shape)

    # # doesn't use fold
    # scc_unfold, p_unfold = getSimilarAndPvalue(trainDataFrame)
    # cDF_unfold = pd.DataFrame(scc_unfold, index=trainDataFrame.index, columns=trainDataFrame.index)
    # pDF_unfold = pd.DataFrame(p_unfold, index=trainDataFrame.index, columns=trainDataFrame.index)
    # printT("  output file", args.result_dir + args.data_name + '_' + str(i) + '_cor_unfold.pkl')
    # with open(args.result_dir + args.data_name + '_' + str(i) + '_cor_unfold.pkl', 'wb') as fw:
    #     pickle.dump([cDF_unfold], fw, 0)
    # printT("  output file", args.result_dir + args.data_name + '_' + str(i) + '_p_unfold.pkl')
    # with open(args.result_dir + args.data_name + '_' + str(i) + '_p_unfold.pkl', 'wb') as fw:
    #     pickle.dump([pDF_unfold], fw, 0)

    # NOTE(review): np.random.seed does not seed random.shuffle (used
    # below), so the fold sampling is not made reproducible by this call
    # -- confirm whether random.seed(66) was intended.
    np.random.seed(66)

    fold_num = 10
    sccDF = 0  # running sum of per-fold correlation frames
    pDF = 1  # running product of per-fold p-value frames

    # Build a shuffled pool of column indices large enough to supply
    # fold_num slices of sample_num indices each.
    need_g = 1 + int((sample_num * fold_num) / data.trainDatas_df[i].shape[1])
    list_all = []
    for nee in range(need_g):
        list_pre = [j for j in range(data.trainDatas_df[i].shape[1])]
        shuffle(list_pre)
        list_all.extend(list_pre)

    for j in range(fold_num):  # 10-fold
        # Consecutive slice of the shuffled pool for this fold.
        train_index = list_all[j * sample_num:(j + 1) * sample_num]
        printT("  spearmon %d by fold=%d/%d %d sample=%s" % (i, j, fold_num, len(train_index), train_index))

        trainDataFrame_pre = trainDataFrame.iloc[:, train_index]
        scc, p = getSimilarAndPvalue(trainDataFrame_pre)
        sccDF_pre = pd.DataFrame(scc, index=trainDataFrame_pre.index, columns=trainDataFrame_pre.index)
        pDF_pre = pd.DataFrame(p, index=trainDataFrame_pre.index, columns=trainDataFrame_pre.index)

        sccDF = sccDF + sccDF_pre
        pDF = pDF * pDF_pre

    printT("  spearmon %d finish" % i)
    # Arithmetic mean of correlations; geometric mean of p-values.
    cDF_result = sccDF / fold_num
    pDF_result = pDF ** (1 / fold_num)
    if need_output:
        printT("  output file", args.result_dir + args.data_name + '_' + str(i) + '_cor_fold.pkl')
        with open(args.result_dir + args.data_name + '_' + str(i) + '_cor_fold.pkl', 'wb') as fw:
            pickle.dump([cDF_result], fw, 0)

        printT("  output file", args.result_dir + args.data_name + '_' + str(i) + '_p_fold.pkl')
        with open(args.result_dir + args.data_name + '_' + str(i) + '_p_fold.pkl', 'wb') as fw:
            pickle.dump([pDF_result], fw, 0)

    printT("  delete spearmon %d by p>%f" % (i, args.remain_p_value))
    # Mask non-significant correlations.
    cDF_result[pDF_result > args.remain_p_value] = np.nan

    printT("  calculate spearmon %d finish" % i)
    return cDF_result