コード例 #1
0
def init_specific_params(score_file_name, score_method, omitted_genes,
                         network_file_name, ts):
    """Prepare the score file and the pruned network file for a run.

    Any existing ``results`` directory under ``ALGO_DIR`` is deleted first.
    Unless ``score_method`` equals ``constants.PREDEFINED_SCORE``, the file at
    ``score_file_name`` is loaded, its rows are ordered by ascending ``qval``
    and a 0/1 flag (1 when ``qval < 0.05``) is written per gene to
    ``binary_score_<score_method>.txt`` inside ``constants.CACHE_DIR``.

    Args:
        score_file_name: path of the score table; must have a ``qval`` column
            unless the score is predefined.
        score_method: scoring method identifier, also used in the name of the
            generated binary score file.
        omitted_genes: forwarded to ``remove_subgraph_by_nodes``.
        network_file_name: forwarded to ``remove_subgraph_by_nodes``.
        ts: forwarded to ``remove_subgraph_by_nodes`` as ``ts``.

    Returns:
        ``(score_file_name, new_network_file_name)``: the score file to use
        (the generated binary file when one was written, otherwise the input
        path) and the value returned by ``remove_subgraph_by_nodes``.
    """
    results_dir = os.path.join(ALGO_DIR, "results")
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)

    if score_method != constants.PREDEFINED_SCORE:
        deg = infra.load_gene_expression_profile_by_genes(
            gene_expression_path=score_file_name)
        h_rows, h_cols, deg_data = infra.separate_headers(deg)
        qval_col = np.where(h_cols == "qval")[0][0]
        order = np.argsort(deg_data[:, qval_col])
        deg_data = deg_data[order, :]
        h_rows = h_rows[order]
        is_significant = deg_data[:, qval_col] < 0.05
        # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
        # releases no longer provide.
        sig_binary_output = np.c_[h_rows,
                                  np.array(is_significant, dtype=int)]
        score_file_name = os.path.join(
            constants.CACHE_DIR, "binary_score_{}.txt".format(score_method))
        lines = ["\t".join(["id", "pval", "qval"])]
        # The binary flag is repeated so it fills both the pval and the qval
        # column of the output table.
        lines += ["\t".join(list(x) + [x[-1]]) for x in sig_binary_output]
        # Context manager guarantees the handle is flushed and closed.
        with open(score_file_name, "w+") as score_file:
            score_file.write("\n".join(lines))

    new_network_file_name = remove_subgraph_by_nodes(omitted_genes,
                                                     network_file_name,
                                                     ts=ts)
    return score_file_name, new_network_file_name
コード例 #2
0
ファイル: hotnet2_runner.py プロジェクト: hag007/nn_sb
def init_specific_params(score_file_name,
                         method=constants.DEG_EDGER,
                         network_file_name=os.path.join(
                             constants.NETWORKS_DIR, "dip.sif")):
    """Write the heat file and convert the network for a HotNet2 run.

    One line ``"<gene> <heat>"`` is written per row of the score table to
    ``heatfile.txt`` inside ``constants.CACHE_DIR``. The heat is the value of
    the ``qval`` column, replaced by its ``-log10`` when ``method`` is
    ``constants.PREDEFINED_SCORE`` and ``constants.IS_PVAL_SCORES`` is truthy.
    The network is then passed to ``sif2hotnet2`` together with a script
    generated from ``prepare_hotnet2.sh``; the generated script is deleted
    afterwards.

    Args:
        score_file_name: path of the score table; must have a ``qval`` column.
        method: scoring method identifier.
        network_file_name: path of the network file to convert.

    Returns:
        ``(heat_file_name, network_file_name)``.
    """
    script_file_name = format_script(os.path.join(constants.SH_DIR,
                                                  "prepare_hotnet2.sh"),
                                     ALGO_DIR=ALGO_DIR,
                                     CACHE_DIR=constants.CACHE_DIR,
                                     cwd=ALGO_DIR)

    heat_file_name = os.path.join(constants.CACHE_DIR, "heatfile.txt")

    deg = infra.load_gene_expression_profile_by_genes(
        gene_expression_path=score_file_name)
    h_rows, h_cols, deg_data = infra.separate_headers(deg)
    ind = np.where(h_cols == "qval")[0][0]

    # Decide once whether the scores need the -log10 transform instead of
    # duplicating the whole loop for each case.
    use_neg_log = (method == constants.PREDEFINED_SCORE
                   and constants.IS_PVAL_SCORES)
    lns = []
    for i, cur in enumerate(deg_data):
        heat = -log10(cur[ind]) if use_neg_log else cur[ind]
        lns.append(" ".join([str(h_rows[i]), str(heat)]))

    # Context manager guarantees the handle is flushed and closed before the
    # file is consumed downstream.
    with open(heat_file_name, "w+") as heat_file:
        heat_file.write("\n".join(lns))

    sif2hotnet2(network_file_name, script_file_name)
    os.remove(script_file_name)
    return heat_file_name, network_file_name
コード例 #3
0
def prepare_input(method=constants.DEG_EDGER, network_name="dip"):
    """Build the input files derived from a network and a DEG table.

    The network ``<network_name>.sif`` (tab separated, with the columns
    ``ID_interactor_A`` and ``ID_interactor_B``) and the table
    ``deg_<method>.tsv`` from ``constants.CACHE_DIR`` are loaded. The
    background genes are the genes present in both. Three files are written:

    * ``cosine_ge.tsv`` in ``constants.OUTPUT_DIR``: the table rows of the
      background genes;
    * ``cosine_bg_genes.txt`` in ``constants.OUTPUT_DIR``: one background gene
      per line;
    * ``ppi_i.txt`` in ``constants.OUTPUT_GLOBAL_DIR``: one line per network
      edge whose two endpoints are background genes, holding the 1-based
      positions of the endpoints in the background gene list.

    Args:
        method: identifier used to pick the ``deg_<method>.tsv`` file.
        network_name: base name of the ``.sif`` network file.

    Returns:
        ``(network_file_name, bg_genes, vertices, ppi_file_name,
        ge_file_name)`` where the last two are the paths of ``ppi_i.txt`` and
        ``cosine_ge.tsv``.
    """
    ge_file_name = os.path.join(constants.CACHE_DIR,
                                "deg_{}.tsv".format(method))
    network_file_name = os.path.join(constants.NETWORKS_DIR,
                                     "{}.sif".format(network_name))

    network_df = pd.read_csv(network_file_name, sep="\t")
    src = np.array(network_df["ID_interactor_A"])
    dst = np.array(network_df["ID_interactor_B"])

    vertices = list(set(np.append(src, dst)))

    deg = infra.load_gene_expression_profile_by_genes(
        gene_expression_path=ge_file_name)
    h_rows, h_cols, deg_data = infra.separate_headers(deg)
    bg_genes = list(set(vertices).intersection(set(h_rows)))

    # 1-based position of every background gene; a dict lookup replaces the
    # repeated linear ``in`` / ``.index`` scans over the list.
    gene_number = {gene: pos + 1 for pos, gene in enumerate(bg_genes)}
    ppi_i = [
        "\t".join([str(gene_number[a]), str(gene_number[b])])
        for a, b in zip(src, dst)
        if a in gene_number and b in gene_number
    ]

    # First table row of every background gene, in background-gene order.
    normalized_ge = [
        deg_data[np.where(h_rows == cur_v)[0][0]] for cur_v in bg_genes
    ]

    cosine_ge_file_name = os.path.join(constants.OUTPUT_DIR, "cosine_ge.tsv")
    pd.DataFrame(normalized_ge, index=bg_genes, columns=h_cols).to_csv(
        sep="\t", path_or_buf=cosine_ge_file_name)

    # The path is already rooted at OUTPUT_DIR; joining OUTPUT_DIR a second
    # time produced a nested path whenever OUTPUT_DIR was relative.
    bg_genes_file_name = os.path.join(constants.OUTPUT_DIR,
                                      "cosine_bg_genes.txt")
    with open(bg_genes_file_name, "w+") as bg_file:
        bg_file.write("\n".join(bg_genes))

    ppi_file_name = os.path.join(constants.OUTPUT_GLOBAL_DIR, "ppi_i.txt")
    with open(ppi_file_name, "w+") as ppi_file:
        ppi_file.write("\n".join(ppi_i))

    return network_file_name, bg_genes, vertices, \
           ppi_file_name, cosine_ge_file_name
コード例 #4
0
def prepare_input(gene_expression_file_name="ge.tsv", groups=None):
    """Load an expression table and the sample groups that go with it.

    Args:
        gene_expression_file_name: name of the expression file handed to
            ``infra.load_gene_expression_profile_by_genes``.
        groups: optional per-sample group labels. When omitted they are read
            with ``infra.load_classes()`` if ``classes.tsv`` exists in
            ``constants.DATA_DIR``; otherwise ``[1, 1, 1, 2, 2, 2]`` is used.

    Returns:
        ``(conditions, data, genes, groups)``: the column headers as an array,
        the values as a float ``DataFrame`` indexed by gene, the row headers,
        and the group labels as an integer array.
    """
    # The original test was inverted (``is not None``), so labels supplied by
    # the caller were always discarded and reloaded from disk.
    if groups is None:
        if os.path.exists(os.path.join(constants.DATA_DIR, "classes.tsv")):
            groups = infra.load_classes()
        else:
            groups = [1, 1, 1, 2, 2, 2]

    ge_raw = infra.load_gene_expression_profile_by_genes(
        gene_expression_file_name=gene_expression_file_name)
    genes, conditions, data = infra.separate_headers(ge_raw)
    conditions = np.array(conditions)
    # Builtin int/float replace the np.int/np.float aliases, which newer
    # NumPy releases no longer provide.
    groups = np.array(groups, dtype=int)
    data = pd.DataFrame(data, index=genes, columns=conditions, dtype=float)
    return conditions, data, genes, groups
コード例 #5
0
ファイル: add_t_test_to_ge.py プロジェクト: hag007/nn_sb
def calc_ttest(dataset=constants.DATASET_NAME,
               gene_expression_file_name="ge.tsv"):
    """Run a per-gene two-group test and cache the annotated table.

    For every gene, the samples labelled 1 are compared with the samples
    labelled 2 (labels come from ``infra.load_classes()``) using
    ``ttest_ind``. Genes whose p-value is NaN are reported on stdout and
    dropped. The remaining p-values are corrected with ``fdrcorrection0`` and
    the table, extended with the columns ``pval``, ``qval`` and ``qscore``
    (``-log10`` of the q-value), is written to ``deg_t.tsv`` inside
    ``constants.CACHE_DIR``.

    Args:
        dataset: not used by this function; kept for interface compatibility.
        gene_expression_file_name: name of the expression file to load.

    Returns:
        ``{"result": DataFrame}`` where the frame is ``deg_t.tsv`` read back
        with the gene id as index.
    """
    h_rows, h_cols, ge_dataset = infra.separate_headers(
        infra.load_gene_expression_profile_by_genes(
            gene_expression_file_name=gene_expression_file_name))
    classes = np.array(infra.load_classes()).astype(int)

    pvals = []
    keep = np.ones((len(h_rows), ), bool)
    for i, _gene in enumerate(h_rows):
        case = ge_dataset[i, classes == 1]
        wt = ge_dataset[i, classes == 2]
        pval = ttest_ind(case, wt).pvalue
        if np.isnan(pval):
            # Single-argument call form works as a statement on Python 2 and
            # as a function call on Python 3.
            print("case: {}, wt: {}".format(case, wt))
            keep[i] = False
        else:
            pvals.append(pval)

    h_rows = h_rows[keep]
    ge_dataset = ge_dataset[keep, :]

    qvals = fdrcorrection0(pvals, alpha=0.05, method='indep',
                           is_sorted=False)[1]
    qscores = [-log10(qval) for qval in qvals]

    output_h_cols = ["id"] + list(h_cols) + ["pval", "qval", "qscore"]

    output_matrix = np.c_[h_rows, ge_dataset, pvals, qvals, qscores]
    output_matrix = np.r_[np.reshape(output_h_cols, (1, len(output_h_cols))),
                          output_matrix]

    lines = ["\t".join(row) for row in output_matrix]

    deg_file_name = os.path.join(constants.CACHE_DIR, "deg_t.tsv")
    # The handle must be closed before the file is read back below.
    with open(deg_file_name, "w+") as deg_file:
        deg_file.write("\n".join(lines))
    return {
        "result":
        pd.read_csv(deg_file_name, sep="\t", index_col=0)
    }
コード例 #6
0
ファイル: add_edgeR_test.py プロジェクト: hag007/nn_sb
def main(count_file):
    """Load the expression table as integer counts and pass it to R.

    Args:
        count_file: path used only to derive the ``<base>-diffs.csv`` output
            name.

    NOTE(review): ``outfile`` and ``probs`` are not used in the visible code;
    the function looks truncated here, so they are kept in case later
    statements rely on them - confirm against the full file.
    """
    base, ext = os.path.splitext(count_file)
    outfile = "%s-diffs.csv" % (base)
    ge_raw = infra.load_gene_expression_profile_by_genes()

    genes, conditions, data = infra.separate_headers(ge_raw)

    # Hard-coded design: the first three samples form group 1, the last three
    # group 2.
    group = [1, 1, 1, 2, 2, 2]
    conditions = np.array(conditions)
    group = np.array(group)
    # Builtin int replaces the np.int alias, which newer NumPy releases no
    # longer provide.
    data = pd.DataFrame(
        data,
        index=genes,
        columns=conditions,
        dtype=int,
    )
    probs = run_rscript(data=data,
                        genes=genes,
                        conditions=conditions,
                        group=group)
コード例 #7
0
ファイル: matisse_runner.py プロジェクト: hag007/nn_sb
def init_specific_params(ge_file_name=os.path.join(constants.DATA_DIR,
                                                   "ge.tsv"),
                         network_file_name=os.path.join(
                             constants.NETWORKS_DIR, NETWORK_NAME + ".sif")):
    """Write the expression table in the layout used for the Matisse run.

    The table is loaded, prefixed with the columns ``gene ID`` (the row
    header) and ``GeneName`` (first result of ``e2g_convertor`` for that id,
    NaN when the converter returns nothing), de-duplicated on ``gene ID``
    keeping the first occurrence, and saved tab separated next to the input
    file as ``<name>_mts.tsv``.

    Args:
        ge_file_name: path of the expression file to convert.
        network_file_name: path of the network file; returned unchanged.

    Returns:
        ``(ge_file_name_mts, network_file_name, output_file_name)`` where
        ``output_file_name`` is ``matisse_output.txt`` inside
        ``constants.OUTPUT_DIR``.
    """
    h_rows, h_columns, values = infra.separate_headers(
        infra.load_gene_expression_profile_by_genes(
            gene_expression_file_name=ge_file_name))
    df_ge = pd.DataFrame(columns=h_columns, index=h_rows, data=values)
    df_ge_cond_col = df_ge.columns
    df_ge["gene ID"] = df_ge.index

    # Convert each id once; the original called the converter twice per gene.
    gene_names = []
    for cur in df_ge.index:
        converted = e2g_convertor([cur])
        # np.nan is the spelling available in every NumPy release.
        gene_names.append(converted[0] if len(converted) > 0 else np.nan)
    df_ge["GeneName"] = gene_names

    df_ge = df_ge[["gene ID", "GeneName"] + list(df_ge_cond_col)]
    df_ge = df_ge[~df_ge['gene ID'].duplicated(keep='first')]
    ge_file_name_mts = os.path.splitext(ge_file_name)[0] + "_mts.tsv"
    df_ge.to_csv(ge_file_name_mts, index=False, sep="\t")

    output_file_name = os.path.join(constants.OUTPUT_DIR, "matisse_output.txt")
    return ge_file_name_mts, network_file_name, output_file_name
コード例 #8
0
def init_specific_params(score_file_name, dest_algo_dir):
    """Write the significant-gene list and the run configuration file.

    The score table is ordered by ascending ``qval``. The genes that precede
    the first row whose q-value exceeds 0.05, and for which
    ``ensembl2entrez_convertor`` returns a non-empty result, are written one
    per line to ``ge_list.txt`` inside ``constants.OUTPUT_DIR``. That path is
    then passed to ``format_script`` together with the thresholds to fill in
    ``conf.props`` from ``dest_algo_dir``.

    Args:
        score_file_name: path of the score table; must have a ``qval`` column.
        dest_algo_dir: directory that contains the ``conf.props`` template.

    Returns:
        The value returned by ``format_script`` for ``conf.props``.
    """
    deg = infra.load_gene_expression_profile_by_genes(
        gene_expression_path=score_file_name)
    h_rows, h_cols, deg_data = infra.separate_headers(deg)

    ind = np.where(h_cols == "qval")[0][0]
    ordered_ind = np.argsort(deg_data[:, ind])
    deg_data = deg_data[ordered_ind, :]
    h_rows = h_rows[ordered_ind]

    # Index of the first q-value above the threshold. When no row exceeds it
    # every gene is kept; the original raised IndexError in that case.
    above_threshold = np.where(deg_data[:, ind] > 0.05)[0]
    if len(above_threshold) > 0:
        last_q_index = above_threshold[0]
    else:
        last_q_index = len(h_rows)

    ge_list_file_name = os.path.join(constants.OUTPUT_DIR, "ge_list.txt")
    convertible_genes = [
        x for x in h_rows[:last_q_index]
        if len(ensembl2entrez_convertor([x])) > 0
    ]
    # Context manager guarantees the list is flushed before it is referenced
    # from the configuration file.
    with open(ge_list_file_name, "w+") as ge_list_file:
        ge_list_file.write("\n".join(convertible_genes))

    conf_file = "conf.props"
    conf_file_name = format_script(os.path.join(dest_algo_dir, conf_file),
                                   pval_threshold=0.05,
                                   sp_threshold=2,
                                   gene_file=ge_list_file_name)
    return conf_file_name
コード例 #9
0
if __name__ == "__main__":
    # Script entry point: run the PinnacleZ shell script, collect the module
    # genes from its result file, write module and background gene lists and
    # run the enrichment check on them.
    constants.update_dirs(DATASET_NAME_u="TNFa")
    params = get_parameters()
    if params is not None:
        args, NETWORK_NAME, dataset_name = params

    # NOTE(review): the command ends with a literal ".format" inside the
    # string, which looks like a misplaced ``.format(...)`` call. It is left
    # untouched here - confirm the intended script name.
    pinnaclez_process = subprocess.Popen(
        "bash ../sh/scripts/run_pinnaclez.sh.format",
        shell=True,
        stdout=subprocess.PIPE)
    # communicate() drains stdout and also waits for the child to exit.
    print(pinnaclez_process.communicate()[0])

    results_file_name = os.path.join(constants.OUTPUT_DIR,
                                     "pinnaclez_results.txt")
    with open(results_file_name) as results_file:
        results = results_file.read().split()
    # Unique whitespace-separated tokens that start with "ENSG".
    module_genes = list({x for x in results if x.startswith("ENSG")})

    dip_network = pd.read_csv(os.path.join(constants.NETWORKS_DIR,
                                           "dip_out.sif"),
                              sep="\t",
                              index_col=False,
                              header=None)
    # Background genes are the values of the first and third columns.
    # ``.iloc`` replaces ``.ix``, which newer pandas releases no longer have.
    bg_genes = set(dip_network.iloc[:, 0]).union(set(dip_network.iloc[:, 2]))
    # NOTE(review): the loaded headers are not used below; the call is kept
    # so a missing expression profile still fails here - confirm it is needed.
    exp_genes, _1, _2 = infra.separate_headers(
        infra.load_gene_expression_profile())
    bg_genes = list(bg_genes)

    with open(os.path.join(constants.OUTPUT_DIR, "pinnaclez_bg_genes.txt"),
              "w+") as bg_file:
        bg_file.write("\n".join(bg_genes))
    with open(os.path.join(constants.OUTPUT_DIR,
                           "pinnaclez_module_genes.txt"),
              "w+") as module_file:
        module_file.write("\n".join(module_genes))

    utils.go.check_group_enrichment(module_genes, bg_genes)
コード例 #10
0
def prepare_input(method=constants.DEG_EDGER, network_name="dip"):
    """Build the network and gene-set input files in ``constants.OUTPUT_DIR``.

    The network ``<network_name>.sif`` (tab separated, with the columns
    ``ID_interactor_A`` and ``ID_interactor_B``) and the table
    ``deg_<method>.tsv`` from ``constants.CACHE_DIR`` are loaded, and these
    files are written:

    * ``keypathwayminer_bg_genes.txt`` and ``proteins``: every network vertex,
      one per line;
    * ``A``: the symmetric 0/1 adjacency matrix, tab separated;
    * ``vlist``: per vertex its 1-based number, a weight of 1 and the number
      of edge rows that touch it (a self-loop row counts once);
    * ``degs``: vertices among the rows that precede the first q-value above
      0.05 after ordering by ascending ``qval``;
    * ``background``: vertices that also appear in the table;
    * ``random_sets``: 10000 random vertex samples, each as large as ``degs``.

    Args:
        method: identifier used to pick the ``deg_<method>.tsv`` file.
        network_name: base name of the ``.sif`` network file.

    Returns:
        A tuple of the network file path, the vertex list, the paths of
        ``A``, ``degs``, ``proteins``, ``vlist``, ``background`` and
        ``random_sets``, the threshold ``0.05`` and the path
        ``moduledicoverer_output.txt`` inside ``constants.OUTPUT_DIR``.
    """
    output_dir = constants.OUTPUT_DIR

    def _dump(name, text):
        # Write ``text`` to ``name`` in the output directory, closing the
        # handle deterministically, and return the full path.
        path = os.path.join(output_dir, name)
        with open(path, "w+") as out:
            out.write(text)
        return path

    deg_file_name = os.path.join(constants.CACHE_DIR,
                                 "deg_{}.tsv".format(method))
    network_file_name = os.path.join(constants.NETWORKS_DIR,
                                     "{}.sif".format(network_name))

    network_df = pd.read_csv(network_file_name, sep="\t")
    src = np.array(network_df["ID_interactor_A"])
    dst = np.array(network_df["ID_interactor_B"])

    vertices = list(set(np.append(src, dst)))
    n_vertices = len(vertices)
    # Vertex -> row/column position; replaces repeated linear ``.index``
    # scans for every edge.
    position = {vertex: pos for pos, vertex in enumerate(vertices)}

    A = np.zeros((n_vertices, n_vertices))
    degree = np.zeros(n_vertices, dtype=int)
    for a, b in zip(src, dst):
        i, j = position[a], position[b]
        A[i, j] = 1
        A[j, i] = 1
        # The set makes a self-loop increment its vertex only once.
        for k in {i, j}:
            degree[k] += 1

    # Builtin int replaces the np.int alias, which newer NumPy releases no
    # longer provide.
    vlist = pd.DataFrame(np.c_[np.arange(1, n_vertices + 1),
                               np.ones(n_vertices),
                               degree],
                         index=list(range(n_vertices)),
                         columns=['content', 'weight', 'degree'],
                         dtype=int)

    deg = infra.load_gene_expression_profile_by_genes(
        gene_expression_path=deg_file_name)
    h_rows, h_cols, deg_data = infra.separate_headers(deg)
    ind = np.where(h_cols == "qval")[0][0]
    ordered_ind = np.argsort(deg_data[:, ind])
    deg_data = deg_data[ordered_ind, :]
    h_rows = h_rows[ordered_ind]

    # Index of the first q-value above the threshold. When no row exceeds it
    # every gene is significant; the original raised IndexError in that case.
    above_threshold = np.where(deg_data[:, ind] > 0.05)[0]
    if len(above_threshold) > 0:
        sig_last_index = above_threshold[0]
    else:
        sig_last_index = len(h_rows)

    degs = list(set(h_rows[:sig_last_index]).intersection(vertices))
    background = list(set(vertices).intersection(set(h_rows)))
    random_sets = [random.sample(vertices, len(degs)) for x in range(10000)]

    bg_genes = vertices
    # Written once under OUTPUT_DIR; the original joined OUTPUT_DIR twice,
    # which produced a nested path whenever OUTPUT_DIR was relative.
    _dump("keypathwayminer_bg_genes.txt", "\n".join(bg_genes))

    a_file_name = _dump(
        "A", "\n".join(["\t".join([str(y) for y in x]) for x in A]))
    degs_file_name = _dump("degs", "\n".join(degs))
    proteins_file_name = _dump("proteins", "\n".join(vertices))
    vlist_file_name = _dump(
        "vlist",
        "\t".join(vlist.columns) + "\n" + "\n".join(
            ["\t".join([str(y) for y in x]) for i, x in vlist.iterrows()]))
    background_file_name = _dump("background", "\n".join(background))
    random_sets_file_name = _dump(
        "random_sets", "\n".join(["\t".join(x) for x in random_sets]))

    return network_file_name, bg_genes, \
           a_file_name, degs_file_name, \
           proteins_file_name, vlist_file_name, \
           background_file_name, random_sets_file_name, \
           0.05, os.path.join(output_dir, "moduledicoverer_output.txt")