def init_specific_params(score_file_name, score_method, omitted_genes,
                         network_file_name, ts):
    """Reset the results directory and prepare the score and network inputs.

    Any existing ``results`` directory under ``ALGO_DIR`` is removed. Unless
    ``score_method`` equals ``constants.PREDEFINED_SCORE``, the table at
    ``score_file_name`` is loaded, ordered by its ``qval`` column, and turned
    into a binary score file in ``constants.CACHE_DIR`` (1 where
    ``qval < 0.05``, otherwise 0). The returned score path then points at
    that new file.

    Args:
        score_file_name: Path of the score table; it must contain a ``qval``
            column when a binary file has to be derived.
        score_method: Scoring method identifier; also embedded in the name of
            the generated binary score file.
        omitted_genes: Forwarded to ``remove_subgraph_by_nodes``.
        network_file_name: Path of the network file to prune.
        ts: Forwarded to ``remove_subgraph_by_nodes`` as ``ts``.

    Returns:
        Tuple ``(score_file_name, new_network_file_name)``.
    """
    results_dir = os.path.join(ALGO_DIR, "results")
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)

    if score_method != constants.PREDEFINED_SCORE:
        deg = infra.load_gene_expression_profile_by_genes(
            gene_expression_path=score_file_name)
        h_rows, h_cols, deg_data = infra.separate_headers(deg)

        qval_col = np.where(h_cols == "qval")[0][0]
        order = np.argsort(deg_data[:, qval_col])
        deg_data = deg_data[order, :]
        h_rows = h_rows[order]

        sig_binary_col = deg_data[:, qval_col] < 0.05
        # Builtin ``int`` is what the removed ``np.int`` alias pointed to.
        sig_binary_output = np.c_[h_rows, np.array(sig_binary_col, dtype=int)]

        score_file_name = os.path.join(
            constants.CACHE_DIR, "binary_score_{}.txt".format(score_method))
        # The same binary flag is written under both "pval" and "qval".
        lines = ["\t".join(["id", "pval", "qval"])]
        lines += ["\t".join(list(x) + [x[-1]]) for x in sig_binary_output]
        with open(score_file_name, "w+") as out:
            out.write("\n".join(lines))

    new_network_file_name = remove_subgraph_by_nodes(
        omitted_genes, network_file_name, ts=ts)
    return score_file_name, new_network_file_name
def init_specific_params(score_file_name, method=constants.DEG_EDGER,
                         network_file_name=os.path.join(
                             constants.NETWORKS_DIR, "dip.sif")):
    """Write the heat file and convert the network for the hotnet2 script.

    The ``qval`` column of the table at ``score_file_name`` is written to
    ``heatfile.txt`` in ``constants.CACHE_DIR`` as ``"<row id> <value>"``
    lines. When ``method`` is ``constants.PREDEFINED_SCORE`` and
    ``constants.IS_PVAL_SCORES`` is truthy the value is ``-log10(qval)``,
    otherwise the raw column value. The network is then passed to
    ``sif2hotnet2`` with a script generated from ``prepare_hotnet2.sh``,
    and that script is deleted afterwards.

    Args:
        score_file_name: Path of the score table (needs a ``qval`` column).
        method: Scoring method identifier.
        network_file_name: Path of the ``.sif`` network file.

    Returns:
        Tuple ``(heat_file_name, network_file_name)``.
    """
    script_file_name = format_script(
        os.path.join(constants.SH_DIR, "prepare_hotnet2.sh"),
        ALGO_DIR=ALGO_DIR, CACHE_DIR=constants.CACHE_DIR, cwd=ALGO_DIR)
    heat_file_name = os.path.join(constants.CACHE_DIR, "heatfile.txt")

    deg = infra.load_gene_expression_profile_by_genes(
        gene_expression_path=score_file_name)
    h_rows, h_cols, deg_data = infra.separate_headers(deg)
    ind = np.where(h_cols == "qval")[0][0]

    # Decide once whether the scores have to be log-transformed.
    use_log = method == constants.PREDEFINED_SCORE and constants.IS_PVAL_SCORES
    lns = []
    for i, cur in enumerate(deg_data):
        heat = -log10(cur[ind]) if use_log else cur[ind]
        lns.append(" ".join([str(h_rows[i]), str(heat)]))

    with open(heat_file_name, "w+") as out:
        out.write("\n".join(lns))

    sif2hotnet2(network_file_name, script_file_name)
    os.remove(script_file_name)
    # file(os.path.join(constants.OUTPUT_DIR, "hotnet2_bg_genes.txt"), "w+").write("\n".join(bg_genes))
    return heat_file_name, network_file_name
def prepare_input(method=constants.DEG_EDGER, network_name="dip"):
    """Build the input files for the cosine-based run.

    Reads the network ``<network_name>.sif`` from ``constants.NETWORKS_DIR``
    and the table ``deg_<method>.tsv`` from ``constants.CACHE_DIR``. The
    background genes are the identifiers present in both. Three files are
    written:

    * ``cosine_ge.tsv`` (``constants.OUTPUT_DIR``): the table rows of the
      background genes.
    * ``cosine_bg_genes.txt`` (``constants.OUTPUT_DIR``): one background gene
      per line.
    * ``ppi_i.txt`` (``constants.OUTPUT_GLOBAL_DIR``): network edges whose
      two endpoints are background genes, as tab-separated 1-based positions
      in the background gene list.

    Args:
        method: Identifier used to pick the ``deg_<method>.tsv`` table.
        network_name: Base name of the ``.sif`` network file.

    Returns:
        Tuple ``(network_file_name, bg_genes, vertices, ppi_i_path,
        cosine_ge_path)``.
    """
    # A single format call is enough; the repeated calls were no-ops at best.
    ge_file_name = os.path.join(constants.CACHE_DIR,
                                "deg_{}.tsv".format(method))
    network_file_name = os.path.join(constants.NETWORKS_DIR,
                                     "{}.sif".format(network_name))
    network_df = pd.read_csv(network_file_name, sep="\t")
    src = np.array(network_df["ID_interactor_A"])
    dst = np.array(network_df["ID_interactor_B"])
    vertices = list(set(np.append(src, dst)))

    deg = infra.load_gene_expression_profile_by_genes(
        gene_expression_path=ge_file_name)
    h_rows, h_cols, deg_data = infra.separate_headers(deg)
    bg_genes = list(set(vertices).intersection(set(h_rows)))

    # Map each background gene to its 1-based position once instead of
    # scanning the list for every edge.
    bg_position = {gene: str(i + 1) for i, gene in enumerate(bg_genes)}
    ppi_i = [
        "\t".join([bg_position[a], bg_position[b]])
        for a, b in zip(src, dst)
        if a in bg_position and b in bg_position
    ]

    # First row index of every identifier in the table (same row the
    # previous ``np.where(...)[0][0]`` lookup selected).
    first_row = {}
    for i, gene in enumerate(h_rows):
        first_row.setdefault(gene, i)
    normalized_ge = [deg_data[first_row[gene]] for gene in bg_genes]

    ge_output_path = os.path.join(constants.OUTPUT_DIR, "cosine_ge.tsv")
    pd.DataFrame(normalized_ge, index=bg_genes, columns=h_cols).to_csv(
        sep="\t", path_or_buf=ge_output_path)

    # The path already includes OUTPUT_DIR; joining it a second time
    # duplicated the directory whenever OUTPUT_DIR is relative.
    bg_genes_file_name = os.path.join(constants.OUTPUT_DIR,
                                      "cosine_bg_genes.txt")
    with open(bg_genes_file_name, "w+") as out:
        out.write("\n".join(bg_genes))

    ppi_file_name = os.path.join(constants.OUTPUT_GLOBAL_DIR, "ppi_i.txt")
    with open(ppi_file_name, "w+") as out:
        out.write("\n".join(ppi_i))

    return network_file_name, bg_genes, vertices, ppi_file_name, ge_output_path
def prepare_input(gene_expression_file_name="ge.tsv", groups=None):
    """Load the expression table and the sample grouping.

    Args:
        gene_expression_file_name: Name passed to
            ``infra.load_gene_expression_profile_by_genes``.
        groups: Optional sequence of integer group labels, one per condition.
            When omitted, the labels come from ``infra.load_classes()`` if
            ``classes.tsv`` exists in ``constants.DATA_DIR``; otherwise the
            default ``[1, 1, 1, 2, 2, 2]`` is used.

    Returns:
        Tuple ``(conditions, data, genes, groups)`` where ``conditions`` and
        ``groups`` are NumPy arrays and ``data`` is a float ``DataFrame``
        indexed by ``genes`` with ``conditions`` as columns.
    """
    # Previously an explicitly supplied ``groups`` was overwritten by
    # ``infra.load_classes()``; it is now honoured.
    if groups is None:
        if os.path.exists(os.path.join(constants.DATA_DIR, "classes.tsv")):
            groups = infra.load_classes()
        else:
            groups = [1, 1, 1, 2, 2, 2]

    ge_raw = infra.load_gene_expression_profile_by_genes(
        gene_expression_file_name=gene_expression_file_name)
    genes, conditions, data = infra.separate_headers(ge_raw)

    conditions = np.array(conditions)
    # Builtin ``int``/``float`` replace the removed ``np.int``/``np.float``.
    groups = np.array(groups, dtype=int)
    data = pd.DataFrame(data, index=genes, columns=conditions, dtype=float)
    return conditions, data, genes, groups
def calc_ttest(dataset=constants.DATASET_NAME,
               gene_expression_file_name="ge.tsv"):
    """Run a per-row t-test between class 1 and class 2 columns.

    For every row of the expression table, ``ttest_ind`` compares the columns
    labelled 1 with those labelled 2 (labels from ``infra.load_classes()``).
    Rows whose p-value is NaN are reported on stdout and dropped. The
    remaining p-values are corrected with ``fdrcorrection0`` and the table,
    extended with ``pval``, ``qval`` and ``qscore`` (``-log10(qval)``)
    columns, is written to ``deg_t.tsv`` in ``constants.CACHE_DIR``.

    Args:
        dataset: Unused; kept so existing callers keep working.
        gene_expression_file_name: Name passed to
            ``infra.load_gene_expression_profile_by_genes``.

    Returns:
        ``{"result": DataFrame}`` where the frame is ``deg_t.tsv`` read back
        with its first column as index.
    """
    h_rows, h_cols, ge_dataset = infra.separate_headers(
        infra.load_gene_expression_profile_by_genes(
            gene_expression_file_name=gene_expression_file_name))
    classes = np.array(infra.load_classes()).astype(int)
    group1 = classes == 1
    group2 = classes == 2

    pvals = []
    keep = np.ones((len(h_rows),), bool)
    for i, _ in enumerate(list(h_rows)):
        pval = ttest_ind(ge_dataset[i, group1], ge_dataset[i, group2]).pvalue
        if np.isnan(pval):
            print("case: {}, wt: {}".format(ge_dataset[i, group1],
                                            ge_dataset[i, group2]))
            keep[i] = False
        else:
            pvals.append(pval)

    h_rows = h_rows[keep]
    ge_dataset = ge_dataset[keep, :]

    qvals = fdrcorrection0(pvals, alpha=0.05, method='indep',
                           is_sorted=False)[1]
    qscores = [-log10(q) for q in qvals]

    output_h_cols = ["id"] + list(h_cols) + ["pval", "qval", "qscore"]
    output_matrix = np.c_[h_rows, ge_dataset, pvals, qvals, qscores]
    output_matrix = np.r_[
        np.reshape(output_h_cols, (1, len(output_h_cols))), output_matrix]

    output_path = os.path.join(constants.CACHE_DIR, "deg_t.tsv")
    with open(output_path, "w+") as out:
        out.write("\n".join("\t".join(cur) for cur in output_matrix))

    return {"result": pd.read_csv(output_path, sep="\t", index_col=0)}
def main(count_file):
    """Load the default expression table and hand it to ``run_rscript``.

    The table is loaded with ``infra.load_gene_expression_profile_by_genes()``
    (no arguments), converted to an integer ``DataFrame`` and passed to
    ``run_rscript`` together with the fixed grouping ``[1, 1, 1, 2, 2, 2]``.

    Args:
        count_file: Currently unused (the output name formerly derived from
            it was never read); kept so existing callers keep working.

    Returns:
        None. The value returned by ``run_rscript`` is discarded, as before.
    """
    ge_raw = infra.load_gene_expression_profile_by_genes()
    genes, conditions, data = infra.separate_headers(ge_raw)

    conditions = np.array(conditions)
    group = np.array([1, 1, 1, 2, 2, 2])
    # Builtin ``int`` is what the removed ``np.int`` alias pointed to.
    data = pd.DataFrame(data, index=genes, columns=conditions, dtype=int)

    run_rscript(data=data, genes=genes, conditions=conditions, group=group)
def init_specific_params(ge_file_name=os.path.join(constants.DATA_DIR,
                                                   "ge.tsv"),
                         network_file_name=os.path.join(
                             constants.NETWORKS_DIR, NETWORK_NAME + ".sif")):
    """Write a copy of the expression table with ID and name columns in front.

    The table at ``ge_file_name`` is loaded and two leading columns are
    added: ``gene ID`` (the row identifier) and ``GeneName`` (first result of
    ``e2g_convertor`` for that identifier, or NaN when it returns nothing).
    Rows with a repeated ``gene ID`` are dropped, keeping the first. The
    result is saved next to the input as ``<name>_mts.tsv``.

    Args:
        ge_file_name: Path of the expression table.
        network_file_name: Path of the network file; returned unchanged.

    Returns:
        Tuple ``(ge_file_name_mts, network_file_name, output_file_name)``
        where ``output_file_name`` is ``matisse_output.txt`` inside
        ``constants.OUTPUT_DIR``.
    """
    h_rows, h_columns, values = infra.separate_headers(
        infra.load_gene_expression_profile_by_genes(
            gene_expression_file_name=ge_file_name))
    df_ge = pd.DataFrame(columns=h_columns, index=h_rows, data=values)
    df_ge_cond_col = df_ge.columns

    df_ge["gene ID"] = df_ge.index

    # Convert each identifier once instead of once for the length check and
    # again for the value.
    gene_names = []
    for cur in df_ge.index:
        converted = e2g_convertor([cur])
        gene_names.append(converted[0] if len(converted) > 0 else np.nan)
    df_ge["GeneName"] = gene_names

    df_ge = df_ge[["gene ID", "GeneName"] + list(df_ge_cond_col)]
    df_ge = df_ge[~df_ge['gene ID'].duplicated(keep='first')]

    ge_file_name_mts = os.path.splitext(ge_file_name)[0] + "_mts.tsv"
    df_ge.to_csv(ge_file_name_mts, index=False, sep="\t")

    output_file_name = os.path.join(constants.OUTPUT_DIR, "matisse_output.txt")
    return ge_file_name_mts, network_file_name, output_file_name
def init_specific_params(score_file_name, dest_algo_dir):
    """Write the significant-gene list and format the algorithm config file.

    Rows of the table at ``score_file_name`` are ordered by ``qval``. The
    identifiers that precede the first row with ``qval > 0.05`` and for which
    ``ensembl2entrez_convertor`` returns at least one result are written, one
    per line, to ``ge_list.txt`` in ``constants.OUTPUT_DIR``. ``conf.props``
    in ``dest_algo_dir`` is then passed through ``format_script``.

    Args:
        score_file_name: Path of the score table (needs a ``qval`` column).
        dest_algo_dir: Directory containing ``conf.props``.

    Returns:
        The value returned by ``format_script`` for the config file.
    """
    deg = infra.load_gene_expression_profile_by_genes(
        gene_expression_path=score_file_name)
    h_rows, h_cols, deg_data = infra.separate_headers(deg)

    qval_col = np.where(h_cols == "qval")[0][0]
    order = np.argsort(deg_data[:, qval_col])
    deg_data = deg_data[order, :]
    h_rows = h_rows[order]

    # Index of the first row above the threshold. When no row exceeds it,
    # every row is significant; previously this raised IndexError.
    above_threshold = np.where(deg_data[:, qval_col] > 0.05)[0]
    if len(above_threshold) > 0:
        last_q_index = above_threshold[0]
    else:
        last_q_index = len(h_rows)

    ge_list_file_name = os.path.join(constants.OUTPUT_DIR, "ge_list.txt")
    significant = [
        x for x in h_rows[:last_q_index]
        if len(ensembl2entrez_convertor([x])) > 0
    ]
    with open(ge_list_file_name, "w+") as out:
        out.write("\n".join(significant))

    conf_file = "conf.props"
    conf_file_name = format_script(os.path.join(dest_algo_dir, conf_file),
                                   pval_threshold=0.05,
                                   sp_threshold=2,
                                   gene_file=ge_list_file_name)
    return conf_file_name
if __name__ == "__main__":
    # Script entry point: run the pinnaclez shell script, collect the module
    # genes from its results file, write the background and module gene
    # lists, and check the module for enrichment.
    constants.update_dirs(DATASET_NAME_u="TNFa")
    params = get_parameters()
    if params is not None:
        args, NETWORK_NAME, dataset_name = params

    # NOTE(review): the script path ends in ".sh.format", which looks like a
    # mangled ``.format(...)`` call -- confirm the intended command.
    proc = subprocess.Popen("bash ../sh/scripts/run_pinnaclez.sh.format",
                            shell=True,
                            stdout=subprocess.PIPE)
    # communicate() reads all output and also waits for the process to exit.
    print(proc.communicate()[0])  # cwd=dir_path

    results_path = os.path.join(constants.OUTPUT_DIR, "pinnaclez_results.txt")
    with open(results_path) as results_file:
        results = results_file.read().split()
    module_genes = list(set([x for x in results if x.startswith("ENSG")]))

    dip_network = pd.read_csv(os.path.join(constants.NETWORKS_DIR,
                                           "dip_out.sif"),
                              sep="\t",
                              index_col=False,
                              header=None)
    # Columns 0 and 2 are used as the two interactor columns; ``.iloc``
    # replaces the removed ``.ix`` indexer.
    bg_genes = set(dip_network.iloc[:, 0]).union(set(dip_network.iloc[:, 2]))

    # The expression genes are loaded but only referenced by the disabled
    # union below.
    exp_genes, _1, _2 = infra.separate_headers(
        infra.load_gene_expression_profile())
    # bg_genes = list(bg_genes.union(set(exp_genes)))
    bg_genes = list(bg_genes)

    with open(os.path.join(constants.OUTPUT_DIR, "pinnaclez_bg_genes.txt"),
              "w+") as out:
        out.write("\n".join(bg_genes))
    with open(os.path.join(constants.OUTPUT_DIR, "pinnaclez_module_genes.txt"),
              "w+") as out:
        out.write("\n".join(module_genes))

    utils.go.check_group_enrichment(module_genes, bg_genes)
def prepare_input(method=constants.DEG_EDGER, network_name="dip"):
    """Write the network and gene-set input files into ``constants.OUTPUT_DIR``.

    Reads the network ``<network_name>.sif`` from ``constants.NETWORKS_DIR``
    and the table ``deg_<method>.tsv`` from ``constants.CACHE_DIR``, then
    writes:

    * ``A``: symmetric 0/1 adjacency matrix over the network vertices.
    * ``vlist``: per-vertex ``content`` (1-based position), ``weight`` (1)
      and ``degree`` (number of edge rows touching the vertex; a self-loop
      counts once).
    * ``proteins``: the vertex identifiers.
    * ``degs``: vertices whose row precedes the first ``qval > 0.05`` row
      after ordering the table by ``qval``.
    * ``background``: vertices that also appear in the table.
    * ``random_sets``: 10000 random vertex samples, each of ``len(degs)``.
    * ``keypathwayminer_bg_genes.txt``: all vertices.

    Args:
        method: Identifier used to pick the ``deg_<method>.tsv`` table.
        network_name: Base name of the ``.sif`` network file.

    Returns:
        Tuple ``(network_file_name, bg_genes, A_path, degs_path,
        proteins_path, vlist_path, background_path, random_sets_path, 0.05,
        output_path)``.
    """
    output_dir = constants.OUTPUT_DIR
    deg_file_name = os.path.join(constants.CACHE_DIR,
                                 "deg_{}.tsv".format(method))
    network_file_name = os.path.join(constants.NETWORKS_DIR,
                                     "{}.sif".format(network_name))
    network_df = pd.read_csv(network_file_name, sep="\t")
    src = np.array(network_df["ID_interactor_A"])
    dst = np.array(network_df["ID_interactor_B"])
    vertices = list(set(np.append(src, dst)))
    n_vertices = len(vertices)

    # Position lookup built once; ``vertices.index`` scanned the whole list
    # several times per edge.
    vertex_pos = {v: i for i, v in enumerate(vertices)}

    A = np.zeros((n_vertices, n_vertices))
    degree = np.zeros(n_vertices)
    for a, b in zip(src, dst):
        i, j = vertex_pos[a], vertex_pos[b]
        A[i, j] = 1
        A[j, i] = 1
        degree[i] += 1
        if j != i:  # a self-loop contributes a single increment
            degree[j] += 1

    v_list_data = np.c_[np.arange(1, n_vertices + 1),
                        np.ones(n_vertices),
                        degree]
    vlist = pd.DataFrame(v_list_data,
                         index=list(range(n_vertices)),
                         columns=['content', 'weight', 'degree'],
                         dtype=int)

    deg = infra.load_gene_expression_profile_by_genes(
        gene_expression_path=deg_file_name)
    h_rows, h_cols, deg_data = infra.separate_headers(deg)
    qval_col = np.where(h_cols == "qval")[0][0]
    order = np.argsort(deg_data[:, qval_col])
    deg_data = deg_data[order, :]
    h_rows = h_rows[order]

    # First row above the threshold; when none exists every row is
    # significant (previously this raised IndexError).
    above_threshold = np.where(deg_data[:, qval_col] > 0.05)[0]
    if len(above_threshold) > 0:
        sig_last_index = above_threshold[0]
    else:
        sig_last_index = len(h_rows)

    degs = list(set(h_rows[:sig_last_index]).intersection(vertices))
    background = list(set(vertices).intersection(set(h_rows)))
    random_sets = [random.sample(vertices, len(degs)) for x in range(10000)]
    bg_genes = vertices

    # The path already includes OUTPUT_DIR; joining it a second time
    # duplicated the directory whenever OUTPUT_DIR is relative.
    bg_genes_file_name = os.path.join(output_dir,
                                      "keypathwayminer_bg_genes.txt")
    a_path = os.path.join(output_dir, "A")
    degs_path = os.path.join(output_dir, "degs")
    proteins_path = os.path.join(output_dir, "proteins")
    vlist_path = os.path.join(output_dir, "vlist")
    background_path = os.path.join(output_dir, "background")
    random_sets_path = os.path.join(output_dir, "random_sets")

    with open(bg_genes_file_name, "w+") as out:
        out.write("\n".join(bg_genes))
    with open(a_path, "w+") as out:
        out.write("\n".join(["\t".join([str(y) for y in x]) for x in A]))
    with open(degs_path, "w+") as out:
        out.write("\n".join(degs))
    with open(proteins_path, "w+") as out:
        out.write("\n".join(vertices))
    with open(vlist_path, "w+") as out:
        out.write("\t".join(vlist.columns) + "\n" + "\n".join(
            ["\t".join([str(y) for y in x]) for x in vlist.values]))
    with open(background_path, "w+") as out:
        out.write("\n".join(background))
    with open(random_sets_path, "w+") as out:
        out.write("\n".join(["\t".join(x) for x in random_sets]))

    return network_file_name, bg_genes, \
        a_path, degs_path, \
        proteins_path, vlist_path, \
        background_path, random_sets_path, \
        0.05, os.path.join(output_dir, "moduledicoverer_output.txt")