def main(): list_of_files_format = ["{ds}-{rg}/exp_seq.{ds}-{rg}.tsv.gz"] constants.ALL_CANCER_TYPES = ["PRAD", "LICA", "RECA", "LIRI", "BRCA", "OV", "PACA", "PACA", "PAEN"] # ["KIRC", "KIRP", "LUSC", "LUAD", "COAD", "BRCA", "STAD", "LIHC", "READ", "PRAD", "BLCA", "HNSC", "THCA", "UCEC", "OV", "PAAD"] all_regions = ["FR", "FR", "EU", "JP", "KR", "AU", "AU", "CA", "AU"] for cur, cur_rg in zip(constants.ALL_CANCER_TYPES, all_regions): if cur == "PANCAN": continue constants.update_dirs(DATASET_NAME_u="ICGC_{}_{}".format(cur, cur_rg)) if not os.path.exists(constants.DATA_DIR): os.makedirs(constants.DATA_DIR) print "fetching data for {} ({}\{})".format(cur,constants.ALL_CANCER_TYPES.index(cur), len(constants.ALL_CANCER_TYPES)) list_of_files = [fr.format(ds=cur, rg=cur_rg) for fr in list_of_files_format] for cur_file_name in list_of_files: if not os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.format(cur))) and not os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split(".")[:-1]))): # run_and_printchar(["wget", "https://gdc.xenahubs.net/download/TCGA-{}/Xena_Matrices/{}".format(cur, cur_file_name), constants.TCGA_DATA_DIR]) download("https://dcc.icgc.org/api/v1/download?fn=/current/Projects/"+ cur_file_name, constants.DATA_DIR) print "extract data for {}".format(cur) for cur_file_name in list_of_files: if os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) and not os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1]))): with gzip.open(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1]), 'rb') as f_in: with open(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1])), 'wb') as f_out: print os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1]) shutil.copyfileobj(f_in, f_out) print "delete redundant gz files {}".format(cur) for cur_file_name in list_of_files: if os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) and os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1]))): os.remove(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) if not os.path.exists(constants.OUTPUT_DIR): os.makedirs(constants.OUTPUT_DIR) if not os.path.exists(constants.CACHE_DIR): os.makedirs(constants.CACHE_DIR)
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None,
         score_method=constants.DEG_EDGER, network_file_name="dip.sif"):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(
        network_file_name, score_method)
    heat_file_name, network_file_name = init_specific_params(
        score_file_name, score_method, network_file_name)
    script_file_name = format_script(os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)),
                                     ALGO_DIR=ALGO_DIR,
                                     CACHE_DIR=constants.CACHE_DIR,
                                     OUTPUT_DIR=constants.OUTPUT_DIR,
                                     NETWORK_NAME=os.path.splitext(os.path.basename(network_file_name))[0])
    print subprocess.Popen("bash {}".format(script_file_name), shell=True,
                           stdout=subprocess.PIPE).stdout.read()  # cwd=dir_path
    os.remove(script_file_name)
    modules, all_bg_genes = extract_modules_and_bg(bg_genes)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules, all_bg_genes,
                                            score_file_name, network_file_name, disease_name,
                                            expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
def aggregate_disease(DATASET_NAME):
    counter = 1
    report_all = pd.DataFrame()
    report_k = pd.DataFrame()
    while True:
        constants.update_dirs(DATASET_NAME_u=DATASET_NAME + "_{}".format(counter))
        if not os.path.exists(constants.DATASET_DIR):
            break
        reports_metadata = {
            "all": aggregate_all(os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME),
                                 "all_modules_disease"),
            "k": aggregate_all(os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME),
                               "k_{}_modules_disease".format(network.MODULE_TH))
        }
        report_all = pd.concat([report_all, reports_metadata["all"][0]])
        report_k = pd.concat([report_k, reports_metadata["k"][0]])
        counter += 1
    # report_all = report_all.set_index("algo")
    # report_k = report_k.set_index("algo")
    diseases_summary_headers = [
        "TP_mean", "TP_std", "TP+FN_mean", "TP+FN_std", "TP+TN_mean", "TP+TN_std",
        "TP/(TP+TN)_mean", "TP/(TP+TN)_std", "TP/(TP+FN)_mean", "TP/(TP+FN)_std",
        "F1_mean", "F1_std"
    ]
    diseases_headers = [
        'disease_name', 'TP', 'TP+FN_(_true_)', 'TP+TN_(_retrieved_)',
        'TP/(TP+FN)_(_recall_)', 'TP/(TP+TN)_(_precision_)', 'F1',
        'module_size_avg', 'module_size_std', 'num_of_modules'
    ]
    report_all = report_all[diseases_headers]
    report_k = report_k[diseases_headers]
    df_summary_all = calc_p_r_f_scores(report_all)[diseases_summary_headers]
    df_summary_k = calc_p_r_f_scores(report_k)[diseases_summary_headers]
    for k, v in {"all": [report_all, df_summary_all],
                 "k_{}".format(network.MODULE_TH): [report_k, df_summary_k]}.iteritems():
        format_script(os.path.join(constants.TEMPLATES_DIR, "report.html"),
                      REPORT=[], SCORE=[],
                      DISEASE_GENES=json.dumps(to_full_list(v[0], "algo")),
                      DISEASE_GENES_SUMMARY=json.dumps(to_full_list(v[1], "algo")),
                      MODULE_FILTER=(k + "_modules"),
                      MODULES_SCORE=[], EMB_WU=[])
        output_dir = os.path.join(constants.OUTPUT_GLOBAL_DIR, DATASET_NAME)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        shutil.move(os.path.join(constants.TEMPLATES_DIR, "report.html"),
                    os.path.join(output_dir, "report_{}.html".format(k)))
def main(): list_of_files_format = ["TCGA.{ds}.sampleMap/HiSeqV2.gz","TCGA.{ds}.sampleMap/{ds}_clinicalMatrix.gz","TCGA.{ds}.sampleMap/miRNA_HiSeq_gene.gz"] constants.ALL_CANCER_TYPES = ["PAAD", "OV"] # ["ESCA", "KIRC", "KIRP", "KICH", "LUSC", "LUAD", "COAD", "BRCA", "STAD", "LIHC", "READ", "PRAD", "BLCA", "HNSC", "THCA", "UCEC"] for cur in constants.ALL_CANCER_TYPES: if cur == "PANCAN": continue constants.update_dirs(DATASET_NAME_u="TCGA_"+cur) if not os.path.exists(constants.DATA_DIR): os.makedirs(constants.DATA_DIR) print "fetching data for {} ({}\{})".format(cur,constants.ALL_CANCER_TYPES.index(cur), len(constants.ALL_CANCER_TYPES)) list_of_files = [fr.format(ds=cur) for fr in list_of_files_format] for cur_file_name in list_of_files: if not os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.format(cur))) and not os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split(".")[:-1]))): # run_and_printchar(["wget", "https://gdc.xenahubs.net/download/TCGA-{}/Xena_Matrices/{}".format(cur, cur_file_name), constants.TCGA_DATA_DIR]) download("https://tcga.xenahubs.net/download/"+ cur_file_name, constants.DATA_DIR) print "extract data for {}".format(cur) for cur_file_name in list_of_files: if os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) and not os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1]))): with gzip.open(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1]), 'rb') as f_in: with open(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1])), 'wb') as f_out: shutil.copyfileobj(f_in, f_out) print "delete redundant gz files {}".format(cur) for cur_file_name in list_of_files: if os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) and os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1]))): os.remove(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) if not os.path.exists(constants.OUTPUT_DIR): os.makedirs(constants.OUTPUT_DIR) if not os.path.exists(constants.CACHE_DIR): os.makedirs(constants.CACHE_DIR)
def main(): list_of_files_format = ["TCGA-{}.htseq_counts.tsv.gz","TCGA-{}.htseq_fpkm.tsv.gz","TCGA-{}.htseq_fpkm-uq.tsv.gz","TCGA-{}.GDC_phenotype.tsv.gz","TCGA-{}.survival.tsv.gz","TCGA-{}.mutect2_snv.tsv.gz", "TCGA-{}.mirna.tsv.gz"] for cur in constants.ALL_CANCER_TYPES: if cur == "PANCAN": continue constants.update_dirs(CANCER_TYPE_u=cur) if not os.path.exists(constants.TCGA_DATA_DIR): os.makedirs(constants.TCGA_DATA_DIR) print "fetching data for {} ({}\{})".format(cur,constants.ALL_CANCER_TYPES.index(cur), len(constants.ALL_CANCER_TYPES)) list_of_files = [fr.format(cur) for fr in list_of_files_format] for cur_file_name in list_of_files: if not os.path.exists(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur))) and not os.path.exists(os.path.join(constants.TCGA_DATA_DIR,".".join(cur_file_name.format(cur).split(".")[:-1]))): # run_and_printchar(["wget", "https://gdc.xenahubs.net/download/TCGA-{}/Xena_Matrices/{}".format(cur, cur_file_name), constants.TCGA_DATA_DIR]) download("https://gdc.xenahubs.net/download/TCGA-{}/Xena_Matrices/{}".format(cur, cur_file_name), constants.TCGA_DATA_DIR) print "extract data for {}".format(cur) for cur_file_name in list_of_files: if os.path.exists(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur))) and not os.path.exists(os.path.join(constants.TCGA_DATA_DIR,".".join(cur_file_name.format(cur).split(".")[:-1]))): with gzip.open(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur)), 'rb') as f_in: with open(os.path.join(constants.TCGA_DATA_DIR,".".join(cur_file_name.format(cur).split(".")[:-1])), 'wb') as f_out: shutil.copyfileobj(f_in, f_out) print "delete redundant gz files {}".format(cur) for cur_file_name in list_of_files: if os.path.exists(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur))) and os.path.exists(os.path.join(constants.TCGA_DATA_DIR,".".join(cur_file_name.format(cur).split(".")[:-1]))): os.remove(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur))) if not os.path.exists(constants.OUTPUT_DIR): os.makedirs(constants.OUTPUT_DIR) if not os.path.exists(constants.CACHE_DIR): os.makedirs(constants.CACHE_DIR) # main()
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None,
         score_method=constants.DEG_EDGER, network_file_name="dip.sif"):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    search_method = "sa"
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(network_file_name, score_method)
    results_file_name = init_specific_params(search_method)
    script_file_name = format_script(os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)),
                                     BASE_FOLDER=constants.BASE_PROFILE,
                                     DATASET_DIR=constants.DATASET_DIR,
                                     ALGO_DIR=ALGO_DIR,
                                     NETWORK_NAME=network_file_name,
                                     SCORE_FILE_NAME=score_file_name,
                                     IS_GREEDY=str(search_method == "greedy"),
                                     OUTPUT_FILE=results_file_name,
                                     NUM_OF_MODULES=10,
                                     OVERLAP_THRESHOLD=0)
    subprocess.Popen("bash {}".format(script_file_name), shell=True,
                     stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()
    os.remove(script_file_name)
    modules_genes_file_name = os.path.join(constants.OUTPUT_DIR, "{}_{}_module_genes.txt".format(ALGO_NAME, search_method))
    all_bg_genes, modules = extract_modules_and_bg(bg_genes, results_file_name, modules_genes_file_name)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME + "_" + search_method, dataset_name, modules,
                                            all_bg_genes, score_file_name, network_file_name,
                                            disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR, "{}_{}_client_output.txt".format(ALGO_NAME, search_method))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None,
         score_method=constants.DEG_EDGER, network_file_name="dip.sif", fdr=0.05):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(network_file_name, score_method)
    all_bg_genes, modules = run_bionet_for_all_modules(fdr, network_file_name, score_file_name, constants.IS_PVAL_SCORES)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules, all_bg_genes,
                                            score_file_name, network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(NETWORK_NAME)
    STRATEGY = "INES"
    algorithm = "OPTIMAL"
    omitted_genes = []
    modules = []
    all_bg_genes = []
    cur_network_name = NETWORK_NAME
    for cur_i_module in range(40):
        binary_score_file_name, cur_network_file_name = init_specific_params(
            score_file_name, score_method, omitted_genes, network_file_name, str(cur_i_module))
        format_scripts(algo_name=ALGO_NAME, score_file_name=binary_score_file_name,
                       network_name=cur_network_file_name, STRATEGY=STRATEGY, algorithm=algorithm)
        print subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME),
                               shell=True, stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()
        module, all_bg_gene = extract_module_genes(bg_genes, STRATEGY, algorithm)
        if len(module[0]) > 3:
            modules.append(module[0])
            all_bg_genes.append(all_bg_gene[0])
        omitted_genes += list(module[0])
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME + "_" + STRATEGY + "_" + algorithm,
                                            modules, all_bg_genes, score_file_name,
                                            network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_{}_{}_client_output.txt".format(ALGO_NAME, STRATEGY, algorithm))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(NETWORK_NAME)
    STRATEGY = "GLONE"
    binary_score_file_name = init_common_params(score_file_name, score_method)
    format_scripts(algo_name=ALGO_NAME, score_file_name=binary_score_file_name,
                   network_name=NETWORK_NAME, STRATEGY=STRATEGY)
    print subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME),
                           shell=True, stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()
    modules, all_bg_genes = extract_module_genes(bg_genes, STRATEGY)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME + "_" + STRATEGY, modules, all_bg_genes,
                                            score_file_name, network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None,
         score_method=constants.DEG_EDGER, network_file_name="dip.sif"):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(network_file_name, score_method)
    strategy = "INES"
    algorithm = "GREEDY"
    omitted_genes = []
    modules = []
    all_bg_genes = []
    dest_algo_dir = "{}_{}".format(ALGO_DIR, random.random())
    shutil.copytree(ALGO_DIR, dest_algo_dir)
    empty_counter = 0
    for cur_i_module in range(40):
        binary_score_file_name, cur_network_file_name = init_specific_params(
            score_file_name, score_method, omitted_genes, network_file_name, str(random.random()), dest_algo_dir)
        script_file_name = format_scripts(score_file_name=binary_score_file_name,
                                          network_name=cur_network_file_name,
                                          STRATEGY=strategy, algorithm=algorithm,
                                          algo_dir=dest_algo_dir, dataset_name=dataset_name)
        print subprocess.Popen("bash {}".format(script_file_name), shell=True,
                               stdout=subprocess.PIPE, cwd=dest_algo_dir).stdout.read()
        module, all_bg_gene = extract_module_genes(bg_genes, strategy, algorithm, dest_algo_dir)
        if len(module[0]) > 3:
            empty_counter = 0
            modules.append(module[0])
            all_bg_genes.append(all_bg_gene[0])
        else:
            empty_counter += 1
        omitted_genes += list(module[0])
        os.remove(script_file_name)
        if empty_counter > 3:
            print "got more than 3 small modules in a row. stopping..."
            break
    shutil.rmtree(dest_algo_dir)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports("{}_{}_{}".format(ALGO_NAME, strategy, algorithm),
                                            dataset_name, modules, all_bg_genes, score_file_name,
                                            network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format("{}_{}_{}".format(ALGO_NAME, strategy, algorithm)))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None, score_method=constants.DEG_EDGER):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(NETWORK_NAME, score_method)
    ge_file_name, network_file_name, output_file_name = init_specific_params(
        ge_file_name=os.path.join(constants.DATA_DIR, "ge.tsv"),
        network_file_name=os.path.join(constants.NETWORKS_DIR, NETWORK_NAME + ".sif"))
    format_script(os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)),
                  ALGO_BASE_DIR=constants.ALGO_BASE_DIR,
                  GE_FILE_NAME=ge_file_name,
                  NETWORK_FILE_NAME=network_file_name,
                  BETA=0.95,
                  MINIMAL_MODULE_SIZE=4,
                  MAXIMAL_MODULE_SIZE=1000,
                  OUTPUT_FILE_NAME=output_file_name)
    subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME),
                     shell=True, stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()
    modules, all_bg_genes = extract_modules_and_bg(bg_genes, output_file_name)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, modules, all_bg_genes, score_file_name,
                                            network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None, score_method=constants.DEG_EDGER):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(NETWORK_NAME, score_method)
    if score_method == constants.PREDEFINED_SCORE:
        raise Exception("Cannot run this algo on score-based metrics. Please provide a gene expression file")
    bg_genes, network_file_name = init_specific_params(NETWORK_NAME)
    format_script(os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)),
                  BASE_FOLDER=constants.BASE_PROFILE,
                  DATASET_DIR=constants.DATASET_DIR,
                  ALGO_DIR=ALGO_DIR,
                  NETWORK_NAME=NETWORK_NAME)
    subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME),
                     shell=True, stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()
    modules, all_bg_genes = extract_modules_and_bg(bg_genes)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, modules, all_bg_genes, score_file_name,
                                            network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None,
         score_method=constants.DEG_EDGER, network_file_name="dip.sif"):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(network_file_name, score_method)
    script_name = "run_{}.sh".format(ALGO_NAME)
    dest_algo_dir = "{}_{}".format(ALGO_DIR, random.random())
    shutil.copytree(ALGO_DIR, dest_algo_dir)
    conf_file_name = init_specific_params(score_file_name, dest_algo_dir)
    script_file_name = format_script(os.path.join(constants.SH_DIR, script_name),
                                     BASE_FOLDER=constants.BASE_PROFILE,
                                     DATASET_DIR=constants.DATASET_DIR,
                                     CONFIG_FILE_NAME=conf_file_name,
                                     NETBOX_DIR=dest_algo_dir)
    print subprocess.Popen("bash {}".format(script_file_name), shell=True,
                           stdout=subprocess.PIPE, cwd=dest_algo_dir).stdout.read()
    modules, all_bg_genes = extract_modules_and_bg(bg_genes, dest_algo_dir)
    os.remove(script_file_name)
    os.remove(conf_file_name)
    shutil.rmtree(dest_algo_dir)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules, all_bg_genes,
                                            score_file_name, network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format(ALGO_NAME))
import pandas as pd
import constants
import os

constants.update_dirs(DATASET_NAME_u="IES")

df_ge = pd.read_csv(os.path.join(constants.DATA_DIR, "ge_mouse.tsv"), sep="\t")
df_ge = df_ge.set_index("id")
df_ge = df_ge[~df_ge.index.duplicated(keep='first')]

df_mouse2human = pd.read_csv(os.path.join(constants.DICTIONARIES_DIR, "mouse2human.txt"), sep="\t")
df_mouse2human = df_mouse2human.set_index("Mouse gene stable ID")
df_mouse2human = df_mouse2human[~df_mouse2human.index.duplicated(keep='first')]

df_converted_ge = pd.concat([df_ge, df_mouse2human], join="inner", axis=1)
df_converted_ge = df_converted_ge.set_index("Gene stable ID")
df_converted_ge = df_converted_ge[~df_converted_ge.index.duplicated(keep="first")]
df_converted_ge.to_csv(os.path.join(constants.DATA_DIR, "ge.tsv"), sep="\t", index_label="id")
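# Optional sanity check of the conversion (illustrative only): how many mouse
# ids survived the inner join with the mouse-to-human dictionary.
print "mouse genes in ge_mouse.tsv: {}".format(len(df_ge.index))
print "converted human genes in ge.tsv: {}".format(len(df_converted_ge.index))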
def __init__(self, dataset_names, meta_groups_files, metagroups_names):
    self.labels = np.array([])
    self.labels_unique = np.array([])
    self.samples = pd.DataFrame()
    self.survival = pd.DataFrame()
    label_counter = 0
    all_genes = np.array([])
    for dataset_name, meta_groups_file, metagroups_name in zip(dataset_names, meta_groups_files, metagroups_names):
        constants.update_dirs(DATASET_NAME_u=dataset_name)
        meta_groups = [json.load(file(meta_groups_file))]
        data_normalizaton = "rsem"
        gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = param_builder_func(
            dataset=dataset_name, data_normalizaton=data_normalizaton)
        # gene_expression_file_name = mirna_file_name
        tested_gene_list_file_name = "protein_coding.txt"  # "mir_total.txt"
        total_gene_list_file_name = "protein_coding.txt"  # "mir_total.txt"
        filter_expression = None
        print gene_expression_file_name
        data = infra.load_integrated_ge_data(tested_gene_list_file_name=tested_gene_list_file_name,
                                             total_gene_list_file_name=total_gene_list_file_name,
                                             gene_expression_file_name=gene_expression_file_name,
                                             phenotype_file_name=phenotype_file_name,
                                             survival_file_name=survival_file_name,
                                             var_th_index=None, meta_groups=meta_groups,
                                             filter_expression=filter_expression)
        gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns, labels_assignment, survival_dataset = data
        all_genes = np.append(all_genes, gene_expression_top_var_headers_columns)
        if survival_dataset is not None:
            self.survival = pd.concat([self.survival,
                                       pd.DataFrame(survival_dataset[1:, 1:], index=survival_dataset[1:, 0])])
        # self.labels_unique = np.array([x['_name'] for x in meta_groups[0]])
        labels_assignment = np.array(labels_assignment)[0]
        for cur_label in np.unique(labels_assignment):
            cur_label_name = [cur["_name"] for cur in meta_groups[0]
                              if "_label" in cur and int(cur["_label"]) == cur_label]
            cur_label_name = "{}, {}".format(metagroups_name,
                                             cur_label_name[0] if len(cur_label_name) > 0 else "unknown")
            print metagroups_name
            # cur_label_name = "{}".format(cur_label_name[0] if len(cur_label_name) > 0 else "unknown")
            # print cur_label_name
            # if "unknown" in cur_label_name: continue
            df_new = pd.DataFrame(data=gene_expression_top_var[labels_assignment == cur_label],
                                  index=gene_expression_top_var_headers_rows[labels_assignment == cur_label],
                                  columns=gene_expression_top_var_headers_columns)
            self.samples = pd.concat([self.samples, df_new], axis=0)
            # print "number of nan samples: {}".format(np.sum(np.isnan(np.sum(self.samples.values, axis=0))))
            # print "shape size: {}".format(df_new.shape)
            all_genes = np.unique(np.append(all_genes, df_new.columns.values))
            # print "number of all genes: {}. current: {}".format(all_genes.shape[0], df_new.columns.values.shape[0])
            # print "current nan cols: {}".format(self.samples.columns[np.isnan(np.sum(self.samples.values, axis=0))])
            # print "current filtered shape: {}".format(self.samples.dropna(axis=1).shape)
            self.labels = np.append(self.labels, [cur_label_name for x in range(len(df_new.index))])
            self.labels_unique = np.append(self.labels_unique, [cur_label_name])
            label_counter += 1
    # print "all genes: {}".format(len(set(all_genes)))
    var_th_index = n_input_layer - 1
    if var_th_index is not None:
        print "filtering top vars"
        self.samples = self.samples.dropna(axis=1)
        gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns = infra.filter_top_var_genes(
            self.samples.values.T, self.samples.index.values, self.samples.columns, var_th_index)
        self.samples = pd.DataFrame(data=gene_expression_top_var,
                                    index=gene_expression_top_var_headers_rows,
                                    columns=gene_expression_top_var_headers_columns).T
    self.samples = self.samples.divide(self.samples.max(axis=1), axis=0)
    # self.samples = self.samples / self.samples.max()
    print "total shape: {}".format(self.samples.shape)
def main(dataset, cur_json, ds_types="GDC"): constants.update_dirs(DATASET_NAME_u=dataset) meta_groups = None filter_expression = None meta_groups = [json.load(file("../filters/{}.json".format(cur_json)))] filter_expression = json.load(file("../filters/{}.json".format(cur_json))) gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_params( type=ds_types, dataset=constants.DATASET_NAME, data_normalizaton=data_normalizaton) gene_expression_normalized_file_name = "ge_normalized.tsv" # gene_expression_file_name survival_file_name = "none" tested_gene_expression, h_rows, h_cols, labels_assignment, survival_dataset = infra.load_integrated_ge_data( "dip_bg.txt", "dip_bg.txt", gene_expression_file_name, survival_file_name, phenotype_file_name, gene_filter_file_name=None, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=None) file(os.path.join(constants.DATA_DIR, "classes.tsv"), 'w+').write('\t'.join([str(x) for x in labels_assignment[0]])) h_cols = [x.split('.')[0] for x in h_cols] df_data = pd.DataFrame(index=h_rows, columns=h_cols, data=tested_gene_expression).T df_data.to_csv(os.path.join(constants.DATA_DIR, "ge.tsv"), index_label="eid", sep="\t") var_th_index = None if not use_algo_cache: run_dataset(dataset, score_method=deg_method, algos=algos) # exit(0) gene_list_file_names = [] prs = pd.DataFrame(columns=[ 'algo', 'algo_pr', 'kmean_pr_avg', 'kmean_pr_std', 'algo_kmean_ratio', 'top_sig_pr', "algo_top_sig_ratio", 'rand_pr_avg', 'rand_pr_std', "algo_rand_ratio", 'algo_pr_rank_from_rand', "num of modules", 'num_of_genes' ]) for cur_algo in algos: print "about to start running {}".format(cur_algo) if not os.path.exists( os.path.join(constants.OUTPUT_GLOBAL_DIR, 'pca', dataset, cur_algo)): os.makedirs( os.path.join(constants.OUTPUT_GLOBAL_DIR, 'pca', dataset, cur_algo)) algo_output = json.loads( file( os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format( cur_algo))).read().split("\n")[1]) module_i = 0 algo_pvals = [] df_mean = pd.DataFrame() gene_2_module = {} num_of_genes = 0 algo_genes_flatted = [] while True: module_genes_flatted = [ x['eid'] for x in algo_output if module_i in x['modules'] ] algo_genes_flatted += module_genes_flatted num_of_genes += len(module_genes_flatted) print "# of genes in module {} : {}".format( module_i, len(module_genes_flatted)) for cur in module_genes_flatted: gene_2_module[cur] = module_i algo_genes_flatted += module_genes_flatted if len(module_genes_flatted) == 0 and module_i > 0: break if len(module_genes_flatted) < 4 or sum( df_data.index.isin(module_genes_flatted)) == 0: module_i += 1 continue gene_list_file_names.append( os.path.join(constants.LIST_DIR, cur_algo + ".txt")) file(gene_list_file_names[-1], 'w+').write("\n".join(module_genes_flatted)) df_mean = pd.concat( (df_mean, pd.DataFrame(zscore( df_data[df_data.index.isin(module_genes_flatted)], axis=1).mean(axis=0).reshape(1, len(df_data.columns)), columns=df_data.columns))) module_i += 1 module_i = len(df_mean.index) if module_i < 2: print "not enough modules. 
retrieved {}".format(module_i) continue mean_file_name = os.path.join(constants.DATA_DIR, "mean.tsv") df_mean.index = np.arange(df_mean.shape[0]) df_mean.to_csv(mean_file_name, sep="\t", index_label="eid") index_file_name = os.path.join(constants.LIST_DIR, "{}_indices.txt".format(cur_algo)) file(index_file_name, 'w+').write('\n'.join([str(x) for x in df_mean.index.values])) algo_genes_file_name = os.path.join( constants.LIST_DIR, "{}_all_genes.txt".format(cur_algo)) file(algo_genes_file_name, 'w+').write('\n'.join([str(x) for x in algo_genes_flatted])) df_algo_gene_matrix = df_data[df_data.index.isin(algo_genes_flatted)] results = plot_detailed_pca( tested_gene_list_file_name="{}_indices.txt".format(cur_algo), total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=mean_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=var_th_index, algo_name=cur_algo, plot_svm=plot_svm) if results is None: continue X, y, algo_pr, algo_roc = results print "results for mean: {}".format(algo_pr) algo_bg_pr_mean = 0 algo_bg_pr_std = 0 if KMEANS_TIMES > 1: all_algo_bg_pr = [] for kmean_i in range(KMEANS_TIMES): _1, clusters, _2 = kmeanssample(X=df_algo_gene_matrix.values, k=module_i, metric="euclidean") bg_modules = [ df_algo_gene_matrix.index.values[clusters == cur_i] for cur_i in range(module_i) ] df_mean_bg = pd.DataFrame() for i, module in enumerate(bg_modules): print "# of genes in background module {} : {}".format( i, len(module)) gene_list_file_names.append( os.path.join(constants.LIST_DIR, cur_algo + "_bg.txt")) file(gene_list_file_names[-1], 'w+').write("\n".join(module_genes_flatted)) df_mean_bg = pd.concat( (df_mean_bg, pd.DataFrame( zscore(df_data[df_data.index.isin(module)], axis=1).mean(axis=0).reshape( 1, len(df_data.columns)), columns=df_data.columns))) bg_mean_file_name = os.path.join(constants.DATA_DIR, "mean_bg.tsv") df_mean_bg.index = np.arange(df_mean_bg.shape[0]) df_mean_bg.to_csv(bg_mean_file_name, sep="\t", index_label="eid") index_file_name = os.path.join( constants.LIST_DIR, "{}_bg_indices.txt".format(cur_algo)) file(index_file_name, 'w+').write('\n'.join( [str(x) for x in df_mean_bg.index.values])) all_genes_file_name = os.path.join( constants.LIST_DIR, "{}_all_genes.txt".format(cur_algo)) file(all_genes_file_name, 'w+').write('\n'.join(algo_genes_flatted)) bg_genes = pd.read_csv( os.path.join(constants.CACHE_DIR, deg_file_name.format(deg_method)), sep='\t', index_col=0).index.values[:len(df_mean.index)] bg_genes_file_name = os.path.join( constants.LIST_DIR, "{}_{}_bg_genes.txt".format(cur_algo, deg_method)) file(bg_genes_file_name, 'w+').write('\n'.join([x.split('.')[0] for x in bg_genes])) X, y, algo_bg_pr, algo_bg_roc = plot_detailed_pca( tested_gene_list_file_name="{}_bg_indices.txt".format( cur_algo), total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=bg_mean_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=var_th_index, algo_name=cur_algo, plot_svm=plot_svm) print "results for mean: {}".format(algo_bg_pr) all_algo_bg_pr.append(algo_bg_pr) all_algo_bg_pr = np.array(all_algo_bg_pr) algo_bg_pr_mean = all_algo_bg_pr.mean() algo_bg_pr_std = all_algo_bg_pr.mean() top_sig_pr = 0 if TOP_SIG: top_sig_genes = pd.read_csv(os.path.join(constants.CACHE_DIR, deg_file_name), sep='\t', index_col=0) top_sig_genes = 
top_sig_genes.index.values[:len( top_sig_genes.index) / 200] # len(df_mean.index) top_sig_genes_file_name = os.path.join( constants.LIST_DIR, "{}_{}_top_sig_genes.txt".format(cur_algo, deg_method)) file(top_sig_genes_file_name, 'w+').write('\n'.join( [x.split('.')[0] for x in top_sig_genes])) X, y, top_sig_pr, top_sig_roc = plot_detailed_pca( tested_gene_list_file_name=os.path.basename( top_sig_genes_file_name), total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=gene_expression_normalized_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=var_th_index, algo_name=cur_algo, plot_svm=plot_svm) print "results for top: {}".format(top_sig_pr) rand_prs_mean = 0 rand_prs_std = 0 rand_prs = [] trials = 0 if RAND_TIMES > 1: while trials < RAND_TIMES: random_set_file_name = generate_random_set( random_size=len(df_mean.index), # df_mean.index meta_gene_set="dip_bg.txt".format(cur_algo)) print "running {} iteration for {} random bg with {} genes".format( trials, cur_algo, len(df_mean.index)) results = plot_detailed_pca( tested_gene_list_file_name=random_set_file_name, total_gene_list_file_name="protein_coding.txt", gene_expression_file_name= gene_expression_normalized_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=var_th_index, feature_names=gene_2_module, algo_name=cur_algo, plot_svm=plot_svm) if results is None: print "not enough genes retrieved. retry.." continue X, y, rand_pr, rand_roc = results trials += 1 rand_prs.append(rand_pr) print "results for random {}: {}".format(trials, rand_pr) rand_prs = np.array(rand_prs) rand_prs_mean = rand_prs.mean() rand_prs_std = rand_prs.std() row = { 'algo': cur_algo, 'algo_pr': algo_pr, 'kmean_pr_avg': algo_bg_pr_mean, 'kmean_pr_std': algo_bg_pr_std, 'algo_kmean_ratio': algo_pr / algo_bg_pr_mean, 'top_sig_pr': top_sig_pr, "algo_top_sig_ratio": algo_pr / top_sig_pr, 'rand_pr_mean': rand_prs_mean, 'rand_pr_std': rand_prs_std, "algo_rand_ratio": algo_pr / rand_pr, 'algo_pr_rank_from_rand': len([cur for cur in rand_prs if cur > algo_pr]), "num of modules": module_i, 'num_of_genes': num_of_genes } row.update({'rand_pr' + str(i): v for i, v in enumerate(rand_prs)}) prs = prs.append(row, ignore_index=True) prs = prs.set_index('algo') prs.to_csv(os.path.join(constants.OUTPUT_GLOBAL_DIR, 'pca', constants.DATASET_NAME, "pr_summary_{}_{}.tsv".format(dataset, cur_json)), sep='\t') print " algo pvals" print algo_pvals
# datasets = ["GWAS_fasting_insulin", "GWAS_2hr_glucose", "GWAS_adhd", "GWAS_alzheimers", "GWAS_anorexia", # "GWAS_autism", "GWAS_beta-cell_function", "GWAS_bipolar_disorder", "GWAS_blood_pressure_systolic", # "GWAS_body_mass_index", "GWAS_coronary_artery_disease", "GWAS_crohns_disease", "GWAS_cross_disorder"] datasets = [ name for name in os.listdir(constants.OUTPUT_GLOBAL_DIR) if os.path.isdir(os.path.join(constants.OUTPUT_GLOBAL_DIR, name)) and name.startswith("GWAS_random") ] # and not name.startswith("GWAS_random") and not name.startswith("GWAS_cancer") # datasets = ["TNFa_2", "MCF7_2", "SOC", "HC12", "IEM", "IES"] # datasets=['GWAS_schizophrenia'] all_embs = np.array([]) fig, ax = plt.subplots(figsize=(15, 5)) for cur_ds in datasets: print "current ds: {}".format(cur_ds) constants.update_dirs(DATASET_NAME_u=cur_ds) root_path = os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME) all_algo_modules = {} for name in os.listdir(root_path): if os.path.isdir(os.path.join(root_path, name)) and name not in [ "data", "cache", "output" ]: modules_summary = pd.read_csv(os.path.join( root_path, name, "modules_summary.tsv"), sep="\t") if len(modules_summary.index) == 0: continue modules_summary = modules_summary.set_index("module") all_algo_modules[name] = np.array(modules_summary.index)
def main(datasets, algos):
    colormap = cm.rainbow
    colorlist = [ml_colors.rgb2hex(colormap(i))
                 for i in np.array(list(range(len(algos)))) / float(len(algos) - 1)]
    df_matrix = pd.DataFrame()
    df_summary = pd.DataFrame()
    for cur_ds in datasets:
        constants.update_dirs(DATASET_NAME_u=cur_ds)
        total_num_genes = []
        avg_num_genes = []
        std_num_genes = []
        algos_signals = []
        algo_go_sims = []
        for i_algo, cur_algo in enumerate(algos):
            print "current aggregation: {}, {}".format(cur_ds, cur_algo)
            try:
                total_num_genes.append(pd.read_csv(
                    os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, cur_algo,
                                 "all_modules_general.tsv"), sep="\t")["total_num_genes"][0])
                avg_num_genes.append(pd.read_csv(
                    os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, cur_algo,
                                 "modules_summary.tsv"), sep="\t")["#_genes"].mean())
                std_num_genes.append(pd.read_csv(
                    os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, cur_algo,
                                 "modules_summary.tsv"), sep="\t")["#_genes"].std())
            except:
                print "no genes were found for: {}, {}".format(cur_ds, cur_algo)
                # keep all three lists the same length so the zip below stays aligned
                total_num_genes.append(0)
                avg_num_genes.append(0)
                std_num_genes.append(0)
            algos_signals.append(float(file(os.path.join(
                constants.OUTPUT_GLOBAL_DIR, "emp_fdr", "ds_2_alg_scores",
                "{}_{}_{}".format(cur_ds, cur_algo, "n_sig.txt"))).read()))
            algo_go_sims.append(float(file(os.path.join(
                constants.OUTPUT_GLOBAL_DIR, "emp_fdr", "ds_2_alg_scores",
                "{}_{}_{}".format(cur_ds, cur_algo, "var.txt"))).read()))
        fig, ax = plt.subplots(figsize=(10, 10))
        print "all data: \n{}\n{}\n{}\n{}".format(algos_signals, algo_go_sims, algos, total_num_genes)
        for h, s, c, a, gene_size, module_mean, module_std in zip(
                algos_signals, algo_go_sims, colorlist, algos,
                total_num_genes, avg_num_genes, std_num_genes):  # [0 for x in range(len(algo_go_sim_score))]
            print (h, s)
            ax.scatter(h, s, s=(50 + 2000 * (float(gene_size) / (1 + np.max(total_num_genes)))),
                       c=c, cmap='jet', label=a)
            df_series = pd.Series({"algo": a, "dataset": cur_ds,
                                   "sig_terms": h,
                                   "sig_terms_rank": pd.Series(np.array(algos_signals)).rank(ascending=0).values[
                                       np.where(np.array(algos_signals) == h)[0][0]],
                                   "variability": s,
                                   "variability_rank": pd.Series(np.array(algo_go_sims)).rank(ascending=0).values[
                                       np.where((np.array(algo_go_sims)) == s)[0][0]],
                                   "n_genes": gene_size,
                                   "module_size_mean": module_mean,
                                   "module_size_std": module_std})
            df_series.name = "{}_{}".format(cur_ds, a)
            df_summary = df_summary.append(df_series)
            df_matrix.loc[a, cur_ds] = h
        colorlist = [ml_colors.rgb2hex(colormap(i))
                     for i in np.array(list(range(len(algos)))) / float(len(algos) - 1)]
        patches = [Line2D([0], [0], marker='o', color='gray', label=a, markerfacecolor=c)
                   for i, a, c in zip(list(range(len(algos))), algos, colorlist)]
        ax.set_xlabel("# GO terms (-log10(qval)) above threshold")
        ax.set_ylabel("GO terms variability")
        ax.legend(handles=patches)
        ax.grid(True)
        plt.savefig(os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                 "hs_plot_terms_signal_algo_{}.png".format(constants.DATASET_NAME)))
    return df_summary, df_matrix
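# Example invocation (dataset and algorithm names here are illustrative; any
# directories under OUTPUT_GLOBAL_DIR that contain the expected
# modules_summary.tsv / all_modules_general.tsv files and the emp_fdr score
# files would work):
if __name__ == "__main__":
    df_summary, df_matrix = main(datasets=["TNFa_2", "MCF7_2"],
                                 algos=["netbox", "bionet", "jactivemodules_greedy"])
    print df_summary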
        else:
            small_modules += 1
    return all_bg_genes, modules


def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None,
         score_method=constants.DEG_EDGER, network_file_name="dip.sif", fdr=0.05):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(network_file_name, score_method)
    all_bg_genes, modules = run_bionet_for_all_modules(fdr, network_file_name, score_file_name, constants.IS_PVAL_SCORES)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules, all_bg_genes,
                                            score_file_name, network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="GE_ERS_1")
    main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None,
         score_method=constants.DEG_EDGER, network_file_name="dip.sif")
def main(dataset_name):
    global df_go_metadata, all_hg_score_modules, df_hg_output
    colormap = cm.rainbow
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    GO_RANK_CUTOFF = 150

    ##########################
    if TERMS_SIMILARITY_TO_NUM_OF_TERMS:
        ontology_type = 'GeneOntology'
        ignore_parameters = {'ignore': {}}
        source_type = 'obo'
        source = os.path.join(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        print "\n######################"
        print "# Loading ontology... #"
        print "######################\n"
        ontology = ontologies.load(source=source, source_type=source_type,
                                   ontology_type=ontology_type, parameters=ignore_parameters)
        print "\n######################"
        print "# Loading Annotation Corpus... #"
        print "######################\n"
        ac = AnnotationCorpus.AnnotationCorpus(ontology)
        ac.parse(os.path.join(constants.GO_DIR, "goa_human.gaf"), "gaf-2.0")
        ac.isConsistent()
        print "\n#################################"
        print "# Annotation corpus successfully loaded."
        print "#################################\n"
        semsim = GSESAMESemSim(ontology, ac)  # maxSemSim(ontology, ac)

    #################
    if ENABLE_GO_GRAPH:
        dict_result, go2geneids, geneids2go, entrez2ensembl = utils.go_hierarcies.build_hierarcy(
            roots=['GO:0008150', 'GO:0005575', 'GO:0003674'])

    #################
    all_homogeneity = []
    all_separability = []
    agg_homogeneity = []
    agg_separability = []
    algo_go_sim_score = []
    colors = []
    df_all_hg_pval = pd.DataFrame()
    df_go_metadata = pd.DataFrame()
    all_hg_score_labels = []
    all_hg_score_modules = []
    labels_by_sample = []
    total_num_genes = []
    algos = ["keypathwayminer_INES_GREEDY", "netbox", "hotnet2",
             "jactivemodules_greedy", "bionet", "jactivemodules_sa"]
    # "matisse", "reactomefi", "keypathwayminer_INES_OPTIMAL", "keypathwayminer_INES_ACO"
    algos_signals = []
    modules_signals = []
    df_all_hg_qval = pd.DataFrame()
    df_module2best_rows = []
    df_module2avg_rows = []
    df_algo2best_rows = []
    df_algo2avg_rows = []
    for i_algo, ALGO_NAME in enumerate(algos):
        df_all_hg_qval = pd.DataFrame()
        print "current algo: {}".format(ALGO_NAME)
        go2modules = {}
        modules2go = {}
        homogeneity = []
        separability = []
        all_go_terms = []
        try:
            total_num_genes.append(pd.read_csv(
                os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, ALGO_NAME,
                             "all_modules_general.tsv"), sep="\t")["total_num_genes"][0])
        except pd.errors.EmptyDataError:
            total_num_genes.append(0)
        algo2best_go_ratio = 0
        algo2avg_go_ratio = 0
        module2best_go_ratio = []
        module2avg_go_ratio = []
        df_modules_summary = pd.read_csv(os.path.join(
            constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, ALGO_NAME, "modules_summary.tsv"), sep='\t')
        i = -1
        for i in range(len(df_modules_summary.index)):
            hg_file_name = os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, ALGO_NAME,
                                        "module_{}_separated_modules_hg_samples.tsv".format(i))
            print "reading module: {} from file".format(i)
            if os.path.getsize(hg_file_name) < 2:
                modules2go[i] = np.array([])
                modules_signals.append(0)
            else:
                df_hg_output = pd.read_csv(hg_file_name, sep="\t")
                df_hg_output.index = df_hg_output["GO id"]
                df_go_metadata = pd.concat([df_go_metadata, df_hg_output[["GO id", "GO name"]]], axis=0)
                df_go_metadata = df_go_metadata[~df_go_metadata.index.duplicated(keep='first')]
                df_all_hg_pval = pd.concat([df_all_hg_pval,
                                            df_hg_output["pval"].apply(lambda x: -np.log10(x))],
                                           join='outer', axis=1)
                df_all_hg_qval = pd.concat([df_all_hg_qval,
                                            df_hg_output["qval"].apply(lambda x: -np.log10(x))],
                                           join='outer', axis=1)
                df_hg_output = df_hg_output.iloc[:min(len(df_hg_output.index), GO_RANK_CUTOFF), :]
                df_hg_output = df_hg_output[df_hg_output["qval"] <= QVAL_TH]  # .iloc[:min(len(df_hg_output.index),5),:]
                score = df_hg_output['value'].iloc[0] / float(df_modules_summary.loc[i, "#_genes"]) \
                    if len(df_hg_output.index) > 0 else 0
                df_module2best_rows.append({'name': "{}_{}".format(ALGO_NAME, i), 'algo': ALGO_NAME,
                                            'module': i, 'score': score,
                                            'num_of_genes': df_modules_summary.loc[i, "#_genes"]})
                algo2best_go_ratio += score
                score = (df_hg_output['value'] / float(df_modules_summary.loc[i, "#_genes"])
                         if len(df_hg_output.index) > 0 else np.array([0])).mean()
                df_module2avg_rows.append({'name': "{}_{}".format(ALGO_NAME, i), 'algo': ALGO_NAME,
                                           'module': i, 'score': score,
                                           'num_of_genes': df_modules_summary.loc[i, "#_genes"]})
                algo2avg_go_ratio += score
                modules_signals.append(len(df_hg_output.index))
                all_hg_score_modules.append(i)
                all_hg_score_labels.append(i_algo)
                for x in df_hg_output["GO id"]:
                    if x in go2modules:
                        go2modules[x].append(i)
                    else:
                        go2modules[x] = [i]
                modules2go[str(i)] = df_hg_output["GO id"].values
                all_go_terms = np.append(all_go_terms, df_hg_output["GO id"].values)
            i += 1
        if RATIO_TO_GO_TERM:
            df_algo2best_rows.append({'name': '{}_total_avg'.format(ALGO_NAME),
                                      'score': algo2best_go_ratio / max(i, 1)})
            df_algo2avg_rows.append({'name': '{}_total_avg'.format(ALGO_NAME),
                                     'score': algo2avg_go_ratio / max(i, 1)})
        df_all_hg_pval[pd.isna(df_all_hg_pval)] = 0
        df_all_hg_qval[pd.isna(df_all_hg_qval)] = 0
        max_per_go_term = 0
        if df_all_hg_qval.values.size > 0:
            max_per_go_term = np.sum(np.max(df_all_hg_qval.values, axis=1) >= -np.log10(QVAL_TH))
        algos_signals.append(max_per_go_term)
        all_go_terms = list(np.unique(all_go_terms))
        print "added signal : {}".format(algos_signals[-1])
        print "all_go_terms : {}".format(len(all_go_terms))
        if ENABLE_GO_GRAPH:
            plot_go_tree(dict_result, all_go_terms, ALGO_NAME, i)
        if ENABLE_GO_GRAPH and IS_GO_GRAPH_ONLY:
            continue
        adj = np.ones((len(all_go_terms), len(all_go_terms))) * (-2)
        if TERMS_SIMILARITY_TO_NUM_OF_TERMS:
            for i_x, x in enumerate(all_go_terms):
                print "calc distance between terms {}/{}".format(i_x, len(all_go_terms))
                for i_y, y in enumerate(all_go_terms):
                    if adj[i_x, i_y] != -2:
                        continue
                    adj[i_x, i_y] = semsim.SemSim(x, y)  # , ResnikSemSim(ontology,ac))
                    if np.isnan(adj[i_x, i_y]):
                        adj[i_x, i_y] = -1
                    adj[i_y, i_x] = adj[i_x, i_y]
            algo_go_sim_score.append([1 if np.isnan(x) else x for x in
                                      [np.sum(adj[adj != -1]) / (np.size(adj) - np.sum(adj == -1))]][0])
            for k, v in sorted([(int(k), v) for k, v in modules2go.iteritems()], key=lambda x: x[0]):
                print "calc homogeneity and separability for module: {}".format(k)
                v_filtered = [x for x in v]  # if x in G.nodes
                labels_by_sample.append(ALGO_NAME)
                homogeneity.append(np.nan_to_num(np.sum(
                    [adj[all_go_terms.index(x), all_go_terms.index(y)]
                     for x in v for y in v
                     if adj[all_go_terms.index(x), all_go_terms.index(y)] != -1 and x != y])
                    / (len(v_filtered) * (len(v_filtered) - 1))))
                separability.append(np.nan_to_num(np.sum(
                    [adj[all_go_terms.index(x), all_go_terms.index(y)]
                     for x in v for y in all_go_terms
                     if y not in v and adj[all_go_terms.index(x), all_go_terms.index(y)] != -1])
                    / (len(v_filtered) * (len(all_go_terms) - len(v_filtered)))))
            all_separability = all_separability + separability
            all_homogeneity = all_homogeneity + homogeneity
            agg_separability.append(np.average(separability))
            agg_homogeneity.append(np.average(homogeneity))
            fig, ax = plt.subplots(figsize=(15, 15))
            ax.scatter(homogeneity, separability)
            ax.legend()
            ax.set_xlabel("Intra-Similarity")
            ax.set_ylabel("Inter-Similarity")
            ax.grid(True)
            colors = colors + [float(i_algo) / len(algos) for x in range(len(modules2go))]
            for module_i, txt in enumerate(range(i)):
                ax.annotate(str(txt), (homogeneity[module_i], separability[module_i]))
            plt.savefig(os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                     "hs_plot_{}_{}.png".format(ALGO_NAME, constants.DATASET_NAME)))
    if RATIO_TO_GO_TERM:
        pd.DataFrame(df_module2best_rows).set_index('name').to_csv(
            os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME,
                         "GO_terms_per_module_ratio_best.tsv"), sep='\t')
        pd.DataFrame(df_module2avg_rows).set_index('name').to_csv(
            os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME,
                         "GO_terms_per_module_ratio_avg.tsv"), sep='\t')
        pd.DataFrame(df_algo2best_rows).set_index('name').to_csv(
            os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME,
                         "GO_terms_ratio_best.tsv"), sep='\t')
        pd.DataFrame(df_algo2avg_rows).set_index('name').to_csv(
            os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME,
                         "GO_terms_ratio_avg.tsv"), sep='\t')
    if TERMS_SIMILARITY_TO_NUM_OF_TERMS:
        fig, ax = plt.subplots(figsize=(15, 15))
        colorlist = [ml_colors.rgb2hex(colormap(i)) for i in colors]
        for h, s, c, a in zip(all_homogeneity, all_separability, colorlist, labels_by_sample):
            ax.scatter(h, s, s=50, c=c, vmin=0, vmax=1, cmap='jet')
        colorlist = [ml_colors.rgb2hex(colormap(i))
                     for i in np.array(list(range(len(algos)))) / float(len(algos) - 1)]
        patches = [Line2D([0], [0], marker='o', color='gray', label=a, markerfacecolor=c)
                   for i, a, c in zip(list(range(len(algos))), algos, colorlist)]
        ax.legend(handles=patches)
        ax.set_xlabel("Intra-Similarity")
        ax.set_ylabel("Inter-Similarity")
        ax.grid(True)
        plt.savefig(os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                 "hs_plot_all_{}.png".format(constants.DATASET_NAME)))
        fig, ax = plt.subplots(figsize=(10, 10))
        colorlist = [ml_colors.rgb2hex(colormap(i)) for i in colors]
        for h, s, c, a in zip(modules_signals, all_separability, colorlist, labels_by_sample):
            ax.scatter(h, s, s=50, c=c, vmin=0, vmax=1, cmap='jet')
        colorlist = [ml_colors.rgb2hex(colormap(i))
                     for i in np.array(list(range(len(algos)))) / float(len(algos) - 1)]
        patches = [Line2D([0], [0], marker='o', color='gray', label=a, markerfacecolor=c)
                   for i, a, c in zip(list(range(len(algos))), algos, colorlist)]
        ax.legend(handles=patches)
        ax.set_xlabel("# GO terms (-log10(qval)) above threshold")
        ax.set_ylabel("Algorithm Inter-Similarity")
        ax.grid(True)
        plt.savefig(os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                 "hs_plot_signal_all_{}.png".format(constants.DATASET_NAME)))
        colorlist = [ml_colors.rgb2hex(colormap(i))
                     for i in np.array(list(range(len(agg_homogeneity)))) / float(len(agg_homogeneity) - 1)]
        fig, ax = plt.subplots(figsize=(10, 10))
        for h, s, c, a in zip(agg_homogeneity, agg_separability, colorlist, algos):
            ax.scatter(h, s, s=50, c=c, cmap='jet', label=a)
        ax.set_xlabel("Intra-Similarity")
        ax.set_ylabel("Inter-Similarity")
        ax.legend()
        ax.grid(True)
        for module_i, txt in enumerate(algos):
            ax.annotate(str(txt), (agg_homogeneity[module_i], agg_separability[module_i]))
        plt.savefig(os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                 "hs_plot_agg_{}.png".format(constants.DATASET_NAME)))
        # colorlist = [ml_colors.rgb2hex(colormap(i)) for i in algo_go_sim_score/np.max(algo_go_sim_score)]
        colorlist = [ml_colors.rgb2hex(colormap(i))
                     for i in np.array(list(range(len(algos)))) / float(len(algos) - 1)]
        fig, ax = plt.subplots(figsize=(10, 10))
        for h, s, c, a, gene_size in zip(algos_signals, algo_go_sim_score, colorlist, algos,
                                         total_num_genes):  # [0 for x in range(len(algo_go_sim_score))]
            print(h, s)
            ax.scatter(h, s, s=(50 + 2000 * (float(gene_size) / np.max(total_num_genes))),
                       c=c, cmap='jet', label=a)
        colorlist = [ml_colors.rgb2hex(colormap(i))
                     for i in np.array(list(range(len(algos)))) / float(len(algos) - 1)]
        patches = [Line2D([0], [0], marker='o', color='gray', label=a, markerfacecolor=c)
                   for i, a, c in zip(list(range(len(algos))), algos, colorlist)]
        ax.set_xlabel("# GO terms (-log10(qval)) above threshold")
        ax.set_ylabel("Algorithm Inter-Similarity")
        ax.legend(handles=patches)
        ax.grid(True)
        plt.savefig(os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                 "hs_plot_terms_signal_algo_{}.png".format(constants.DATASET_NAME)))
    if GO_PCA:
        plot_pca(all_hg_score_labels, df_all_hg_pval, ml_colors, algos)
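# The ontology/annotation objects above (ontologies.load, AnnotationCorpus,
# GSESAMESemSim) come from the semantic-similarity API this file already uses.
# A minimal sketch of scoring a single pair of GO terms with those same calls
# (the two GO ids are arbitrary placeholders):
def pairwise_go_similarity(term_a="GO:0008150", term_b="GO:0003674"):
    ontology = ontologies.load(source=os.path.join(constants.GO_DIR, constants.GO_FILE_NAME),
                               source_type='obo', ontology_type='GeneOntology',
                               parameters={'ignore': {}})
    ac = AnnotationCorpus.AnnotationCorpus(ontology)
    ac.parse(os.path.join(constants.GO_DIR, "goa_human.gaf"), "gaf-2.0")
    semsim = GSESAMESemSim(ontology, ac)
    # SemSim returns a similarity score (NaN for non-comparable pairs, as handled above)
    return semsim.SemSim(term_a, term_b)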
        abs_nw_genes = len(disease_nw_genes)
        sys.stdout.write("ratio={}, absolute n={}\n".format(total_to_nw_genes_ratio, abs_nw_genes))
        if total_to_nw_genes_ratio < TOTAL_TO_NW_RATIO or abs_nw_genes < ABS_NW_GENES:
            sys.stdout.write("too few disease genes in the network\n")
        else:
            print "about to start analyzing disease: {}".format(cur_disease_name)
            disease_counter += 1
            dataset_name = "_".join(["DISGENET", ts, str(disease_counter)])
            create_ds_folders(dataset_name)
            disease_nw_genes_selected = random.sample(set(disease_nw_genes), len(set(disease_nw_genes)) / 2)
            disease_nw_genes_unselected = set(disease_nw_genes) - set(disease_nw_genes_selected)
            create_ge(disease_nw_genes_selected, bg_genes, dataset_name)
            run_dataset(dataset_name, disease_nw_genes_unselected, cur_disease_name, constants.DEG_EDGER)
        if disease_counter == MAX_DISEASES:
            break
    aggregate_reports.aggregate_disease("_".join(["DISGENET", ts]))
    sys.stdout.write("total tested diseases: {}\n".format(disease_counter))


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="DISGENET")
    main()
    # aggregate_reports.aggregate_disease("DISGENET_1542711137.18")
    print subprocess.Popen("bash ../sh/scripts/prepare_hotnet2.sh.format", shell=True,
                           stdout=subprocess.PIPE).stdout.read()  # cwd=dir_path


def run_hotnet2(deg_file_name, network_file_name):
    script = file("scripts/bionet.r").read()
    return run_rscript(script=script,
                       output_vars=["module_genes", "bg_genes"],
                       network_file_name=network_file_name,
                       deg_file_name=deg_file_name)


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="TNFa")
    params = get_parameters()
    if params is not None:
        args, NETWORK_NAME, dataset_name = params
    print subprocess.Popen("bash ../sh/scripts/run_pinnaclez.sh.format", shell=True,
                           stdout=subprocess.PIPE).stdout.read()  # cwd=dir_path
    results = file(os.path.join(constants.OUTPUT_DIR, "pinnaclez_results.txt")).read().split()
    module_genes = list(set([x for x in results if x.startswith("ENSG")]))
    dip_network = pd.read_csv(os.path.join(constants.NETWORKS_DIR, "dip_out.sif"), sep="\t",
from infra import *
import constants

constants.update_dirs(CANCER_TYPE_u="SKCM")
tcga = np.array(load_phenotype_data("SKCM_clinicalMatrix"))
gdc = np.array(load_phenotype_data("TCGA-SKCM.GDC_phenotype.tsv"))
old = np.array(load_phenotype_data("SKCM_clinicalMatrix.txt"))
integ = []
integ.append("\t".join(list(gdc[0]) +
                       ["tcga_{}".format(x) for x in tcga[0][1:]] +
                       ["old_{}".format(x) for x in old[0][1:]]))
for cur_gdc in gdc[1:]:
    row = ""
    row += "\t".join(cur_gdc)
    additional = []
    for cur_tcga in tcga[1:]:
        if cur_tcga[0] in cur_gdc[0]:
            additional = cur_tcga[1:]
            break
    additional = "\t".join(additional + ["" for x in range(len(tcga[0][1:]) - len(additional))])
    row += "\t" + additional
    additional = []
    for cur_tcga in old[1:]:
        if cur_tcga[15] in cur_gdc[0]:
            additional = cur_tcga[1:]
            break
    additional = "\t".join(additional + ["" for x in range(len(old[0][1:]) - len(additional))])
    row += "\t" + additional
    integ.append(row)
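# A hedged pandas alternative to the manual row loops above: the same three
# tables joined on their identifier columns. This assumes identifiers match
# exactly (the loops above match by substring containment), reads the raw files
# directly instead of via load_phenotype_data, and the output file name is
# illustrative:
import pandas as pd

gdc_df = pd.read_csv("TCGA-SKCM.GDC_phenotype.tsv", sep="\t", index_col=0)
tcga_df = pd.read_csv("SKCM_clinicalMatrix", sep="\t", index_col=0).add_prefix("tcga_")
old_df = pd.read_csv("SKCM_clinicalMatrix.txt", sep="\t", index_col=15).add_prefix("old_")
integ_df = gdc_df.join(tcga_df, how="left").join(old_df, how="left")
integ_df.to_csv("integrated_phenotype.tsv", sep="\t")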
write.table(data.frame(cel=rownames(pheno), pheno), row.names=F, quote=F, sep="\t", file="bladder-pheno.txt")
edata = exprs(bladderEset)
write.table(edata, row.names=T, quote=F, sep="\t", file="bladder-expr.txt")
# use dataframe instead of matrix
mod = model.matrix(~as.factor(cancer) + age, data=pheno)
t = Sys.time()
cdata = ComBat(dat=edata, batch=as.factor(pheno$batch), mod=mod, numCov=match("age", colnames(mod)))
print(Sys.time() - t)
print(cdata[1:5, 1:5])
write.table(cdata, row.names=T, quote=F, sep="\t", file="r-batch.txt")
"""

# for dataset in ["LUSC", "SKCM", "MESO", "OV", "PCPG", "PRAD", "READ", "SARC", "TGCT", "THYM", "THCA", "UCS"]:
for dataset in ["PANCAN"]:
    print "current dataset: {}".format(dataset)
    constants.update_dirs(CANCER_TYPE_u=dataset)
    data_normalizaton = "fpkm"
    gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params(
        dataset=dataset, data_normalizaton=data_normalizaton)
    pheno = pd.read_table(phenotype_file_name, index_col=0)
    dat = pd.read_table(gene_expression_file_name, index_col=0, dtype="str")
    print "done load"
    dat = dat.astype(np.float)
    print "done conversion"
    dat = dat.loc[~(dat == 0).all(axis=1)]
    pheno = pheno[pheno.batch_number.notnull()]
def main(dataset="BRCA"): constants.update_dirs(DATASET_NAME_u=dataset) data_normalizaton = "counts_normalized_by_genes_standardization" cur_json = "brca_pam53" meta_groups = None filter_expression = None meta_groups = [json.load(file("../groups/{}.json".format(cur_json)))] filter_expression = json.load(file("../filters/{}.json".format(cur_json))) gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params( dataset=dataset, data_normalizaton=data_normalizaton) phenotype_file_name = 'BRCA_clinicalMatrix' tested_gene_expression, h_rows, h_cols, labels_assignment, survival_dataset = infra.load_integrated_ge_data( "dip_bg.txt", "dip_bg.txt", gene_expression_file_name, survival_file_name, phenotype_file_name, gene_filter_file_name=None, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=None) file(os.path.join(constants.DATA_DIR, "classes.tsv"), 'w+').write('\t'.join([str(x) for x in labels_assignment[0]])) h_cols = [x.split('.')[0] for x in h_cols] df_data = pd.DataFrame(index=h_rows, columns=h_cols, data=tested_gene_expression).T df_data.to_csv(os.path.join(constants.DATA_DIR, 'ge.tsv'), index_label="eid", sep="\t") var_th_index = None start_k = 2 end_k = 2 # algos = ["matisse", "keypathwayminer_INES_GREEDY", "netbox", "hotnet2", "bionet", "jactivemodules_greedy", # "jactivemodules_sa", "reactomefi"] algos = ["netbox"] run_dataset(dataset, score_method=constants.DEG_EDGER) gene_list_file_names = [] generate_plot = True clustering_algorithm = "correlation" for cur_algo in algos: algo_output = json.loads( file( os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format( cur_algo))).read().split("\n")[1]) i = 0 algo_pvals = [] random_pvals = [] df_mean = pd.DataFrame() all_algo_genes_flatted = [] gene_2_module = {} while True: algo_genes_flatted = [ x['eid'] for x in algo_output if i in x['modules'] ] for cur in algo_genes_flatted: gene_2_module[cur] = i all_algo_genes_flatted += algo_genes_flatted if len(algo_genes_flatted) == 0 and i > 0: break if len(algo_genes_flatted) < 4: i += 1 continue gene_list_file_names.append( os.path.join(constants.LIST_DIR, cur_algo + ".txt")) file(gene_list_file_names[-1], 'w+').write("\n".join(algo_genes_flatted)) df_mean = pd.concat((df_mean, df_data[df_data.index.isin( algo_genes_flatted)].mean().to_frame().T)) i += 1 all_genes_file_name = os.path.join(constants.LIST_DIR, "{}_all_genes.txt".format(cur_algo)) file(all_genes_file_name, 'w+').write('\n'.join(all_algo_genes_flatted)) mean_file_name = os.path.join(constants.DATA_DIR, "mean.tsv") df_mean.index = np.arange(df_mean.shape[0]) df_mean.to_csv(mean_file_name, sep="\t", index_label="eid") index_file_name = os.path.join(constants.LIST_DIR, "{}_indices.txt".format(cur_algo)) file(index_file_name, 'w+').write('\n'.join([str(x) for x in df_mean.index.values])) algo_pvals.append( find_clusters_and_survival( tested_gene_list_file_name="{}_indices.txt".format(cur_algo), total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=mean_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, is_unsupervised=True, start_k=start_k, end_k=end_k, filter_expression=filter_expression, meta_groups=meta_groups, clustering_algorithm=clustering_algorithm, plot=generate_plot)) for cur in range(RAND_TIMES): random_set_file_name = generate_random_set( random_size=len(df_mean.index), meta_gene_set="{}_all_genes.txt".format(cur_algo)) 
            random_pvals.append(
                find_clusters_and_survival(
                    tested_gene_list_file_name=random_set_file_name,
                    total_gene_list_file_name="dip_bg.txt",
                    gene_expression_file_name=gene_expression_file_name,
                    phenotype_file_name=phenotype_file_name,
                    survival_file_name=survival_file_name,
                    var_th_index=var_th_index, is_unsupervised=True,
                    start_k=start_k, end_k=end_k,
                    filter_expression=filter_expression, meta_groups=meta_groups,
                    clustering_algorithm=clustering_algorithm, plot=generate_plot))
        print "algo pvals"
        print algo_pvals
        print "# below TH: {}".format(
            len([x for x in algo_pvals if any(y < 0.001 for y in x)]))
        print "random pvals"
        print random_pvals
        print "# below TH: {}".format(
            len([x for x in random_pvals if any(y < 0.001 for y in x)]))
        print "# of modules better than random: {}/{}".format(
            len([x for x1, x2 in zip(algo_pvals, random_pvals) if x1[0] < x2[0]]),
            len(algo_pvals))
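# --- Hedged aside (ours) -------------------------------------------------------
# The prints above pair each module's survival p-values with those of a
# size-matched random gene set. The bookkeeping, distilled into a helper
# (the name summarize_pvals is ours; the 0.001 default mirrors the threshold
# hard-coded in the script):
def summarize_pvals(algo_pvals, random_pvals, th=0.001):
    # modules with at least one p-value below the threshold
    significant = len([x for x in algo_pvals if any(y < th for y in x)])
    # modules whose leading p-value beats that of their random counterpart
    better = len([1 for a, r in zip(algo_pvals, random_pvals) if a[0] < r[0]])
    return significant, better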
    conf_file_name = init_specific_params(score_file_name, dest_algo_dir)
    script_file_name = format_script(os.path.join(constants.SH_DIR, script_name),
                                     BASE_FOLDER=constants.BASE_PROFILE,
                                     DATASET_DIR=constants.DATASET_DIR,
                                     CONFIG_FILE_NAME=conf_file_name,
                                     NETBOX_DIR=dest_algo_dir)
    print subprocess.Popen("bash {}".format(script_file_name), shell=True,
                           stdout=subprocess.PIPE, cwd=dest_algo_dir).stdout.read()
    modules, all_bg_genes = extract_modules_and_bg(bg_genes, dest_algo_dir)
    os.remove(script_file_name)
    os.remove(conf_file_name)
    shutil.rmtree(dest_algo_dir)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules,
                                            all_bg_genes, score_file_name,
                                            network_file_name, disease_name,
                                            expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    # output_modules(output_file_name, modules, score_file_name, output_base_dir)


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="MCF7_2")
    main()
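# --- Hedged aside (ours) -------------------------------------------------------
# format_script is called by every runner here but defined elsewhere in the
# repo. Assuming it fills named placeholders in an .sh template and returns
# the path of the rendered script (which the caller runs with bash and then
# removes), a minimal sketch of that contract:
import os

def format_script_sketch(template_path, **params):
    with open(template_path) as f:
        body = f.read().format(**params)  # e.g. "{NETBOX_DIR}" -> dest_algo_dir
    instance_path = os.path.splitext(template_path)[0] + "_instance.sh"
    with open(instance_path, "w") as f:
        f.write(body)
    return instance_path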
write.table(data.frame(cel=rownames(pheno), pheno), row.names=F, quote=F, sep="\t", file="bladder-pheno.txt")

edata = exprs(bladderEset)
write.table(edata, row.names=T, quote=F, sep="\t", file="bladder-expr.txt")

# use dataframe instead of matrix
mod = model.matrix(~as.factor(cancer) + age, data=pheno)
t = Sys.time()
cdata = ComBat(dat=edata, batch=as.factor(pheno$batch), mod=mod, numCov=match("age", colnames(mod)))
print(Sys.time() - t)
print(cdata[1:5, 1:5])
write.table(cdata, row.names=TRUE, quote=F, sep="\t", file="r-batch.txt")
"""

# for dataset in ["LUSC", "SKCM", "MESO", "OV", "PCPG", "PRAD", "READ", "SARC", "TGCT", "THYM", "THCA", "UCS"]:
for dataset in ["PANCAN"]:
    print "current dataset: {}".format(dataset)
    constants.update_dirs(DATASET_NAME_u=dataset)
    data_normalizaton = "fpkm"
    gene_expression_file_name, phenotype_file_name, survival_file_name, \
        mutation_file_name, mirna_file_name, pval_preprocessing_file_name = \
        build_gdc_params(dataset=dataset, data_normalizaton=data_normalizaton)
    pheno = pd.read_table(os.path.join(constants.DATA_DIR, phenotype_file_name), index_col=0)
    dat = pd.read_table(os.path.join(constants.DATA_DIR, gene_expression_file_name),
                        index_col=0, dtype="str")
    print "done load"
    dat = dat.astype(np.float)
    print "done conversion"
    # drop genes whose expression is zero across all samples
    dat = dat.loc[~(dat == 0).all(axis=1)]
import constants
import infra
import pandas as pd
import numpy as np
import os

constants.update_dirs(DATASET_NAME_u="PRAD_2")

#### PREPARE DICT ###
company = "illumina"
# agilent
# old_key_field = "AGILENT WholeGenome 4x44k v1 probe"
# new_field = "agilent_44_v1"
old_key_field = "ILLUMINA HumanWG 6 V3 probe"
new_key_field = "illumina_WG_v3"
old_value_field = "Gene stable ID"
new_value_field = "eid"

df_dict = pd.read_csv(os.path.join(constants.DICTIONARIES_DIR,
                                   "{}_ensembl_biomart.tsv".format(company)), sep='\t')
df_dict = df_dict.dropna()
df_dict.index = df_dict[old_key_field]
df_dict = df_dict.drop([old_key_field], axis=1)
df_dict = df_dict.rename(columns={old_value_field: new_value_field})
# keep a single Ensembl id per probe
df_dict = df_dict[~df_dict.index.duplicated(keep='first')]
df_dict.to_csv(os.path.join(constants.DICTIONARIES_DIR,
                            "{}_ensembl.tsv".format(company)), sep='\t')
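# --- Hedged aside (ours) -------------------------------------------------------
# A usage sketch for the mapping written above: translate an expression matrix
# indexed by Illumina probe ids to Ensembl ids ("eid"). The file and column
# names follow the script; the expr_df input is hypothetical.
import os
import pandas as pd

def probes_to_eids(expr_df, dictionaries_dir):
    mapping = pd.read_csv(os.path.join(dictionaries_dir, "illumina_ensembl.tsv"),
                          sep="\t", index_col=0)
    # keep only probes that have an Ensembl id, then swap the index
    expr_df = expr_df[expr_df.index.isin(mapping.index)]
    expr_df.index = mapping.loc[expr_df.index, "eid"].values
    return expr_df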
def main(dataset="COMB"): constants.update_dirs(DATASET_NAME_u=dataset) data_normalizaton = "fpkm_bc" cur_json = "cancer_types" meta_groups = [json.load(file("../groups/{}.json".format(cur_json)))] filter_expression = json.load(file("../filters/{}.json".format(cur_json))) gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params( dataset=dataset, data_normalizaton=data_normalizaton) tested_gene_expression, h_rows, h_cols, labels_assignment, survival_dataset = infra.load_integrated_ge_data( "dip_bg.txt", "dip_bg.txt", gene_expression_file_name, survival_file_name, phenotype_file_name, gene_filter_file_name=None, filter_expression=filter_expression, meta_groups=meta_groups, var_th_index=None) h_cols = [x.split('.')[0] for x in h_cols] pd.DataFrame(index=h_rows, columns=h_cols, data=tested_gene_expression).T.to_csv(os.path.join( constants.DATA_DIR, 'ge.tsv'), index_label="eid", sep="\t") var_th_index = None start_k = 2 end_k = 2 # algos = ["matisse", "keypathwayminer_INES_GREEDY", "netbox", "hotnet2", "bionet", "jactivemodules_greedy", # "jactivemodules_sa", "reactomefi"] algos = ["matisse"] run_dataset(dataset, score_method=constants.DEG_EDGER) gene_list_file_names = [] generate_plot = True clustering_algorithm = "correlation" for cur_algo in algos: algo_output = json.loads( file( os.path.join(constants.OUTPUT_DIR, "{}_client_output.txt".format( cur_algo))).read().split("\n")[1]) i = 0 algo_pvals = [] random_pvals = [] while True: algo_genes_flatted = [ x['eid'] for x in algo_output if i in x['modules'] ] if len(algo_genes_flatted) == 0 and i > 0: break if len(algo_genes_flatted) == 0: i += 1 continue gene_list_file_names.append( os.path.join(constants.LIST_DIR, cur_algo + ".txt")) file(gene_list_file_names[-1], 'w+').write("\n".join(algo_genes_flatted)) algo_pvals.append( find_clusters_and_survival( tested_gene_list_file_name=gene_list_file_names[-1], total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=gene_expression_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, is_unsupervised=True, start_k=start_k, end_k=end_k, filter_expression=filter_expression, meta_groups=meta_groups, clustering_algorithm=clustering_algorithm, plot=generate_plot)) random_set_file_name = generate_random_set( random_size=len(algo_genes_flatted), meta_gene_set="dip_bg.txt") random_pvals.append( find_clusters_and_survival( tested_gene_list_file_name=random_set_file_name, total_gene_list_file_name="protein_coding.txt", gene_expression_file_name=gene_expression_file_name, phenotype_file_name=phenotype_file_name, survival_file_name=survival_file_name, var_th_index=var_th_index, is_unsupervised=True, start_k=start_k, end_k=end_k, filter_expression=filter_expression, meta_groups=meta_groups, clustering_algorithm=clustering_algorithm, plot=generate_plot)) i += 1 print " algo pvals" print algo_pvals print "# above TH: {}".format( len([x for x in algo_pvals if any(y < 0.001 for y in x)])) print " random pvals" print random_pvals print "# above TH: {}".format( len([x for x in random_pvals if any(y < 0.001 for y in x)])) print "# of modules better over random: {}/{}".format( len([ x for x1, x2 in zip(algo_pvals, random_pvals) if x1[0] < x2[0] ]), len(algo_pvals))
    return bg_genes, network_file_name


def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None,
         score_method=constants.DEG_EDGER):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = \
        server.init_common_params(NETWORK_NAME, score_method)
    if score_method == constants.PREDEFINED_SCORE:
        raise Exception("Cannot run this algo on score-based metrics. "
                        "Please provide a gene expression file")
    bg_genes, network_file_name = init_specific_params(NETWORK_NAME)
    format_script(os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)),
                  BASE_FOLDER=constants.BASE_PROFILE, DATASET_DIR=constants.DATASET_DIR,
                  ALGO_DIR=ALGO_DIR, NETWORK_NAME=NETWORK_NAME)
    subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME),
                     shell=True, stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()
    modules, all_bg_genes = extract_modules_and_bg(bg_genes)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, modules, all_bg_genes,
                                            score_file_name, network_file_name,
                                            disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="SOC")
    main()