Example #1
def main():
    list_of_files_format = ["{ds}-{rg}/exp_seq.{ds}-{rg}.tsv.gz"]
    constants.ALL_CANCER_TYPES = ["PRAD", "LICA", "RECA", "LIRI", "BRCA", "OV", "PACA", "PACA", "PAEN"] # ["KIRC", "KIRP", "LUSC", "LUAD", "COAD", "BRCA", "STAD", "LIHC", "READ", "PRAD", "BLCA", "HNSC", "THCA", "UCEC", "OV", "PAAD"]
    all_regions = ["FR", "FR", "EU", "JP", "KR", "AU", "AU", "CA", "AU"]  # "PACA" appears twice on purpose: PACA-AU and PACA-CA are separate ICGC projects
    for i_ds, (cur, cur_rg) in enumerate(zip(constants.ALL_CANCER_TYPES, all_regions)):
        if cur == "PANCAN": continue
        constants.update_dirs(DATASET_NAME_u="ICGC_{}_{}".format(cur, cur_rg))
        if not os.path.exists(constants.DATA_DIR):
            os.makedirs(constants.DATA_DIR)

        print "fetching data for {} ({}\{})".format(cur,constants.ALL_CANCER_TYPES.index(cur), len(constants.ALL_CANCER_TYPES))
        list_of_files = [fr.format(ds=cur, rg=cur_rg) for fr in list_of_files_format]
        for cur_file_name in list_of_files:
            if not os.path.exists(os.path.join(constants.DATA_DIR, cur_file_name)) and not os.path.exists(os.path.join(constants.DATA_DIR, ".".join(cur_file_name.split(".")[:-1]))):
                # run_and_printchar(["wget", "https://gdc.xenahubs.net/download/TCGA-{}/Xena_Matrices/{}".format(cur, cur_file_name), constants.TCGA_DATA_DIR])
                download("https://dcc.icgc.org/api/v1/download?fn=/current/Projects/"+ cur_file_name, constants.DATA_DIR)
        print "extract data for {}".format(cur)
        for cur_file_name in list_of_files:
            if os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) and not os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1]))):
                with gzip.open(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1]), 'rb') as f_in:
                    with open(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1])), 'wb') as f_out:
                        print os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])
                        shutil.copyfileobj(f_in, f_out)

        print "delete redundant gz files {}".format(cur)
        for cur_file_name in list_of_files:
            if os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) and os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1]))):
                os.remove(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1]))

        if not os.path.exists(constants.OUTPUT_DIR):
            os.makedirs(constants.OUTPUT_DIR)

        if not os.path.exists(constants.CACHE_DIR):
            os.makedirs(constants.CACHE_DIR)
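The download() helper used above is not shown in this snippet. Below is a minimal, self-contained sketch of the same fetch, gunzip, and cleanup flow (Python 2 standard library, matching the snippets; the function name and URL handling are illustrative, not part of the original code):

import gzip
import os
import shutil
import urllib

def fetch_and_extract(url, data_dir):
    # Download url into data_dir, decompress the .gz, then drop the archive.
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    gz_path = os.path.join(data_dir, url.split("/")[-1])
    out_path = gz_path[:-len(".gz")]
    if not os.path.exists(gz_path) and not os.path.exists(out_path):
        urllib.urlretrieve(url, gz_path)  # skip files fetched earlier
    if os.path.exists(gz_path) and not os.path.exists(out_path):
        with gzip.open(gz_path, "rb") as f_in:
            with open(out_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)  # stream-decompress to disk
        os.remove(gz_path)  # the .gz is now redundant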
Example #2
def main(dataset_name=constants.DATASET_NAME,
         disease_name=None,
         expected_genes=None,
         score_method=constants.DEG_EDGER,
         network_file_name="dip.sif"):

    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(
        network_file_name, score_method)

    heat_file_name, network_file_name = init_specific_params(
        score_file_name, score_method, network_file_name)

    script_file_name = format_script(
        os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)),
        ALGO_DIR=ALGO_DIR,
        CACHE_DIR=constants.CACHE_DIR,
        OUTPUT_DIR=constants.OUTPUT_DIR,
        NETWORK_NAME=os.path.splitext(os.path.basename(network_file_name))[0])
    print subprocess.Popen(
        "bash {}".format(script_file_name), shell=True,
        stdout=subprocess.PIPE).stdout.read()  # cwd=dir_path
    os.remove(script_file_name)
    modules, all_bg_genes = extract_modules_and_bg(bg_genes)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules,
                                            all_bg_genes, score_file_name,
                                            network_file_name, disease_name,
                                            expected_genes)

    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
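format_script() is not defined in these snippets; judging from the call sites, it fills {PLACEHOLDER}-style keys in a shell template and returns the path of a runnable copy. A plausible minimal stand-in (an assumption about the template format, not the project's actual implementation):

import os
import tempfile

def format_script(template_path, **params):
    # Fill {ALGO_DIR}, {CACHE_DIR}, ... placeholders (assumed str.format template).
    body = open(template_path).read().format(**params)
    fd, script_path = tempfile.mkstemp(suffix=".sh")
    os.fdopen(fd, "w").write(body)
    return script_path  # the caller runs it with bash and removes it afterwards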
Example #3
def aggregate_disease(DATASET_NAME):
    counter = 1
    report_all = pd.DataFrame()
    report_k = pd.DataFrame()
    while True:
        constants.update_dirs(DATASET_NAME_u=DATASET_NAME +
                              "_{}".format(counter))
        if not os.path.exists(constants.DATASET_DIR): break

        reports_metadata = {
            "all": aggregate_all(
                os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME),
                "all_modules_disease"),
            "k": aggregate_all(
                os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME),
                "k_{}_modules_disease".format(network.MODULE_TH))
        }
        report_all = pd.concat([report_all, reports_metadata["all"][0]])
        report_k = pd.concat([report_k, reports_metadata["k"][0]])

        counter += 1
    # report_all = report_all.set_index("algo")
    # report_k = report_k.set_index("algo")

    diseases_summary_headers = [
        "TP_mean", "TP_std", "TP+FN_mean", "TP+FN_std", "TP+TN_mean",
        "TP+TN_std", "TP/(TP+TN)_mean", "TP/(TP+TN)_std", "TP/(TP+FN)_mean",
        "TP/(TP+FN)_std", "F1_mean", "F1_std"
    ]
    diseases_headers = [
        'disease_name', 'TP', 'TP+FN_(_true_)', 'TP+TN_(_retrieved_)',
        'TP/(TP+FN)_(_recall_)', 'TP/(TP+TN)_(_precision_)', 'F1',
        'module_size_avg', 'module_size_std', 'num_of_modules'
    ]
    report_all = report_all[diseases_headers]
    report_k = report_k[diseases_headers]
    df_summary_all = calc_p_r_f_scores(report_all)[diseases_summary_headers]
    df_summary_k = calc_p_r_f_scores(report_k)[diseases_summary_headers]
    for k, v in {
            "all": [report_all, df_summary_all],
            "k_{}".format(network.MODULE_TH): [report_k, df_summary_k]
    }.iteritems():

        format_script(os.path.join(constants.TEMPLATES_DIR, "report.html"),
                      REPORT=[],
                      SCORE=[],
                      DISEASE_GENES=json.dumps(to_full_list(v[0], "algo")),
                      DISEASE_GENES_SUMMARY=json.dumps(
                          to_full_list(v[1], "algo")),
                      MODULE_FILTER=(k + "_modules"),
                      MODULES_SCORE=[],
                      EMB_WU=[])
        output_dir = os.path.join(constants.OUTPUT_GLOBAL_DIR, DATASET_NAME)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        shutil.move(os.path.join(constants.TEMPLATES_DIR, "report.html"),
                    os.path.join(output_dir, "report_{}.html".format(k)))
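In this report's notation, "TP+FN" is the true gene set and "TP+TN" is the retrieved set, so TP/(TP+FN) is recall and TP/(TP+TN) is precision. calc_p_r_f_scores() is not shown; a sketch of the scores its headers imply:

def p_r_f(tp, n_true, n_retrieved):
    recall = tp / float(n_true)          # TP/(TP+FN)
    precision = tp / float(n_retrieved)  # TP/(TP+TN) in this report's naming
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1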
Example #4
def main():
    list_of_files_format = ["TCGA.{ds}.sampleMap/HiSeqV2.gz","TCGA.{ds}.sampleMap/{ds}_clinicalMatrix.gz","TCGA.{ds}.sampleMap/miRNA_HiSeq_gene.gz"]
    constants.ALL_CANCER_TYPES = ["PAAD", "OV"] # ["ESCA", "KIRC", "KIRP", "KICH", "LUSC", "LUAD", "COAD", "BRCA", "STAD", "LIHC", "READ", "PRAD", "BLCA", "HNSC", "THCA", "UCEC"]
    for cur in constants.ALL_CANCER_TYPES:
        if cur == "PANCAN": continue
        constants.update_dirs(DATASET_NAME_u="TCGA_"+cur)
        if not os.path.exists(constants.DATA_DIR):
            os.makedirs(constants.DATA_DIR)

        print "fetching data for {} ({}\{})".format(cur,constants.ALL_CANCER_TYPES.index(cur), len(constants.ALL_CANCER_TYPES))
        list_of_files = [fr.format(ds=cur) for fr in list_of_files_format]
        for cur_file_name in list_of_files:
            if not os.path.exists(os.path.join(constants.DATA_DIR, cur_file_name)) and not os.path.exists(os.path.join(constants.DATA_DIR, ".".join(cur_file_name.split(".")[:-1]))):
                # run_and_printchar(["wget", "https://gdc.xenahubs.net/download/TCGA-{}/Xena_Matrices/{}".format(cur, cur_file_name), constants.TCGA_DATA_DIR])
                download("https://tcga.xenahubs.net/download/"+ cur_file_name, constants.DATA_DIR)
        print "extract data for {}".format(cur)
        for cur_file_name in list_of_files:
            if os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) and not os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1]))):
                with gzip.open(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1]), 'rb') as f_in:
                    with open(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1])), 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

        print "delete redundant gz files {}".format(cur)
        for cur_file_name in list_of_files:
            if os.path.exists(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1])) and os.path.exists(os.path.join(constants.DATA_DIR,".".join(cur_file_name.split('/')[-1].split(".")[:-1]))):
                os.remove(os.path.join(constants.DATA_DIR,cur_file_name.split('/')[-1]))

        if not os.path.exists(constants.OUTPUT_DIR):
            os.makedirs(constants.OUTPUT_DIR)

        if not os.path.exists(constants.CACHE_DIR):
            os.makedirs(constants.CACHE_DIR)
Example #5
def main():
    list_of_files_format = ["TCGA-{}.htseq_counts.tsv.gz","TCGA-{}.htseq_fpkm.tsv.gz","TCGA-{}.htseq_fpkm-uq.tsv.gz","TCGA-{}.GDC_phenotype.tsv.gz","TCGA-{}.survival.tsv.gz","TCGA-{}.mutect2_snv.tsv.gz", "TCGA-{}.mirna.tsv.gz"]
    for cur in constants.ALL_CANCER_TYPES:
        if cur == "PANCAN": continue
        constants.update_dirs(CANCER_TYPE_u=cur)
        if not os.path.exists(constants.TCGA_DATA_DIR):
            os.makedirs(constants.TCGA_DATA_DIR)

        print "fetching data for {} ({}\{})".format(cur,constants.ALL_CANCER_TYPES.index(cur), len(constants.ALL_CANCER_TYPES))
        list_of_files = [fr.format(cur) for fr in list_of_files_format]
        for cur_file_name in list_of_files:
            if not os.path.exists(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur))) and not os.path.exists(os.path.join(constants.TCGA_DATA_DIR,".".join(cur_file_name.format(cur).split(".")[:-1]))):
                # run_and_printchar(["wget", "https://gdc.xenahubs.net/download/TCGA-{}/Xena_Matrices/{}".format(cur, cur_file_name), constants.TCGA_DATA_DIR])
                download("https://gdc.xenahubs.net/download/TCGA-{}/Xena_Matrices/{}".format(cur, cur_file_name), constants.TCGA_DATA_DIR)
        print "extract data for {}".format(cur)
        for cur_file_name in list_of_files:
            if os.path.exists(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur))) and not os.path.exists(os.path.join(constants.TCGA_DATA_DIR,".".join(cur_file_name.format(cur).split(".")[:-1]))):
                with gzip.open(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur)), 'rb') as f_in:
                    with open(os.path.join(constants.TCGA_DATA_DIR,".".join(cur_file_name.format(cur).split(".")[:-1])), 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

        print "delete redundant gz files {}".format(cur)
        for cur_file_name in list_of_files:
            if os.path.exists(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur))) and os.path.exists(os.path.join(constants.TCGA_DATA_DIR,".".join(cur_file_name.format(cur).split(".")[:-1]))):
                os.remove(os.path.join(constants.TCGA_DATA_DIR,cur_file_name.format(cur)))

        if not os.path.exists(constants.OUTPUT_DIR):
            os.makedirs(constants.OUTPUT_DIR)

        if not os.path.exists(constants.CACHE_DIR):
            os.makedirs(constants.CACHE_DIR)


# main()
Example #6
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes = None, score_method=constants.DEG_EDGER, network_file_name="dip.sif"):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    search_method = "sa"
    network_file_name, score_file_name, score_method, bg_genes= server.init_common_params(network_file_name, score_method)

    results_file_name = init_specific_params(search_method)

    script_file_name=format_script(os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)), BASE_FOLDER=constants.BASE_PROFILE,
                  DATASET_DIR=constants.DATASET_DIR,
                  ALGO_DIR=ALGO_DIR, NETWORK_NAME=network_file_name, SCORE_FILE_NAME=score_file_name,
                  IS_GREEDY=str(search_method == "greedy"), OUTPUT_FILE=results_file_name, NUM_OF_MODULES=10, OVERLAP_THRESHOLD=0)

    subprocess.Popen("bash {}".format(script_file_name), shell=True,
                     stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()

    os.remove(script_file_name)
    modules_genes_file_name = os.path.join(constants.OUTPUT_DIR, "{}_{}_module_genes.txt".format(ALGO_NAME, search_method))
    all_bg_genes, modules = extract_modules_and_bg(bg_genes, results_file_name, modules_genes_file_name)

    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME + "_" + search_method, dataset_name, modules, all_bg_genes, score_file_name, network_file_name, disease_name, expected_genes)

    output_file_name=os.path.join(constants.OUTPUT_DIR,
                 "{}_{}_client_output.txt".format(ALGO_NAME, search_method))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
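The Popen(...).stdout.read() idiom used throughout these snippets silently ignores the script's exit status. An equivalent sketch with subprocess.check_output, which raises CalledProcessError on a non-zero exit (the helper name is illustrative):

import subprocess

def run_script(script_file_name, cwd=None):
    # Same effect as Popen("bash ...", stdout=PIPE).stdout.read(),
    # except that a failing script raises instead of being ignored.
    return subprocess.check_output(["bash", script_file_name], cwd=cwd)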
Example #7
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes = None, score_method=constants.DEG_EDGER, network_file_name="dip.sif", fdr=0.05):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(network_file_name , score_method)

    all_bg_genes, modules = run_bionet_for_all_modules(fdr, network_file_name, score_file_name, constants.IS_PVAL_SCORES)

    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules, all_bg_genes, score_file_name, network_file_name, disease_name, expected_genes)

    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
Example #8
def main(dataset_name=constants.DATASET_NAME,
         disease_name=None,
         expected_genes=None):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(
        NETWORK_NAME)
    STRATEGY = "INES"
    algorithm = "OPTIMAL"
    omitted_genes = []
    modules = []
    all_bg_genes = []
    cur_network_name = NETWORK_NAME
    for cur_i_module in range(40):
        binary_score_file_name, cur_network_file_name = init_specific_params(
            score_file_name, score_method, omitted_genes, network_file_name,
            str(cur_i_module))

        format_scripts(algo_name=ALGO_NAME,
                       score_file_name=binary_score_file_name,
                       network_name=cur_network_file_name,
                       STRATEGY=STRATEGY,
                       algorithm=algorithm)
        print subprocess.Popen("bash {}/run_{}.sh".format(
            constants.SH_DIR, ALGO_NAME),
                               shell=True,
                               stdout=subprocess.PIPE,
                               cwd=ALGO_DIR).stdout.read()
        module, all_bg_gene = extract_module_genes(bg_genes, STRATEGY,
                                                   algorithm)
        if len(module[0]) > 3:
            modules.append(module[0])
            all_bg_genes.append(all_bg_gene[0])
        omitted_genes += list(module[0])
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(
            ALGO_NAME + "_" + STRATEGY + "_" + algorithm, modules,
            all_bg_genes, score_file_name, network_file_name, disease_name,
            expected_genes)

    output_file_name = os.path.join(
        constants.OUTPUT_DIR,
        "{}_{}_{}_client_output.txt".format(ALGO_NAME, STRATEGY, algorithm))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
Example #9
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes = None):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(NETWORK_NAME)
    STRATEGY = "GLONE"
    binary_score_file_name = init_common_params(score_file_name, score_method)
    format_scripts(algo_name=ALGO_NAME, score_file_name=binary_score_file_name, network_name=NETWORK_NAME, STRATEGY=STRATEGY)
    print subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME), shell=True,
                           stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()
    modules, all_bg_genes = extract_module_genes(bg_genes, STRATEGY)

    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME + "_" + STRATEGY, modules, all_bg_genes, score_file_name, network_file_name, disease_name, expected_genes)

    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir )
Example #10
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes = None, score_method=constants.DEG_EDGER, network_file_name="dip.sif"):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(network_file_name, score_method)
    strategy = "INES"
    algorithm = "GREEDY"
    omitted_genes = []
    modules = []
    all_bg_genes = []
    dest_algo_dir = "{}_{}".format(ALGO_DIR, random.random())
    shutil.copytree(ALGO_DIR, dest_algo_dir)
    empty_counter = 0
    for cur_i_module in range(40):
        binary_score_file_name, cur_network_file_name = init_specific_params(score_file_name, score_method, omitted_genes,
                                                                         network_file_name, str(random.random()), dest_algo_dir)

        script_file_name=format_scripts(score_file_name=binary_score_file_name, network_name=cur_network_file_name,
                       STRATEGY=strategy, algorithm=algorithm, algo_dir=dest_algo_dir, dataset_name=dataset_name)
        print subprocess.Popen("bash {}".format(script_file_name), shell=True,
                               stdout=subprocess.PIPE, cwd=dest_algo_dir).stdout.read()
        module, all_bg_gene = extract_module_genes(bg_genes, strategy, algorithm, dest_algo_dir)

        if len(module[0]) > 3:
            empty_counter=0
            modules.append(module[0])
            all_bg_genes.append(all_bg_gene[0])
        else:
            empty_counter+=1
        omitted_genes += list(module[0])
        os.remove(script_file_name)

        if empty_counter>3:
            print "got more that 3 smalle modules in row. continue..."
            break

    shutil.rmtree(dest_algo_dir)

    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports("{}_{}_{}".format(ALGO_NAME,strategy, algorithm), dataset_name, modules, all_bg_genes, score_file_name, network_file_name, disease_name, expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format("{}_{}_{}".format(ALGO_NAME,strategy, algorithm)))
    output_modules(output_file_name, modules, score_file_name, output_base_dir )
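Copying ALGO_DIR with a random.random() suffix (as above) gives each run a private working directory, but the names can collide. A collision-free sketch using tempfile.mkdtemp, assuming the algorithm only needs a disposable copy of its directory:

import os
import shutil
import tempfile

def private_algo_copy(algo_dir):
    dest = tempfile.mkdtemp(prefix=os.path.basename(algo_dir) + "_")
    os.rmdir(dest)  # copytree requires a non-existent target
    shutil.copytree(algo_dir, dest)
    return dest  # the caller works in dest, then shutil.rmtree(dest)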
Example #11
def main(dataset_name=constants.DATASET_NAME,
         disease_name=None,
         expected_genes=None,
         score_method=constants.DEG_EDGER):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(
        NETWORK_NAME, score_method)

    ge_file_name, network_file_name, output_file_name = init_specific_params(
        ge_file_name=os.path.join(constants.DATA_DIR, "ge.tsv"),
        network_file_name=os.path.join(constants.NETWORKS_DIR,
                                       NETWORK_NAME + ".sif"))

    format_script(os.path.join(constants.SH_DIR,
                               "run_{}.sh".format(ALGO_NAME)),
                  ALGO_BASE_DIR=constants.ALGO_BASE_DIR,
                  GE_FILE_NAME=ge_file_name,
                  NETWORK_FILE_NAME=network_file_name,
                  BETA=0.95,
                  MINIMAL_MODULE_SIZE=4,
                  MAXIMAL_MODULE_SIZE=1000,
                  OUTPUT_FILE_NAME=output_file_name)

    subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME),
                     shell=True,
                     stdout=subprocess.PIPE,
                     cwd=ALGO_DIR).stdout.read()

    modules, all_bg_genes = extract_modules_and_bg(bg_genes, output_file_name)

    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, modules, all_bg_genes,
                                            score_file_name, network_file_name,
                                            disease_name, expected_genes)

    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
Example #12
def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes = None, score_method=constants.DEG_EDGER):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(NETWORK_NAME, score_method)
    if score_method == constants.PREDEFINED_SCORE:
        raise Exception("Cannot run this algo on scor-based metrics. please provide gene expression file")

    bg_genes, network_file_name = init_specific_params(NETWORK_NAME)

    format_script(os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)), BASE_FOLDER=constants.BASE_PROFILE,
                  DATASET_DIR=constants.DATASET_DIR, ALGO_DIR=ALGO_DIR, NETWORK_NAME=NETWORK_NAME)

    subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME), shell=True,
                     stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()

    modules, all_bg_genes = extract_modules_and_bg(bg_genes)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, modules, all_bg_genes, score_file_name, network_file_name, disease_name, expected_genes)

    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir )
Example #13
def main(dataset_name=constants.DATASET_NAME,
         disease_name=None,
         expected_genes=None,
         score_method=constants.DEG_EDGER,
         network_file_name="dip.sif"):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(
        network_file_name, score_method)

    script_name = "run_{}.sh".format(ALGO_NAME)
    dest_algo_dir = "{}_{}".format(ALGO_DIR, random.random())
    shutil.copytree(ALGO_DIR, dest_algo_dir)
    conf_file_name = init_specific_params(score_file_name, dest_algo_dir)
    script_file_name = format_script(os.path.join(constants.SH_DIR,
                                                  script_name),
                                     BASE_FOLDER=constants.BASE_PROFILE,
                                     DATASET_DIR=constants.DATASET_DIR,
                                     CONFIG_FILE_NAME=conf_file_name,
                                     NETBOX_DIR=dest_algo_dir)
    print subprocess.Popen("bash {}".format(script_file_name),
                           shell=True,
                           stdout=subprocess.PIPE,
                           cwd=dest_algo_dir).stdout.read()

    modules, all_bg_genes = extract_modules_and_bg(bg_genes, dest_algo_dir)
    os.remove(script_file_name)
    os.remove(conf_file_name)
    shutil.rmtree(dest_algo_dir)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules,
                                            all_bg_genes, score_file_name,
                                            network_file_name, disease_name,
                                            expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)
Example #14
import pandas as pd
import constants
import os

constants.update_dirs(DATASET_NAME_u="IES")
df_ge = pd.read_csv(os.path.join(constants.DATA_DIR, "ge_mouse.tsv"), sep="\t")
df_ge = df_ge.set_index("id")
df_ge = df_ge[~df_ge.index.duplicated(keep='first')]
df_mouse2human = pd.read_csv(os.path.join(constants.DICTIONARIES_DIR, "mouse2human.txt"), sep="\t")
df_mouse2human = df_mouse2human.set_index("Mouse gene stable ID")
df_mouse2human = df_mouse2human[~df_mouse2human.index.duplicated(keep='first')]
df_converted_ge = pd.concat([df_ge, df_mouse2human], join="inner", axis=1)
df_converted_ge = df_converted_ge.set_index("Gene stable ID")
df_converted_ge = df_converted_ge[~df_converted_ge.index.duplicated(keep="first")]
df_converted_ge.to_csv(os.path.join(constants.DATA_DIR, "ge.tsv"), sep="\t", index_label="id")
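A toy demonstration (made-up IDs) of the inner index-join used above: mouse genes without a human ortholog are dropped, and the result is re-indexed by the human gene ID. Duplicated indices are removed beforehand because concat along axis=1 needs unique indices.

import pandas as pd

ge = pd.DataFrame({"s1": [1.0, 2.0]}, index=["ENSMUSG01", "ENSMUSG02"])
m2h = pd.DataFrame({"Gene stable ID": ["ENSG0A"]}, index=["ENSMUSG01"])
converted = pd.concat([ge, m2h], join="inner", axis=1).set_index("Gene stable ID")
print(converted)  # only ENSMUSG01 survives, re-indexed as ENSG0A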

Example #15
    def __init__(self, dataset_names, meta_groups_files, metagroups_names):

        self.labels = np.array([])
        self.labels_unique = np.array([])
        self.samples = pd.DataFrame()
        self.survival = pd.DataFrame()
        label_counter = 0
        all_genes = np.array([])
        for dataset_name, meta_groups_file, metagroups_name in zip(dataset_names, meta_groups_files, metagroups_names):
            constants.update_dirs(DATASET_NAME_u=dataset_name)
            meta_groups = [json.load(file(meta_groups_file))]
            data_normalizaton = "rsem"
            gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name =param_builder_func(
                dataset=dataset_name, data_normalizaton=data_normalizaton)

            # gene_expression_file_name=mirna_file_name
                   
            tested_gene_list_file_name = "protein_coding.txt" # "mir_total.txt" #
            total_gene_list_file_name = "protein_coding.txt" # "mir_total.txt" # 
            filter_expression = None
            print gene_expression_file_name
            data = infra.load_integrated_ge_data(tested_gene_list_file_name=tested_gene_list_file_name,
                                                 total_gene_list_file_name=total_gene_list_file_name,
                                                 gene_expression_file_name=gene_expression_file_name,
                                                 phenotype_file_name=phenotype_file_name,
                                                 survival_file_name=survival_file_name,
                                                 var_th_index=None, meta_groups=meta_groups,
                                                 filter_expression=filter_expression)

            gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns, labels_assignment, survival_dataset = data
            all_genes=np.append(all_genes, gene_expression_top_var_headers_columns)
            if survival_dataset is not None:
                self.survival=pd.concat([self.survival, pd.DataFrame(survival_dataset[1:, 1:], index=survival_dataset[1:, 0])])

            # self.labels_unique = np.array([x['_name'] for x in meta_groups[0]])

            labels_assignment=np.array(labels_assignment)[0]
            for cur_label in np.unique(labels_assignment):
                cur_label_name=[cur["_name"] for cur in meta_groups[0] if "_label" in cur and int(cur["_label"])==cur_label]
                cur_label_name = "{}, {}".format(metagroups_name, cur_label_name[0] if len(cur_label_name) > 0 else "unknown")
                print metagroups_name
                # cur_label_name = "{}".format(cur_label_name[0] if len(cur_label_name) > 0 else "unknown")
                # print cur_label_name
                # if "unknown" in cur_label_name: continue
                df_new = pd.DataFrame(data=gene_expression_top_var[labels_assignment==cur_label], index=gene_expression_top_var_headers_rows[labels_assignment==cur_label],
                                      columns=gene_expression_top_var_headers_columns)
                self.samples = pd.concat([self.samples, df_new], axis=0)
                # print "number of nan samples: {}".format(np.sum(np.isnan(np.sum(self.samples.values, axis=0))))
                # print "shape size: {}".format(df_new.shape)
                all_genes=np.unique(np.append(all_genes, df_new.columns.values))
                # print "numnber of all genes: {}. current: {}".format(all_genes.shape[0], df_new.columns.values.shape[0])
                # print "current nan cols: {}".format(self.samples.columns[np.isnan(np.sum(self.samples.values, axis=0))])
                # print "current filtered shape: {}".format(self.samples.dropna(axis=1).shape)
                self.labels = np.append(self.labels, [cur_label_name for x in range(len(df_new.index))])
                self.labels_unique = np.append(self.labels_unique, [cur_label_name])
                label_counter+=1

        # print "all genes: {}".format(len(set(all_genes)))
        var_th_index = n_input_layer - 1  # n_input_layer is presumably defined at module level (not shown in this snippet)
        if var_th_index is not None:
            print "filtering top vars"
            self.samples=self.samples.dropna(axis=1)
            gene_expression_top_var, gene_expression_top_var_headers_rows, gene_expression_top_var_headers_columns = infra.filter_top_var_genes(
                self.samples.values.T, self.samples.index.values, self.samples.columns, var_th_index)
            self.samples = pd.DataFrame(data=gene_expression_top_var, index=gene_expression_top_var_headers_rows,
                                        columns=gene_expression_top_var_headers_columns).T


        self.samples = self.samples.divide(self.samples.max(axis=1), axis=0)
        # self.samples = self.samples / self.samples.max()
        print "total shape: {}".format(self.samples.shape)
def main(dataset, cur_json, ds_types="GDC"):
    constants.update_dirs(DATASET_NAME_u=dataset)
    meta_groups = [json.load(file("../filters/{}.json".format(cur_json)))]
    filter_expression = json.load(file("../filters/{}.json".format(cur_json)))

    gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_params(
        type=ds_types,
        dataset=constants.DATASET_NAME,
        data_normalizaton=data_normalizaton)
    gene_expression_normalized_file_name = "ge_normalized.tsv"  # gene_expression_file_name
    survival_file_name = "none"
    tested_gene_expression, h_rows, h_cols, labels_assignment, survival_dataset = infra.load_integrated_ge_data(
        "dip_bg.txt",
        "dip_bg.txt",
        gene_expression_file_name,
        survival_file_name,
        phenotype_file_name,
        gene_filter_file_name=None,
        filter_expression=filter_expression,
        meta_groups=meta_groups,
        var_th_index=None)
    file(os.path.join(constants.DATA_DIR, "classes.tsv"),
         'w+').write('\t'.join([str(x) for x in labels_assignment[0]]))
    h_cols = [x.split('.')[0] for x in h_cols]
    df_data = pd.DataFrame(index=h_rows,
                           columns=h_cols,
                           data=tested_gene_expression).T
    df_data.to_csv(os.path.join(constants.DATA_DIR, "ge.tsv"),
                   index_label="eid",
                   sep="\t")
    var_th_index = None

    if not use_algo_cache:
        run_dataset(dataset, score_method=deg_method, algos=algos)
    # exit(0)
    gene_list_file_names = []
    prs = pd.DataFrame(columns=[
        'algo', 'algo_pr', 'kmean_pr_avg', 'kmean_pr_std', 'algo_kmean_ratio',
        'top_sig_pr', "algo_top_sig_ratio", 'rand_pr_avg', 'rand_pr_std',
        "algo_rand_ratio", 'algo_pr_rank_from_rand', "num of modules",
        'num_of_genes'
    ])

    for cur_algo in algos:
        print "about to start running {}".format(cur_algo)
        if not os.path.exists(
                os.path.join(constants.OUTPUT_GLOBAL_DIR, 'pca', dataset,
                             cur_algo)):
            os.makedirs(
                os.path.join(constants.OUTPUT_GLOBAL_DIR, 'pca', dataset,
                             cur_algo))
        algo_output = json.loads(
            file(
                os.path.join(constants.OUTPUT_DIR,
                             "{}_client_output.txt".format(
                                 cur_algo))).read().split("\n")[1])
        module_i = 0
        algo_pvals = []
        df_mean = pd.DataFrame()
        gene_2_module = {}
        num_of_genes = 0
        algo_genes_flatted = []

        while True:
            module_genes_flatted = [
                x['eid'] for x in algo_output if module_i in x['modules']
            ]
            algo_genes_flatted += module_genes_flatted
            num_of_genes += len(module_genes_flatted)
            print "# of genes in module {} : {}".format(
                module_i, len(module_genes_flatted))
            for cur in module_genes_flatted:
                gene_2_module[cur] = module_i

            if len(module_genes_flatted) == 0 and module_i > 0: break
            if len(module_genes_flatted) < 4 or sum(
                    df_data.index.isin(module_genes_flatted)) == 0:
                module_i += 1
                continue
            gene_list_file_names.append(
                os.path.join(constants.LIST_DIR, cur_algo + ".txt"))
            file(gene_list_file_names[-1],
                 'w+').write("\n".join(module_genes_flatted))
            df_mean = pd.concat(
                (df_mean,
                 pd.DataFrame(zscore(
                     df_data[df_data.index.isin(module_genes_flatted)],
                     axis=1).mean(axis=0).reshape(1, len(df_data.columns)),
                              columns=df_data.columns)))

            module_i += 1

        module_i = len(df_mean.index)

        if module_i < 2:
            print "not enough modules. retrieved {}".format(module_i)
            continue

        mean_file_name = os.path.join(constants.DATA_DIR, "mean.tsv")
        df_mean.index = np.arange(df_mean.shape[0])
        df_mean.to_csv(mean_file_name, sep="\t", index_label="eid")
        index_file_name = os.path.join(constants.LIST_DIR,
                                       "{}_indices.txt".format(cur_algo))
        file(index_file_name,
             'w+').write('\n'.join([str(x) for x in df_mean.index.values]))

        algo_genes_file_name = os.path.join(
            constants.LIST_DIR, "{}_all_genes.txt".format(cur_algo))
        file(algo_genes_file_name,
             'w+').write('\n'.join([str(x) for x in algo_genes_flatted]))

        df_algo_gene_matrix = df_data[df_data.index.isin(algo_genes_flatted)]

        results = plot_detailed_pca(
            tested_gene_list_file_name="{}_indices.txt".format(cur_algo),
            total_gene_list_file_name="protein_coding.txt",
            gene_expression_file_name=mean_file_name,
            phenotype_file_name=phenotype_file_name,
            survival_file_name=survival_file_name,
            filter_expression=filter_expression,
            meta_groups=meta_groups,
            var_th_index=var_th_index,
            algo_name=cur_algo,
            plot_svm=plot_svm)

        if results is None:
            continue
        X, y, algo_pr, algo_roc = results

        print "results for mean: {}".format(algo_pr)
        algo_bg_pr_mean = 0
        algo_bg_pr_std = 0
        if KMEANS_TIMES > 1:
            all_algo_bg_pr = []
            for kmean_i in range(KMEANS_TIMES):
                _1, clusters, _2 = kmeanssample(X=df_algo_gene_matrix.values,
                                                k=module_i,
                                                metric="euclidean")

                bg_modules = [
                    df_algo_gene_matrix.index.values[clusters == cur_i]
                    for cur_i in range(module_i)
                ]
                df_mean_bg = pd.DataFrame()
                for i, module in enumerate(bg_modules):
                    print "# of genes in background module {} : {}".format(
                        i, len(module))

                    gene_list_file_names.append(
                        os.path.join(constants.LIST_DIR, cur_algo + "_bg.txt"))
                    file(gene_list_file_names[-1],
                         'w+').write("\n".join([str(x) for x in module]))
                    df_mean_bg = pd.concat(
                        (df_mean_bg,
                         pd.DataFrame(
                             zscore(df_data[df_data.index.isin(module)],
                                    axis=1).mean(axis=0).reshape(
                                        1, len(df_data.columns)),
                             columns=df_data.columns)))

                bg_mean_file_name = os.path.join(constants.DATA_DIR,
                                                 "mean_bg.tsv")
                df_mean_bg.index = np.arange(df_mean_bg.shape[0])
                df_mean_bg.to_csv(bg_mean_file_name,
                                  sep="\t",
                                  index_label="eid")
                index_file_name = os.path.join(
                    constants.LIST_DIR, "{}_bg_indices.txt".format(cur_algo))
                file(index_file_name, 'w+').write('\n'.join(
                    [str(x) for x in df_mean_bg.index.values]))

                all_genes_file_name = os.path.join(
                    constants.LIST_DIR, "{}_all_genes.txt".format(cur_algo))
                file(all_genes_file_name,
                     'w+').write('\n'.join(algo_genes_flatted))

                bg_genes = pd.read_csv(
                    os.path.join(constants.CACHE_DIR,
                                 deg_file_name.format(deg_method)),
                    sep='\t',
                    index_col=0).index.values[:len(df_mean.index)]
                bg_genes_file_name = os.path.join(
                    constants.LIST_DIR,
                    "{}_{}_bg_genes.txt".format(cur_algo, deg_method))
                file(bg_genes_file_name,
                     'w+').write('\n'.join([x.split('.')[0]
                                            for x in bg_genes]))

                X, y, algo_bg_pr, algo_bg_roc = plot_detailed_pca(
                    tested_gene_list_file_name="{}_bg_indices.txt".format(
                        cur_algo),
                    total_gene_list_file_name="protein_coding.txt",
                    gene_expression_file_name=bg_mean_file_name,
                    phenotype_file_name=phenotype_file_name,
                    survival_file_name=survival_file_name,
                    filter_expression=filter_expression,
                    meta_groups=meta_groups,
                    var_th_index=var_th_index,
                    algo_name=cur_algo,
                    plot_svm=plot_svm)

                print "results for mean: {}".format(algo_bg_pr)
                all_algo_bg_pr.append(algo_bg_pr)
            all_algo_bg_pr = np.array(all_algo_bg_pr)
            algo_bg_pr_mean = all_algo_bg_pr.mean()
            algo_bg_pr_std = all_algo_bg_pr.std()

        top_sig_pr = 0
        if TOP_SIG:
            top_sig_genes = pd.read_csv(os.path.join(constants.CACHE_DIR,
                                                     deg_file_name),
                                        sep='\t',
                                        index_col=0)
            top_sig_genes = top_sig_genes.index.values[:len(
                top_sig_genes.index) / 200]  # len(df_mean.index)
            top_sig_genes_file_name = os.path.join(
                constants.LIST_DIR,
                "{}_{}_top_sig_genes.txt".format(cur_algo, deg_method))
            file(top_sig_genes_file_name, 'w+').write('\n'.join(
                [x.split('.')[0] for x in top_sig_genes]))

            X, y, top_sig_pr, top_sig_roc = plot_detailed_pca(
                tested_gene_list_file_name=os.path.basename(
                    top_sig_genes_file_name),
                total_gene_list_file_name="protein_coding.txt",
                gene_expression_file_name=gene_expression_normalized_file_name,
                phenotype_file_name=phenotype_file_name,
                survival_file_name=survival_file_name,
                filter_expression=filter_expression,
                meta_groups=meta_groups,
                var_th_index=var_th_index,
                algo_name=cur_algo,
                plot_svm=plot_svm)
            print "results for top: {}".format(top_sig_pr)

        rand_prs_mean = 0
        rand_prs_std = 0
        rand_prs = []
        trials = 0
        if RAND_TIMES > 1:
            while trials < RAND_TIMES:
                random_set_file_name = generate_random_set(
                    random_size=len(df_mean.index),
                    meta_gene_set="dip_bg.txt")
                print "running {} iteration for {} random bg with {} genes".format(
                    trials, cur_algo, len(df_mean.index))
                results = plot_detailed_pca(
                    tested_gene_list_file_name=random_set_file_name,
                    total_gene_list_file_name="protein_coding.txt",
                    gene_expression_file_name=
                    gene_expression_normalized_file_name,
                    phenotype_file_name=phenotype_file_name,
                    survival_file_name=survival_file_name,
                    filter_expression=filter_expression,
                    meta_groups=meta_groups,
                    var_th_index=var_th_index,
                    feature_names=gene_2_module,
                    algo_name=cur_algo,
                    plot_svm=plot_svm)

                if results is None:
                    print "not enough genes retrieved. retry.."
                    continue
                X, y, rand_pr, rand_roc = results
                trials += 1
                rand_prs.append(rand_pr)
                print "results for random {}: {}".format(trials, rand_pr)
            rand_prs = np.array(rand_prs)
            rand_prs_mean = rand_prs.mean()
            rand_prs_std = rand_prs.std()

        # ratios guard against the 0 defaults left when a step is skipped
        row = {
            'algo': cur_algo,
            'algo_pr': algo_pr,
            'kmean_pr_avg': algo_bg_pr_mean,
            'kmean_pr_std': algo_bg_pr_std,
            'algo_kmean_ratio': algo_pr / algo_bg_pr_mean if algo_bg_pr_mean else 0,
            'top_sig_pr': top_sig_pr,
            "algo_top_sig_ratio": algo_pr / top_sig_pr if top_sig_pr else 0,
            'rand_pr_avg': rand_prs_mean,  # key matches the column declared above
            'rand_pr_std': rand_prs_std,
            "algo_rand_ratio": algo_pr / rand_prs_mean if rand_prs_mean else 0,
            'algo_pr_rank_from_rand': len([cur for cur in rand_prs if cur > algo_pr]),
            "num of modules": module_i,
            'num_of_genes': num_of_genes
        }
        row.update({'rand_pr' + str(i): v for i, v in enumerate(rand_prs)})
        prs = prs.append(row, ignore_index=True)

    prs = prs.set_index('algo')
    prs.to_csv(os.path.join(constants.OUTPUT_GLOBAL_DIR, 'pca',
                            constants.DATASET_NAME,
                            "pr_summary_{}_{}.tsv".format(dataset, cur_json)),
               sep='\t')
    print " algo pvals"
    print algo_pvals
Example #17
    # datasets = ["GWAS_fasting_insulin", "GWAS_2hr_glucose", "GWAS_adhd", "GWAS_alzheimers", "GWAS_anorexia",
    #             "GWAS_autism", "GWAS_beta-cell_function", "GWAS_bipolar_disorder", "GWAS_blood_pressure_systolic",
    #             "GWAS_body_mass_index", "GWAS_coronary_artery_disease", "GWAS_crohns_disease", "GWAS_cross_disorder"]
    datasets = [
        name for name in os.listdir(constants.OUTPUT_GLOBAL_DIR)
        if os.path.isdir(os.path.join(constants.OUTPUT_GLOBAL_DIR, name))
        and name.startswith("GWAS_random")
    ]  #  and not name.startswith("GWAS_random") and not name.startswith("GWAS_cancer")
    # datasets = ["TNFa_2", "MCF7_2", "SOC", "HC12", "IEM", "IES"]
    # datasets=['GWAS_schizophrenia']
    all_embs = np.array([])

    fig, ax = plt.subplots(figsize=(15, 5))
    for cur_ds in datasets:
        print "current ds: {}".format(cur_ds)
        constants.update_dirs(DATASET_NAME_u=cur_ds)
        root_path = os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                 constants.DATASET_NAME)

        all_algo_modules = {}
        for name in os.listdir(root_path):
            if os.path.isdir(os.path.join(root_path, name)) and name not in [
                    "data", "cache", "output"
            ]:
                modules_summary = pd.read_csv(os.path.join(
                    root_path, name, "modules_summary.tsv"),
                                              sep="\t")
                if len(modules_summary.index) == 0:
                    continue
                modules_summary = modules_summary.set_index("module")
                all_algo_modules[name] = np.array(modules_summary.index)
Example #18
def main(datasets, algos):

    colormap = cm.rainbow
    colorlist = [ml_colors.rgb2hex(colormap(i)) for i in
                 np.array(list(range(len(algos)))) / float(len(algos) - 1)]
    df_matrix = pd.DataFrame()
    df_summary = pd.DataFrame()
    for cur_ds in datasets:

        constants.update_dirs(DATASET_NAME_u=cur_ds)
        total_num_genes = []
        avg_num_genes = []
        std_num_genes = []
        algos_signals = []
        algo_go_sims = []

        for i_algo, cur_algo in enumerate(algos):
            print "current aggregation: {}, {}".format(cur_ds,cur_algo)
            try:
                total_num_genes.append(pd.read_csv(
                    os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, cur_algo, "all_modules_general.tsv"),
                    sep="\t")["total_num_genes"][0])
                avg_num_genes.append(pd.read_csv(
                    os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, cur_algo,
                                 "modules_summary.tsv"),
                    sep="\t")["#_genes"].mean())
                std_num_genes.append(pd.read_csv(
                    os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, cur_algo,
                                 "modules_summary.tsv"),
                    sep="\t")["#_genes"].std())
            except Exception:
                print "no genes were found for: {}, {}".format(cur_ds, cur_algo)
                total_num_genes.append(0)
                avg_num_genes.append(0)  # keep the per-algo lists aligned for the zip below
                std_num_genes.append(0)
            algos_signals.append(float(file(os.path.join(constants.OUTPUT_GLOBAL_DIR, "emp_fdr", "ds_2_alg_scores",
                              "{}_{}_{}".format(cur_ds, cur_algo, "n_sig.txt"))).read()))
            algo_go_sims.append(float(file(os.path.join(constants.OUTPUT_GLOBAL_DIR, "emp_fdr", "ds_2_alg_scores",
                                                       "{}_{}_{}".format(cur_ds, cur_algo, "var.txt"))).read()))

        fig, ax = plt.subplots(figsize=(10, 10))

        print "all data: \n{}\n{}\n{}\n{}".format(algos_signals, algo_go_sims, algos, total_num_genes)
        for h, s, c, a, gene_size, module_mean, module_std in zip(algos_signals, algo_go_sims, colorlist, algos,
                                         total_num_genes, avg_num_genes, std_num_genes):  # [0 for x in range(len(algo_go_sim_score))]
            print (h, s)
            ax.scatter(h, s, s=(50 + 2000 * (float(gene_size) / (1+np.max(total_num_genes)))),
                       c=c, cmap='jet', label=a)
            df_series=pd.Series({"algo": a, "dataset": cur_ds, "sig_terms": h,
                       "sig_terms_rank": pd.Series(np.array(algos_signals)).rank(ascending=0).values[
                           np.where(np.array(algos_signals) == h)[0][0]], "variability": s,
                       "variability_rank": pd.Series(np.array(algo_go_sims)).rank(ascending=0).values[
                           np.where((np.array(algo_go_sims)) == s)[0][0]], "n_genes": gene_size, "module_size_mean": module_mean, "module_size_std": module_std})
            df_series.name = "{}_{}".format(cur_ds, a)
            df_summary=df_summary.append(df_series)
            df_matrix.loc[a, cur_ds]=h
            colorlist = [ml_colors.rgb2hex(colormap(i)) for i in
                         np.array(list(range(len(algos)))) / float(len(algos) - 1)]
            patches = [Line2D([0], [0], marker='o', color='gray', label=a,
                              markerfacecolor=c) for i, a, c in zip(list(range(len(algos))), algos, colorlist)]
            ax.set_xlabel("# GO terms (-log10(qval)) above threshold")
            ax.set_ylabel("GO terms variability")
            ax.legend(handles=patches)
            ax.grid(True)
        plt.savefig(os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                 "hs_plot_terms_signal_algo_{}.png".format(constants.DATASET_NAME)))
    return df_summary, df_matrix
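The color assignment above maps algorithm index i to position i/(n-1) on the rainbow colormap, which produces NaN (with a runtime warning) when a single algorithm is passed. A guarded sketch of the same mapping (the helper name is illustrative):

import numpy as np
from matplotlib import cm
from matplotlib import colors as ml_colors

def algo_colors(n_algos):
    # np.linspace handles n_algos == 1 gracefully (it returns [0.0])
    return [ml_colors.rgb2hex(cm.rainbow(p))
            for p in np.linspace(0.0, 1.0, num=n_algos)]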
Example #19
        else:
            small_modules += 1
    return all_bg_genes, modules

def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes = None, score_method=constants.DEG_EDGER, network_file_name="dip.sif", fdr=0.05):
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(network_file_name , score_method)

    all_bg_genes, modules = run_bionet_for_all_modules(fdr, network_file_name, score_file_name, constants.IS_PVAL_SCORES)

    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules, all_bg_genes, score_file_name, network_file_name, disease_name, expected_genes)

    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)




if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="GE_ERS_1")
    main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes = None, score_method=constants.DEG_EDGER, network_file_name="dip.sif")






Example #20
def main(dataset_name):
    global df_go_metadata, all_hg_score_modules, df_hg_output
    colormap = cm.rainbow
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    GO_RANK_CUTOFF = 150
    ##########################
    if TERMS_SIMILARITY_TO_NUM_OF_TERMS:
        ontology_type = 'GeneOntology'
        ignore_parameters = {'ignore': {}}
        source_type = 'obo'
        source = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)

        print "\n######################"
        print "# Loading ontology... #"
        print "######################\n"

        ontology = ontologies.load(source=source,
                                   source_type=source_type,
                                   ontology_type=ontology_type,
                                   parameters=ignore_parameters)

        print "\n######################"
        print "# Loading Annotation Corpus... #"
        print "######################\n"
        ac = AnnotationCorpus.AnnotationCorpus(ontology)
        ac.parse(os.path.join(constants.GO_DIR, "goa_human.gaf"), "gaf-2.0")
        ac.isConsistent()

        print "\n#################################"
        print "# Annotation corpus successfully loaded."
        print "#################################\n"

        semsim = GSESAMESemSim(ontology, ac)  # maxSemSim(ontology, ac) #
    #################
    if ENABLE_GO_GRAPH:
        dict_result, go2geneids, geneids2go, entrez2ensembl = utils.go_hierarcies.build_hierarcy(
            roots=['GO:0008150', 'GO:0005575', 'GO:0003674'])
    #################
    all_homogeneity = []
    all_separability = []
    agg_homogeneity = []
    agg_separability = []
    algo_go_sim_score = []
    colors = []
    df_all_hg_pval = pd.DataFrame()
    df_go_metadata = pd.DataFrame()
    all_hg_score_labels = []
    all_hg_score_modules = []
    labels_by_sample = []
    total_num_genes = []
    algos = [
        "keypathwayminer_INES_GREEDY", "netbox", "hotnet2",
        "jactivemodules_greedy", "bionet", "jactivemodules_sa"
    ]  # "matisse", "reactomefi"  # "keypathwayminer_INES_OPTIMAL", "keypathwayminer_INES_ACO"
    algos_signals = []
    modules_signals = []
    df_all_hg_qval = pd.DataFrame()
    df_module2best_rows = []
    df_module2avg_rows = []
    df_algo2best_rows = []
    df_algo2avg_rows = []
    for i_algo, ALGO_NAME in enumerate(algos):
        df_all_hg_qval = pd.DataFrame()
        print "current algo: {}".format(ALGO_NAME)

        go2modules = {}
        modules2go = {}
        homogeneity = []
        separability = []
        all_go_terms = []

        try:
            total_num_genes.append(
                pd.read_csv(os.path.join(constants.OUTPUT_GLOBAL_DIR,
                                         constants.DATASET_NAME, ALGO_NAME,
                                         "all_modules_general.tsv"),
                            sep="\t")["total_num_genes"][0])
        except pd.errors.EmptyDataError:
            total_num_genes.append(0)
        algo2best_go_ratio = 0
        algo2avg_go_ratio = 0
        module2best_go_ratio = []
        module2avg_go_ratio = []
        df_modules_summary = pd.read_csv(os.path.join(
            constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, ALGO_NAME,
            "modules_summary.tsv"),
                                         sep='\t')
        i = -1
        for i in range(len(df_modules_summary.index)):
            hg_file_name = os.path.join(
                constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME, ALGO_NAME,
                "module_{}_separated_modules_hg_samples.tsv".format(i))
            print "reading module: {} from file".format(i)
            if os.path.getsize(hg_file_name) < 2:
                modules2go[i] = np.array([])
                modules_signals.append(0)

            else:
                df_hg_output = pd.read_csv(hg_file_name, sep="\t")
                df_hg_output.index = df_hg_output["GO id"]
                df_go_metadata = pd.concat(
                    [df_go_metadata, df_hg_output[["GO id", "GO name"]]],
                    axis=0)
                df_go_metadata = df_go_metadata[~df_go_metadata.index.
                                                duplicated(keep='first')]
                df_all_hg_pval = pd.concat([
                    df_all_hg_pval,
                    df_hg_output["pval"].apply(lambda x: -np.log10(x))
                ],
                                           join='outer',
                                           axis=1)
                df_all_hg_qval = pd.concat([
                    df_all_hg_qval,
                    df_hg_output["qval"].apply(lambda x: -np.log10(x))
                ],
                                           join='outer',
                                           axis=1)
                df_hg_output = df_hg_output.iloc[:min(len(df_hg_output.index),
                                                      GO_RANK_CUTOFF), :]
                df_hg_output = df_hg_output[
                    df_hg_output["qval"] <=
                    QVAL_TH]  # .iloc[:min(len(df_hg_output.index),5),:]  #

                score = df_hg_output['value'].iloc[0] / float(
                    df_modules_summary.loc[i, "#_genes"]) if len(
                        df_hg_output.index) > 0 else 0
                df_module2best_rows.append({
                    'name': "{}_{}".format(ALGO_NAME, i),
                    'algo': ALGO_NAME,
                    'module': i,
                    'score': score,
                    'num_of_genes': df_modules_summary.loc[i, "#_genes"]
                })
                algo2best_go_ratio += score
                score = (df_hg_output['value'] /
                         float(df_modules_summary.loc[i, "#_genes"])
                         if len(df_hg_output.index) > 0 else np.array(
                             [0])).mean()
                df_module2avg_rows.append({
                    'name':
                    "{}_{}".format(ALGO_NAME, i),
                    'algo':
                    ALGO_NAME,
                    'module':
                    i,
                    'score':
                    score,
                    'num_of_genes':
                    df_modules_summary.loc[i, "#_genes"]
                })
                algo2avg_go_ratio += score
                modules_signals.append(len(df_hg_output.index))
                all_hg_score_modules.append(i)
                all_hg_score_labels.append(i_algo)
                for x in df_hg_output["GO id"]:
                    if x in go2modules:
                        go2modules[x].append(i)
                    else:
                        go2modules[x] = [i]

                modules2go[str(i)] = df_hg_output["GO id"].values
                all_go_terms = np.append(all_go_terms,
                                         df_hg_output["GO id"].values)

        i += 1
        if RATIO_TO_GO_TERM:
            df_algo2best_rows.append({
                'name': '{}_total_avg'.format(ALGO_NAME),
                'score': algo2best_go_ratio / max(i, 1)
            })
            df_algo2avg_rows.append({
                'name': '{}_total_avg'.format(ALGO_NAME),
                'score': algo2avg_go_ratio / max(i, 1)
            })

        df_all_hg_pval[pd.isna(df_all_hg_pval)] = 0
        df_all_hg_qval[pd.isna(df_all_hg_qval)] = 0
        max_per_go_term = 0
        if df_all_hg_qval.values.size > 0:
            max_per_go_term = np.sum(
                np.max(df_all_hg_qval.values, axis=1) >= -np.log10(QVAL_TH))
        algos_signals.append(max_per_go_term)
        all_go_terms = list(np.unique(all_go_terms))

        print "added signal : {}".format(algos_signals[-1])
        print "all_go_terms : {}".format(len(all_go_terms))

        if ENABLE_GO_GRAPH:
            plot_go_tree(dict_result, all_go_terms, ALGO_NAME, i)

        if ENABLE_GO_GRAPH and IS_GO_GRAPH_ONLY:
            continue

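        # Pairwise semantic-similarity matrix over all enriched GO terms;
        # -2 marks "not yet computed", -1 marks an undefined (NaN) similarity.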
        adj = np.ones((len(all_go_terms), len(all_go_terms))) * (-2)

        if TERMS_SIMILARITY_TO_NUM_OF_TERMS:
            for i_x, x in enumerate(all_go_terms):
                print "calc distance between terms {}/ {}".format(
                    i_x, len(all_go_terms))
                for i_y, y in enumerate(all_go_terms):
                    if adj[i_x, i_y] != -2: continue
                    adj[i_x, i_y] = semsim.SemSim(x, y)  # alternative: ResnikSemSim(ontology, ac)
                    if np.isnan(adj[i_x, i_y]):
                        adj[i_x, i_y] = -1
                    adj[i_y, i_x] = adj[i_x, i_y]

            mean_sim = np.sum(adj[adj != -1]) / (np.size(adj) - np.sum(adj == -1))
            algo_go_sim_score.append(1 if np.isnan(mean_sim) else mean_sim)

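            # Homogeneity: average similarity among a module's own GO terms.
            # Separability: average similarity between the module's terms and
            # the GO terms outside it.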
            for k, v in sorted([(int(k), v)
                                for k, v in modules2go.iteritems()],
                               key=lambda x: x[0]):
                print "calc homogeneity and separability for module: {}".format(k)
                v_filtered = list(v)  # optionally: [x for x in v if x in G.nodes]
                labels_by_sample.append(ALGO_NAME)
                homogeneity.append(
                    np.nan_to_num(
                        np.sum([
                            adj[all_go_terms.index(x),
                                all_go_terms.index(y)] for x in v for y in v
                            if adj[all_go_terms.index(x),
                                   all_go_terms.index(y)] != -1 and x != y
                        ]) / (len(v_filtered) * (len(v_filtered) - 1))))
                separability.append(
                    np.nan_to_num(
                        np.sum([
                            adj[all_go_terms.index(x),
                                all_go_terms.index(y)] for x in v
                            for y in all_go_terms
                            if y not in v and adj[all_go_terms.index(x),
                                                  all_go_terms.index(y)] != -1
                        ]) / (len(v_filtered) *
                              (len(all_go_terms) - len(v_filtered)))))

            all_separability += separability
            all_homogeneity += homogeneity

            agg_separability.append(np.average(separability))
            agg_homogeneity.append(np.average(homogeneity))

            fig, ax = plt.subplots(figsize=(15, 15))

            ax.scatter(homogeneity, separability, label=ALGO_NAME)
            ax.legend()
            ax.set_xlabel("Intra-Similarity")
            ax.set_ylabel("Inter-Similarity")
            ax.grid(True)
            colors = colors + [
                float(i_algo) / len(algos) for x in range(len(modules2go))
            ]

            for module_i in range(i):
                ax.annotate(str(module_i),
                            (homogeneity[module_i], separability[module_i]))
            plt.savefig(
                os.path.join(
                    constants.OUTPUT_GLOBAL_DIR,
                    "hs_plot_{}_{}.png".format(ALGO_NAME,
                                               constants.DATASET_NAME)))
    if RATIO_TO_GO_TERM:
        pd.DataFrame(df_module2best_rows).set_index('name').to_csv(
            os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME,
                         "GO_terms_per_module_ratio_best.tsv"), sep='\t')
        pd.DataFrame(df_module2avg_rows).set_index('name').to_csv(
            os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME,
                         "GO_terms_per_module_ratio_avg.tsv"), sep='\t')
        pd.DataFrame(df_algo2best_rows).set_index('name').to_csv(
            os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME,
                         "GO_terms_ratio_best.tsv"), sep='\t')
        pd.DataFrame(df_algo2avg_rows).set_index('name').to_csv(
            os.path.join(constants.OUTPUT_GLOBAL_DIR, constants.DATASET_NAME,
                         "GO_terms_ratio_avg.tsv"), sep='\t')
    if TERMS_SIMILARITY_TO_NUM_OF_TERMS:
        fig, ax = plt.subplots(figsize=(15, 15))
        colorlist = [ml_colors.rgb2hex(colormap(i)) for i in colors]
        for h, s, c in zip(all_homogeneity, all_separability, colorlist):
            ax.scatter(h, s, s=50, c=c, vmin=0, vmax=1, cmap='jet')
        algo_colorlist = [
            ml_colors.rgb2hex(colormap(i))
            for i in np.array(list(range(len(algos)))) / float(len(algos) - 1)
        ]
        patches = [
            Line2D([0], [0], marker='o', color='gray', label=a,
                   markerfacecolor=c)
            for a, c in zip(algos, algo_colorlist)
        ]
        ax.legend(handles=patches)
        ax.set_xlabel("Intra-Similarity")
        ax.set_ylabel("Inter-Similarity")
        ax.grid(True)
        plt.savefig(
            os.path.join(constants.OUTPUT_GLOBAL_DIR,
                         "hs_plot_all_{}.png".format(constants.DATASET_NAME)))

        fig, ax = plt.subplots(figsize=(10, 10))
        colorlist = [ml_colors.rgb2hex(colormap(i)) for i in colors]
        for h, s, c in zip(modules_signals, all_separability, colorlist):
            ax.scatter(h, s, s=50, c=c, vmin=0, vmax=1, cmap='jet')
        patches = [
            Line2D([0], [0], marker='o', color='gray', label=a,
                   markerfacecolor=c)
            for a, c in zip(algos, algo_colorlist)
        ]
        ax.legend(handles=patches)
        ax.set_xlabel("# GO terms (-log10(qval)) above threshold")
        ax.set_ylabel("Algorithm Inter-Similarity")
        ax.grid(True)
        plt.savefig(
            os.path.join(
                constants.OUTPUT_GLOBAL_DIR,
                "hs_plot_signal_all_{}.png".format(constants.DATASET_NAME)))

        colorlist = [
            ml_colors.rgb2hex(colormap(i))
            for i in np.array(list(range(len(agg_homogeneity)))) /
            float(len(agg_homogeneity) - 1)
        ]
        fig, ax = plt.subplots(figsize=(10, 10))
        for h, s, c, a in zip(agg_homogeneity, agg_separability, colorlist,
                              algos):
            ax.scatter(h, s, s=50, c=c, cmap='jet', label=a)
        ax.set_xlabel("Intra-Similarity")
        ax.set_ylabel("Inter-Similarity")
        ax.legend()
        ax.grid(True)

        for algo_i, algo_name in enumerate(algos):
            ax.annotate(
                str(algo_name),
                (agg_homogeneity[algo_i], agg_separability[algo_i]))
        plt.savefig(
            os.path.join(constants.OUTPUT_GLOBAL_DIR,
                         "hs_plot_agg_{}.png".format(constants.DATASET_NAME)))
        # colorlist = [ml_colors.rgb2hex(colormap(i)) for i in algo_go_sim_score/np.max(algo_go_sim_score)]

        colorlist = [
            ml_colors.rgb2hex(colormap(i))
            for i in np.array(list(range(len(algos)))) / float(len(algos) - 1)
        ]

        fig, ax = plt.subplots(figsize=(10, 10))
        for h, s, c, a, gene_size in zip(algos_signals, algo_go_sim_score,
                                         colorlist, algos, total_num_genes):
            print(h, s)
            # Marker size scales with the algorithm's total number of genes.
            ax.scatter(h, s,
                       s=(50 + 2000 *
                          (float(gene_size) / np.max(total_num_genes))),
                       c=c, cmap='jet', label=a)
        patches = [
            Line2D([0], [0], marker='o', color='gray', label=a,
                   markerfacecolor=c)
            for a, c in zip(algos, colorlist)
        ]
        ax.set_xlabel("# GO terms (-log10(qval)) above threshold")
        ax.set_ylabel("Algorithm Inter-Similarity")
        ax.legend(handles=patches)
        ax.grid(True)
        plt.savefig(
            os.path.join(
                constants.OUTPUT_GLOBAL_DIR,
                "hs_plot_terms_signal_algo_{}.png".format(
                    constants.DATASET_NAME)))
    if GO_PCA:
        plot_pca(all_hg_score_labels, df_all_hg_pval, ml_colors, algos)
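
# For reference, a compact sketch of the homogeneity/separability computation
# used above, assuming `adj` is a symmetric GO-term similarity matrix in which
# -1 marks undefined pairs; function and argument names here are illustrative,
# not part of the original code.
import numpy as np

def module_homogeneity(adj, members):
    # Sum of defined similarities among the module's own terms, divided by the
    # total number of ordered pairs (the convention used above).
    vals = [adj[x, y] for x in members for y in members
            if x != y and adj[x, y] != -1]
    n = len(members)
    return float(np.sum(vals)) / (n * (n - 1)) if n > 1 else 0.0

def module_separability(adj, members, all_terms):
    # Sum of defined similarities between the module's terms and all other
    # terms, divided by the total number of cross pairs.
    outside = [y for y in all_terms if y not in members]
    if not members or not outside:
        return 0.0
    vals = [adj[x, y] for x in members for y in outside if adj[x, y] != -1]
    return float(np.sum(vals)) / (len(members) * len(outside))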
Exemple #21
0
        abs_nw_genes = len(disease_nw_genes)
        sys.stdout.write("ratio={}, absolute n={}\n".format(
            total_to_nw_genes_ratio, abs_nw_genes))
        if total_to_nw_genes_ratio < TOTAL_TO_NW_RATIO or abs_nw_genes < ABS_NW_GENES:
            sys.stdout.write("disease genes in network is too small\n")
        else:
            print "about to start analyze disease: {}".format(cur_disease_name)
            disease_counter += 1
            dataset_name = "_".join(["DISGENET", ts, str(disease_counter)])
            create_ds_folders(dataset_name)
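            # Hold out a random half of the in-network disease genes: the
            # selected half is used to generate the expression data, and the
            # unselected half is passed on to run_dataset (presumably as the
            # expected/recovery gene set).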
            disease_nw_genes_selected = random.sample(
                set(disease_nw_genes),
                len(set(disease_nw_genes)) / 2)
            disease_nw_genes_unselected = set(disease_nw_genes) - set(
                disease_nw_genes_selected)
            create_ge(disease_nw_genes_selected, bg_genes, dataset_name)
            run_dataset(dataset_name, disease_nw_genes_unselected,
                        cur_disease_name, constants.DEG_EDGER)

        if disease_counter == MAX_DISEASES:
            break

    aggregate_reports.aggregate_disease("_".join(["DISGENET", ts]))
    sys.stdout.write("total tested diseases: {}\n".format(disease_counter))


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="DISGENET")
    main()
    # aggregate_reports.aggregate_disease("DISGENET_1542711137.18")
Exemple #22
0
    print subprocess.Popen(
        "bash ../sh/scripts/prepare_hotnet2.sh.format",
        shell=True,
        stdout=subprocess.PIPE).stdout.read()  # cwd=dir_path


def run_hotnet2(deg_file_name, network_file_name):
    script = file("scripts/bionet.r").read()
    return run_rscript(script=script,
                       output_vars=["module_genes", "bg_genes"],
                       network_file_name=network_file_name,
                       deg_file_name=deg_file_name)


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="TNFa")
    params = get_parameters()
    if params is not None:
        args, NETWORK_NAME, dataset_name = params

    print subprocess.Popen(
        "bash ../sh/scripts/run_pinnaclez.sh.format",
        shell=True,
        stdout=subprocess.PIPE).stdout.read()  # cwd=dir_path

    results = file(os.path.join(constants.OUTPUT_DIR,
                                "pinnaclez_results.txt")).read().split()
    module_genes = list(set([x for x in results if x.startswith("ENSG")]))
    dip_network = pd.read_csv(os.path.join(constants.NETWORKS_DIR,
                                           "dip_out.sif"),
                              sep="\t",
Exemple #23
0
from infra import *
import constants

constants.update_dirs(CANCER_TYPE_u="SKCM")

tcga = np.array(load_phenotype_data("SKCM_clinicalMatrix"))

gdc = np.array(load_phenotype_data("TCGA-SKCM.GDC_phenotype.tsv"))

old = np.array(load_phenotype_data("SKCM_clinicalMatrix.txt"))

integ = []
integ.append("\t".join(
    list(gdc[0]) + ["tcga_{}".format(x) for x in tcga[0][1:]] +
    ["old_{}".format(x) for x in old[0][1:]]))
for cur_gdc in gdc[1:]:
    row = ""
    row += "\t".join(cur_gdc)
    additional = []
    for cur_tcga in tcga[1:]:
        if cur_tcga[0] in cur_gdc[0]:
            additional = cur_tcga[1:]
            break
    additional = "\t".join(
        additional + ["" for x in range((len(tcga[0][1:]) - len(additional)))])
    row += "\t" + additional
    additional = []
    for cur_old in old[1:]:
        if cur_old[15] in cur_gdc[0]:
            additional = cur_old[1:]
            break
Exemple #24
0
    write.table(data.frame(cel=rownames(pheno), pheno), row.names=F, quote=F, sep="\t", file="bladder-pheno.txt")
    edata = exprs(bladderEset)
    write.table(edata, row.names=T, quote=F, sep="\t", file="bladder-expr.txt")
    # use a dataframe instead of a matrix
    mod = model.matrix(~as.factor(cancer) + age, data=pheno)
    t = Sys.time()
    cdata = ComBat(dat=edata, batch=as.factor(pheno$batch), mod=mod, numCov=match("age", colnames(mod)))
    print(Sys.time() - t)
    print(cdata[1:5, 1:5])
    write.table(cdata, row.names=T, quote=F, sep="\t", file="r-batch.txt")
    """

    # for dataset in ["LUSC", "SKCM", "MESO", "OV", "PCPG", "PRAD", "READ", "SARC", "TGCT", "THYM", "THCA", "UCS"]:
    for dataset in ["PANCAN"]:
        print "current dataset: {}".format(dataset)
        constants.update_dirs(CANCER_TYPE_u=dataset)
        data_normalizaton = "fpkm"
        gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params(
            dataset=dataset, data_normalizaton=data_normalizaton)


        pheno = pd.read_table(phenotype_file_name, index_col=0)
        dat = pd.read_table(gene_expression_file_name, index_col=0, dtype="str")
        print "done load"
        dat = dat.astype(np.float)
        print "done conversion"
        dat = dat.loc[~(dat==0).all(axis=1)]
        pheno = pheno[pheno.batch_number.notnull()]


def main(dataset="BRCA"):
    constants.update_dirs(DATASET_NAME_u=dataset)
    data_normalizaton = "counts_normalized_by_genes_standardization"
    cur_json = "brca_pam53"
    meta_groups = [json.load(file("../groups/{}.json".format(cur_json)))]
    filter_expression = json.load(file("../filters/{}.json".format(cur_json)))

    gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params(
        dataset=dataset, data_normalizaton=data_normalizaton)
    phenotype_file_name = 'BRCA_clinicalMatrix'
    tested_gene_expression, h_rows, h_cols, labels_assignment, survival_dataset = infra.load_integrated_ge_data(
        "dip_bg.txt",
        "dip_bg.txt",
        gene_expression_file_name,
        survival_file_name,
        phenotype_file_name,
        gene_filter_file_name=None,
        filter_expression=filter_expression,
        meta_groups=meta_groups,
        var_th_index=None)
    file(os.path.join(constants.DATA_DIR, "classes.tsv"),
         'w+').write('\t'.join([str(x) for x in labels_assignment[0]]))
    h_cols = [x.split('.')[0] for x in h_cols]
    df_data = pd.DataFrame(index=h_rows,
                           columns=h_cols,
                           data=tested_gene_expression).T
    df_data.to_csv(os.path.join(constants.DATA_DIR, 'ge.tsv'),
                   index_label="eid",
                   sep="\t")
    var_th_index = None
    start_k = 2
    end_k = 2

    # algos = ["matisse", "keypathwayminer_INES_GREEDY", "netbox", "hotnet2", "bionet", "jactivemodules_greedy",
    #          "jactivemodules_sa", "reactomefi"]
    algos = ["netbox"]
    run_dataset(dataset, score_method=constants.DEG_EDGER)
    gene_list_file_names = []
    generate_plot = True
    clustering_algorithm = "correlation"
    for cur_algo in algos:
        algo_output = json.loads(
            file(
                os.path.join(constants.OUTPUT_DIR,
                             "{}_client_output.txt".format(
                                 cur_algo))).read().split("\n")[1])
        i = 0
        algo_pvals = []
        random_pvals = []
        df_mean = pd.DataFrame()
        all_algo_genes_flatted = []
        gene_2_module = {}
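        # Walk module indices until an empty module marks the end of the list;
        # modules with fewer than 4 genes are skipped.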
        while True:
            algo_genes_flatted = [
                x['eid'] for x in algo_output if i in x['modules']
            ]
            for cur in algo_genes_flatted:
                gene_2_module[cur] = i
            all_algo_genes_flatted += algo_genes_flatted
            if len(algo_genes_flatted) == 0 and i > 0: break
            if len(algo_genes_flatted) < 4:
                i += 1
                continue
            gene_list_file_names.append(
                os.path.join(constants.LIST_DIR, cur_algo + ".txt"))
            file(gene_list_file_names[-1],
                 'w+').write("\n".join(algo_genes_flatted))
            df_mean = pd.concat((df_mean, df_data[df_data.index.isin(
                algo_genes_flatted)].mean().to_frame().T))

            i += 1

        all_genes_file_name = os.path.join(constants.LIST_DIR,
                                           "{}_all_genes.txt".format(cur_algo))
        file(all_genes_file_name,
             'w+').write('\n'.join(all_algo_genes_flatted))

        mean_file_name = os.path.join(constants.DATA_DIR, "mean.tsv")
        df_mean.index = np.arange(df_mean.shape[0])
        df_mean.to_csv(mean_file_name, sep="\t", index_label="eid")
        index_file_name = os.path.join(constants.LIST_DIR,
                                       "{}_indices.txt".format(cur_algo))
        file(index_file_name,
             'w+').write('\n'.join([str(x) for x in df_mean.index.values]))

        algo_pvals.append(
            find_clusters_and_survival(
                tested_gene_list_file_name="{}_indices.txt".format(cur_algo),
                total_gene_list_file_name="protein_coding.txt",
                gene_expression_file_name=mean_file_name,
                phenotype_file_name=phenotype_file_name,
                survival_file_name=survival_file_name,
                var_th_index=var_th_index,
                is_unsupervised=True,
                start_k=start_k,
                end_k=end_k,
                filter_expression=filter_expression,
                meta_groups=meta_groups,
                clustering_algorithm=clustering_algorithm,
                plot=generate_plot))

        for cur in range(RAND_TIMES):
            random_set_file_name = generate_random_set(
                random_size=len(df_mean.index),
                meta_gene_set="{}_all_genes.txt".format(cur_algo))

            random_pvals.append(
                find_clusters_and_survival(
                    tested_gene_list_file_name=random_set_file_name,
                    total_gene_list_file_name="dip_bg.txt",
                    gene_expression_file_name=gene_expression_file_name,
                    phenotype_file_name=phenotype_file_name,
                    survival_file_name=survival_file_name,
                    var_th_index=var_th_index,
                    is_unsupervised=True,
                    start_k=start_k,
                    end_k=end_k,
                    filter_expression=filter_expression,
                    meta_groups=meta_groups,
                    clustering_algorithm=clustering_algorithm,
                    plot=generate_plot))

        print " algo pvals"
        print algo_pvals
        print "# above TH: {}".format(
            len([x for x in algo_pvals if any(y < 0.001 for y in x)]))
        print " random pvals"
        print random_pvals
        print "# above TH: {}".format(
            len([x for x in random_pvals if any(y < 0.001 for y in x)]))
        print "# of modules better over random: {}/{}".format(
            len([
                x for x1, x2 in zip(algo_pvals, random_pvals) if x1[0] < x2[0]
            ]), len(algo_pvals))
Exemple #26
0
    conf_file_name = init_specific_params(score_file_name, dest_algo_dir)
    script_file_name = format_script(os.path.join(constants.SH_DIR,
                                                  script_name),
                                     BASE_FOLDER=constants.BASE_PROFILE,
                                     DATASET_DIR=constants.DATASET_DIR,
                                     CONFIG_FILE_NAME=conf_file_name,
                                     NETBOX_DIR=dest_algo_dir)
    print subprocess.Popen("bash {}".format(script_file_name),
                           shell=True,
                           stdout=subprocess.PIPE,
                           cwd=dest_algo_dir).stdout.read()

    modules, all_bg_genes = extract_modules_and_bg(bg_genes, dest_algo_dir)
    os.remove(script_file_name)
    os.remove(conf_file_name)
    shutil.rmtree(dest_algo_dir)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, dataset_name, modules,
                                            all_bg_genes, score_file_name,
                                            network_file_name, disease_name,
                                            expected_genes)
    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    # output_modules(output_file_name, modules, score_file_name, output_base_dir )


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="MCF7_2")
    main()
Exemple #27
0
    write.table(data.frame(cel=rownames(pheno), pheno), row.names=F, quote=F, sep="\t", file="bladder-pheno.txt")
    edata = exprs(bladderEset)
    write.table(edata, row.names=T, quote=F, sep="\t", file="bladder-expr.txt")
    # use a dataframe instead of a matrix
    mod = model.matrix(~as.factor(cancer) + age, data=pheno)
    t = Sys.time()
    cdata = ComBat(dat=edata, batch=as.factor(pheno$batch), mod=mod, numCov=match("age", colnames(mod)))
    print(Sys.time() - t)
    print(cdata[1:5, 1:5])
    write.table(cdata, row.names=T, quote=F, sep="\t", file="r-batch.txt")
    """

    # for dataset in ["LUSC", "SKCM", "MESO", "OV", "PCPG", "PRAD", "READ", "SARC", "TGCT", "THYM", "THCA", "UCS"]:
    for dataset in ["PANCAN"]:
        print "current dataset: {}".format(dataset)
        constants.update_dirs(DATASET_NAME_u=dataset)
        data_normalizaton = "fpkm"
        gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params(
            dataset=dataset, data_normalizaton=data_normalizaton)

        pheno = pd.read_table(os.path.join(constants.DATA_DIR,
                                           phenotype_file_name),
                              index_col=0)
        dat = pd.read_table(os.path.join(constants.DATA_DIR,
                                         gene_expression_file_name),
                            index_col=0,
                            dtype="str")
        print "done load"
        dat = dat.astype(np.float)
        print "done conversion"
        dat = dat.loc[~(dat == 0).all(axis=1)]
Exemple #28
0
import constants
import infra
import pandas as pd
import numpy as np
import os

constants.update_dirs(DATASET_NAME_u="PRAD_2")

#### PREPARE DICT ###

company = "illumina"  # agilent

# old_key_field = "AGILENT WholeGenome 4x44k v1 probe"
# new_field = "agilent_44_v1"

old_key_field = "ILLUMINA HumanWG 6 V3 probe"
new_key_field = "illumina_WG_v3"
old_value_field = "Gene stable ID"
new_value_field = "eid"

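# Re-key the BioMart export by probe ID, drop duplicates, and rename the
# Ensembl gene column so the table can serve as a probe-to-gene dictionary.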
df_dict = pd.read_csv(os.path.join(constants.DICTIONARIES_DIR,
                                   "{}_ensembl_biomart.tsv".format(company)),
                      sep='\t')
df_dict = df_dict.dropna()
df_dict.index = df_dict[old_key_field]
df_dict = df_dict.drop([old_key_field], axis=1)
df_dict = df_dict.rename(columns={old_value_field: new_value_field})
df_dict = df_dict[~df_dict.index.duplicated(keep='first')]
df_dict.to_csv(os.path.join(constants.DICTIONARIES_DIR,
                            "{}_ensembl.tsv".format(company)),
               sep='\t')


def main(dataset="COMB"):
    constants.update_dirs(DATASET_NAME_u=dataset)
    data_normalizaton = "fpkm_bc"
    cur_json = "cancer_types"
    meta_groups = [json.load(file("../groups/{}.json".format(cur_json)))]
    filter_expression = json.load(file("../filters/{}.json".format(cur_json)))

    gene_expression_file_name, phenotype_file_name, survival_file_name, mutation_file_name, mirna_file_name, pval_preprocessing_file_name = build_gdc_params(
        dataset=dataset, data_normalizaton=data_normalizaton)
    tested_gene_expression, h_rows, h_cols, labels_assignment, survival_dataset = infra.load_integrated_ge_data(
        "dip_bg.txt",
        "dip_bg.txt",
        gene_expression_file_name,
        survival_file_name,
        phenotype_file_name,
        gene_filter_file_name=None,
        filter_expression=filter_expression,
        meta_groups=meta_groups,
        var_th_index=None)
    h_cols = [x.split('.')[0] for x in h_cols]
    pd.DataFrame(index=h_rows, columns=h_cols,
                 data=tested_gene_expression).T.to_csv(
                     os.path.join(constants.DATA_DIR, 'ge.tsv'),
                     index_label="eid", sep="\t")
    var_th_index = None
    start_k = 2
    end_k = 2

    # algos = ["matisse", "keypathwayminer_INES_GREEDY", "netbox", "hotnet2", "bionet", "jactivemodules_greedy",
    #          "jactivemodules_sa", "reactomefi"]
    algos = ["matisse"]
    run_dataset(dataset, score_method=constants.DEG_EDGER)
    gene_list_file_names = []
    generate_plot = True
    clustering_algorithm = "correlation"
    for cur_algo in algos:
        algo_output = json.loads(
            file(
                os.path.join(constants.OUTPUT_DIR,
                             "{}_client_output.txt".format(
                                 cur_algo))).read().split("\n")[1])
        i = 0
        algo_pvals = []
        random_pvals = []

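        # Iterate module indices until an empty module is reached; each module's
        # survival p-values are compared against a size-matched random gene set.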
        while True:
            algo_genes_flatted = [
                x['eid'] for x in algo_output if i in x['modules']
            ]
            if len(algo_genes_flatted) == 0 and i > 0: break
            if len(algo_genes_flatted) == 0:
                i += 1
                continue
            gene_list_file_names.append(
                os.path.join(constants.LIST_DIR, cur_algo + ".txt"))
            file(gene_list_file_names[-1],
                 'w+').write("\n".join(algo_genes_flatted))

            algo_pvals.append(
                find_clusters_and_survival(
                    tested_gene_list_file_name=gene_list_file_names[-1],
                    total_gene_list_file_name="protein_coding.txt",
                    gene_expression_file_name=gene_expression_file_name,
                    phenotype_file_name=phenotype_file_name,
                    survival_file_name=survival_file_name,
                    var_th_index=var_th_index,
                    is_unsupervised=True,
                    start_k=start_k,
                    end_k=end_k,
                    filter_expression=filter_expression,
                    meta_groups=meta_groups,
                    clustering_algorithm=clustering_algorithm,
                    plot=generate_plot))

            random_set_file_name = generate_random_set(
                random_size=len(algo_genes_flatted),
                meta_gene_set="dip_bg.txt")

            random_pvals.append(
                find_clusters_and_survival(
                    tested_gene_list_file_name=random_set_file_name,
                    total_gene_list_file_name="protein_coding.txt",
                    gene_expression_file_name=gene_expression_file_name,
                    phenotype_file_name=phenotype_file_name,
                    survival_file_name=survival_file_name,
                    var_th_index=var_th_index,
                    is_unsupervised=True,
                    start_k=start_k,
                    end_k=end_k,
                    filter_expression=filter_expression,
                    meta_groups=meta_groups,
                    clustering_algorithm=clustering_algorithm,
                    plot=generate_plot))

            i += 1

        print " algo pvals"
        print algo_pvals
        print "# above TH: {}".format(
            len([x for x in algo_pvals if any(y < 0.001 for y in x)]))
        print " random pvals"
        print random_pvals
        print "# above TH: {}".format(
            len([x for x in random_pvals if any(y < 0.001 for y in x)]))
        print "# of modules better over random: {}/{}".format(
            len([
                x for x1, x2 in zip(algo_pvals, random_pvals) if x1[0] < x2[0]
            ]), len(algo_pvals))
Exemple #30
0
    return bg_genes, network_file_name


def main(dataset_name=constants.DATASET_NAME, disease_name=None, expected_genes=None, score_method=constants.DEG_EDGER):
    global NETWORK_NAME
    constants.update_dirs(DATASET_NAME_u=dataset_name)
    network_file_name, score_file_name, score_method, bg_genes = server.init_common_params(NETWORK_NAME, score_method)
    if score_method == constants.PREDEFINED_SCORE:
        raise Exception("Cannot run this algo on scor-based metrics. please provide gene expression file")

    bg_genes, network_file_name = init_specific_params(NETWORK_NAME)

    format_script(os.path.join(constants.SH_DIR, "run_{}.sh".format(ALGO_NAME)), BASE_FOLDER=constants.BASE_PROFILE,
                  DATASET_DIR=constants.DATASET_DIR, ALGO_DIR=ALGO_DIR, NETWORK_NAME=NETWORK_NAME)

    subprocess.Popen("bash {}/run_{}.sh".format(constants.SH_DIR, ALGO_NAME), shell=True,
                     stdout=subprocess.PIPE, cwd=ALGO_DIR).stdout.read()

    modules, all_bg_genes = extract_modules_and_bg(bg_genes)
    output_base_dir = ""
    if constants.REPORTS:
        output_base_dir = build_all_reports(ALGO_NAME, modules, all_bg_genes, score_file_name, network_file_name, disease_name, expected_genes)

    output_file_name = os.path.join(constants.OUTPUT_DIR,
                                    "{}_client_output.txt".format(ALGO_NAME))
    output_modules(output_file_name, modules, score_file_name, output_base_dir)


if __name__ == "__main__":
    constants.update_dirs(DATASET_NAME_u="SOC")
    main()