Example #1
def KEGG_download_and_create_fasta_file_from_genes(genes_path, taxa = 'Bacteria', aa_dna = 'protein', genes_per_batch = 10000):
    KEGG_TEMP_FASTA = KEGG_DB_DIR + '/temp_fasta_' + aa_dna + '/'
    BASE_URL = base_urls[aa_dna]
    if not os.path.exists(KEGG_TEMP_FASTA): os.makedirs(KEGG_TEMP_FASTA)
          
    if not os.path.exists(KEGG_DB_DIR + '/kos_genes_dict.dat'):
        print "KO - genes file doesn't exist. Run: KEGG_build_KO_genes_dict()"
        return
    genes = [line.rstrip('\n') for line in open(genes_path, 'r')]
#     kos_genes_dict = Utils.Load(KEGG_DB_DIR + '/kos_genes_dict.dat')
    with qp(jobname = 'KGG2fasta', q=['himem7.q'], mem_def = '1G', trds_def = 1, tryrerun=True, 
            max_u = 210, delay_batch=15) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_create_fasta_files_jobs(q, KEGG_TEMP_FASTA, genes, BASE_URL, taxa, genes_per_batch)

    output_fasta = KEGG_DB_DIR + '_'.join(['/KEGG_genes', taxa, aa_dna]) + '.fa'
    temp_files = os.listdir(KEGG_TEMP_FASTA)
    temp_files.sort()
    
    with open(output_fasta, 'w') as handle:
        for temp_file in temp_files:
            print(temp_file)
            fasta_content = open(KEGG_TEMP_FASTA + '/' + temp_file, 'r').readlines()
            for line in fasta_content:
                handle.write(line)
            os.remove(KEGG_TEMP_FASTA + '/' + temp_file)
    os.removedirs(KEGG_TEMP_FASTA)
    return
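A minimal usage sketch (assuming KEGG_DB_DIR, base_urls, qp and upload_create_fasta_files_jobs are defined in the same module; the genes file path is a placeholder):

# genes.txt holds one KEGG gene ID per line
KEGG_download_and_create_fasta_file_from_genes('/path/to/genes.txt',
                                               taxa='Bacteria',
                                               aa_dna='protein')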
def _grouping_with_sescg(mat, k, work_dir, corr_thresh, abscorr,
                         presence_thresh, corr_presence_thresh, first_split_n,
                         split_n, corrmethod, grouping_type):
    print(str(datetime.now()) + " - [_grouping_with_sescg] start - " + k)
    with qp('grping',
            delay_sec=1,
            tryrerun=True,
            mem_def='2G',
            trds_def=2,
            q=['himem7.q']) as q:
        q.startpermanentrun()
        ex_grps = semi_exahustive_strict_cor_grouping(mat, corr_thresh,
                                                      abscorr, presence_thresh,
                                                      corr_presence_thresh,
                                                      first_split_n, split_n,
                                                      corrmethod, q)
    cormet = mat.corr(corrmethod, corr_presence_thresh)

    def select_rep(grp):
        if len(grp) == 1:
            return grp[0]
        grp_nnl = mat[grp].notnull().sum()
        return cormet.loc[grp_nnl[grp_nnl == grp_nnl.max()].index, grp_nnl[
            grp_nnl == grp_nnl.max()].index].sum(1).argmax()

    met_lg = concat((mat[select_rep(grp)] for grp in ex_grps), axis=1)
    Utils.Write(work_dir + '/' + k, met_lg)
    print(str(datetime.now()) + " - [_grouping_with_sescg] end - " + k)
    return met_lg
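For clarity: select_rep keeps the columns of a group that have the most non-null samples and, among those, returns the one with the highest summed correlation to the other tied columns. A self-contained sketch of the same idea (note that in modern pandas Series.argmax returns a position, so idxmax is the label-returning equivalent):

import numpy as np
import pandas as pd

mat = pd.DataFrame(np.random.rand(100, 3), columns=['a', 'b', 'c'])
grp = ['a', 'b', 'c']
cormet = mat.corr('spearman', min_periods=50)

nnl = mat[grp].notnull().sum()                # non-null count per column
tied = nnl[nnl == nnl.max()].index            # columns tied for most samples
rep = cormet.loc[tied, tied].sum(1).idxmax()  # most correlated of the tied columns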
def main():
    print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] start")
    parser = argparse.ArgumentParser()
    parser.add_argument('-work_dir',
                        help='Path to working directory',
                        type=str,
                        default=None)
    parser.add_argument('-output_name',
                        help='Name of output dataframe file',
                        type=str,
                        default='EMGenes_group.df')
    parser.add_argument('-keggDB',
                        help='Type of KEGG database to group by',
                        type=str,
                        default='ko')
    parser.add_argument('-min_size',
                        help='Minimum size of KEGG DB to perform grouping on',
                        type=int,
                        default=20)
    parser.add_argument('-max_size',
                        help='Maximum size of KEGG DB to perform grouping on',
                        type=int,
                        default=100000)
    parser.add_argument(
        '-method',
        help='Which algorithm to use for grouping: newalg or sescg',
        type=str,
        default='newalg')
    parser.add_argument(
        '-grouping_type',
        help='Whether to choose a representative or to sum: sum or rep',
        type=str,
        default='rep')
    parser.add_argument('-corr_thresh',
                        help='Correlation threshold to group by',
                        type=float,
                        default=0.75)
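    # Note: argparse's type=bool does not parse strings - bool('False') is True,
    # so any non-empty value enables these flags; a custom str-to-bool converter
    # (or action='store_true') is the usual workaround.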
    parser.add_argument('-abscorr',
                        help='Use absolute value of correlation',
                        type=bool,
                        default=True)
    parser.add_argument('-presence_thresh',
                        help='Demand at least this amount of samples',
                        type=int,
                        default=100)
    parser.add_argument('-corr_presence_thresh',
                        help='Threshold for computing correlation',
                        type=int,
                        default=50)
    parser.add_argument('-first_split_n',
                        help='first split n',
                        type=int,
                        default=30)
    parser.add_argument('-split_n', help='split n', type=int, default=1)
    parser.add_argument('-corrmethod',
                        help='Which correlation method to use',
                        type=str,
                        default='spearman')
    parser.add_argument('-mirror',
                        help='mirror option in newalg',
                        type=bool,
                        default=False)
    parser.add_argument('-just_concat',
                        help='Whether to just read and concat the output',
                        type=bool,
                        default=False)
    command_args = parser.parse_args()

    if command_args.work_dir is None:
        command_args.work_dir = METABOLON_DIR + '/Grouping_by_' + command_args.keggDB
    if not os.path.exists(command_args.work_dir):
        os.makedirs(command_args.work_dir)

    if command_args.keggDB not in POSSIBLE_KEGG_DB:
        print(
            str(datetime.now()) +
            " - [GroupEMGenesByKEGG:main] keggDB not legal")
        return

    if str(command_args.min_size).isdigit() is False or str(
            command_args.max_size).isdigit() is False:
        print(
            str(datetime.now()) +
            " - [GroupEMGenesByKEGG:main] min_size and max_size must be integers"
        )
        return

    if command_args.method not in POSSIBLE_METHODS:
        print("method must be one of: " + ', '.join(POSSIBLE_METHODS))
        return

    if command_args.corr_thresh > 1 or command_args.corr_thresh < -1:
        print(
            str(datetime.now()) +
            " - [GroupEMGenesByKEGG:main] corr_thresh must be between -1 and 1"
        )
        return

    if command_args.corrmethod not in REASONABLE_CORRELATIONS:
        print("corrmethod must be one of: " +
              ', '.join(REASONABLE_CORRELATIONS))
        return

    if command_args.grouping_type not in REASONABLE_GROUPING_TYPES:
        print("corrmethod must be one of: " +
              ', '.join(REASONABLE_GROUPING_TYPES))
        return

    if command_args.just_concat:
        _concat_output(command_args.work_dir, command_args.output_name,
                       command_args.keggDB, False)
        return

    with open(command_args.work_dir + '/args' + str(datetime.now()),
              'w') as handle:
        for arg in vars(command_args):
            handle.write(
                str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')

    print(
        str(datetime.now()) +
        " - [GroupEMGenesByKEGG:main] build KEGG data holder")
    kegg = KEGG_data_holder()
    print(
        str(datetime.now()) +
        " - [GroupEMGenesByKEGG:main] Load EMGenes dataframe")
    from Analyses.MetabolonAnalysis import *
    M = MetabolonEMPairwiseCorrelationWriter()
    M.run()
    EMGenes = M.B

    print(
        str(datetime.now()) +
        " - [GroupEMGenesByKEGG:main] Divide EMGenes by " +
        command_args.keggDB)
    keggDict = {}
    EMGenes_columns_set = set(EMGenes.columns)
    if (command_args.keggDB == 'ko'):
        for k in kegg.get_dicts()['ko_bacgene'].keys():
            temp_genes = set(kegg.get_dicts()['ko_bacgene'][k])
            valid_genes = list(temp_genes.intersection(EMGenes_columns_set))
            if len(valid_genes) > 0:
                keggDict[k] = EMGenes[valid_genes]

    print(
        str(datetime.now()) +
        " - [GroupEMGenesByKEGG:main] uploading jobs to q...")

    with qp('m_grping',
            delay_sec=1,
            mem_def='1G',
            q=['himem7.q'],
            trds_def=1,
            max_u=420,
            tryrerun=True) as q_m:
        unused_genes = set()
        os.chdir(METABOLON_DIR)
        waiton = []
        q_m.startpermanentrun()
        for k in keggDict.keys():
            if keggDict[k].shape[1] < command_args.min_size or keggDict[
                    k].shape[1] > command_args.max_size:
                unused_genes.update(keggDict[k].columns)
                continue
            if os.path.exists(command_args.work_dir + '/' + k):
                continue
            if keggDict[k].shape[1] > command_args.min_size:
                print(k + ' - ' + str(keggDict[k].shape[1]))
            if command_args.method == 'newalg':
                waiton.append(
                    q_m.method(_grouping_with_newalg,
                               (keggDict[k], k, command_args.work_dir,
                                command_args.corr_thresh, command_args.abscorr,
                                command_args.corr_presence_thresh,
                                command_args.corrmethod, command_args.mirror,
                                command_args.grouping_type)))
            elif command_args.method == 'sescg':
                waiton.append(
                    q_m.method(
                        _grouping_with_sescg,
                        (keggDict[k], k, command_args.work_dir,
                         command_args.corr_thresh, command_args.abscorr,
                         command_args.presence_thresh,
                         command_args.corr_presence_thresh,
                         command_args.first_split_n, command_args.split_n,
                         command_args.corrmethod, command_args.grouping_type)))
        res = q_m.waitforresults(waiton)
    unused_genes = list(unused_genes)
    Utils.Write(command_args.work_dir + '/ko:rest.df', EMGenes[unused_genes])

    print(
        str(datetime.now()) +
        " - [GroupEMGenesByKEGG:main] concatinating dataframes into one")
    _concat_output(command_args.work_dir, command_args.output_name,
                   command_args.keggDB, True)
    print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] end")
    return
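A hypothetical invocation of this script (the file name is a placeholder; the remaining arguments fall back to their defaults):

python GroupEMGenesByKEGG.py -keggDB ko -method sescg -grouping_type rep -corr_thresh 0.75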
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'list_of_directories',
        help=
        'List of directories to take mapping files from, separated by "---"',
        type=str,
        default=None)
    parser.add_argument('output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument('-reference',
                        help='Path to DIAMOND reference file',
                        type=str,
                        default=None)
    parser.add_argument('-max_target_seqs',
                        help='--max-target-seqs parameter in DIAMOND',
                        type=int,
                        default=1)
    parser.add_argument('-evalue',
                        help='--evalue parameter in DIAMOND',
                        type=float,
                        default=10.)
    parser.add_argument('-more_sensitive',
                        help='--more-sensitive parameter in DIAMOND',
                        type=bool,
                        default=True)
    parser.add_argument('-take_only_from_file',
                        help='Path to list of samples prefixes to take',
                        type=str,
                        default=None)
    parser.add_argument(
        '-diamond_mapper',
        help='Which diamond mapper to use, one of blastx, blastp',
        type=str,
        default='blastx')
    parser.add_argument('-only_parse',
                        help='Whether to only parse and create results',
                        type=bool,
                        default=False)
    parser.add_argument('-divide_to_ko',
                        help='Whether to divide the mapping counts into KOs',
                        type=bool,
                        default=False)
    parser.add_argument('-divide_to_ko_th',
                        help='Threshold for dividing counts into KOs',
                        type=float,
                        default=1e-4)

    command_args = parser.parse_args()

    if command_args.list_of_directories is None or command_args.output_dir is None:
        return
    if command_args.only_parse:
        parse_diamond_output(command_args)
        return
    dirs_list = command_args.list_of_directories.split('---')
    for d in dirs_list:
        if not os.path.exists(d):
            print(d + ' does not exist.')
            return
    if not os.path.exists(command_args.output_dir):
        os.makedirs(command_args.output_dir)

    if command_args.max_target_seqs < 1:
        return

    if command_args.take_only_from_file is not None:
        if not os.path.exists(command_args.take_only_from_file):
            return

    with open(command_args.output_dir + '/args' + str(datetime.now()),
              'w') as handle:
        for arg in vars(command_args):
            handle.write(
                str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')

    with qp(jobname='RDiamond',
            q=['himem7.q'],
            mem_def='5G',
            trds_def=1,
            tryrerun=True,
            max_u=310,
            delay_batch=15) as q:
        os.chdir(
            "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_run_diamond_jobs_to_q(q, command_args)

    parse_diamond_output(command_args)
    return
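Because list_of_directories is a single positional string, multiple input directories are joined with '---'. A hypothetical call (script name and paths are placeholders):

python RunDiamond.py /data/run1---/data/run2 /data/diamond_out -reference /refs/kegg.dmnd -evalue 0.001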
def main():
    print('main')
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument('-n_cols_per_job',
                        help='Number of columns per job',
                        type=int,
                        default=10)
    parser.add_argument(
        '-path_to_X',
        '--path_to_X',
        help='Path to features data - X, separated by comma',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Paper_v4/unknown_pathway_prediction/metabolomics_levels_mean_null.csv'
    )
    parser.add_argument(
        '-path_to_Y',
        help='Path to labels - Y',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Paper_v4/unknown_pathway_prediction/super_pathway_y.csv'
    )
    parser.add_argument(
        '-names',
        help=
        'names of Xs, separated by comma. Must be same length as the number of Xs.',
        type=str,
        default='levels')
    parser.add_argument('-ntrees',
                        help='The number of trees for training',
                        type=int,
                        default=2000)
    parser.add_argument('-val_size',
                        help='The fraction of validation for the training',
                        type=float,
                        default=0.2)
    parser.add_argument(
        '-early_stopping_rounds',
        help='The number of early stopping rounds for the training',
        type=int,
        default=50)
    parser.add_argument('-over_sample',
                        help='Whether to oversample the samples',
                        type=bool,
                        default=False)
    parser.add_argument('-only_concat',
                        help='Whether to only concatenate the output files',
                        type=bool,
                        default=False)
    parser.add_argument('-only_predict_test',
                        help='Whether to only run the predict_test function',
                        type=bool,
                        default=False)
    parser.add_argument('-mem_def',
                        help='Amount of memory per job',
                        type=int,
                        default=1)
    parser.add_argument('-job_name',
                        help='Job prefix for q',
                        type=str,
                        default='PathwayClassifier')
    command_args = parser.parse_args()

    command_args.path_to_X = _convert_comma_separated_to_list(
        command_args.path_to_X)
    for x in command_args.path_to_X:
        if not os.path.exists(x):
            print(x, 'does not exist')
            return
    if not os.path.exists(command_args.path_to_Y):
        print(command_args.path_to_Y, 'does not exist!')
        return

    command_args.names = _convert_comma_separated_to_list(command_args.names)
    assert len(command_args.names) == len(command_args.path_to_X)

    if command_args.n_cols_per_job < 1 or command_args.n_cols_per_job > 1000:
        print("n_cols_per_job must be between 1 and 1000")
        return

    if command_args.only_concat:
        concat_outputs(command_args)
        return

    if command_args.only_predict_test:
        predict_test(command_args)
        return

    make_dir_if_not_exists(command_args.output_dir)

    log_run_details(command_args)

    # qp = fakeqp
    with qp(jobname=command_args.job_name,
            q=['himem7.q'],
            mem_def=str(command_args.mem_def) + 'G',
            trds_def=2,
            tryrerun=True,
            max_u=650,
            delay_batch=5) as q:
        os.chdir(
            "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_these_jobs(q, command_args)
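Since -path_to_X and -names are comma-separated lists that the script asserts are equal in length, a run with two feature matrices would look like this (script name and paths are placeholders):

python PathwayClassifier.py /out/dir -path_to_X /data/levels.csv,/data/abundance.csv -names levels,abundance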
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('phenotype', help='Which phenotype/s to take as y. (Metabolomics_raw, Metabolomics_normed, BMI, Cohort, etc.)', type=str)
    parser.add_argument('samples', help='What samples to use. (ACS, MAR17, MAY18, etc.)', type=str)
    parser.add_argument('-output_dir', help='Path to output directory', type=str, default=LMM_DIR)
    parser.add_argument('-random', help='What are the random effects. (IGC, IGC-COG, MPA_species, MPA_species-phyla, KEGGBacGenes, Diet, etc.) separated by comma', type=str, default='IGC')
    parser.add_argument('-fixed', help='What are the fixed effects. (Age, Gender, BMI, Nextera, etc.) separated by comma', type=str, default='Age,Gender,Nextera')
    parser.add_argument('-n_phenotypes_per_job', help='Number of phenotypes per job', type=int, default=50)
    parser.add_argument('-use_quantile_normalization', help='Whether to use quantile normalization over microbiome data', type=bool, default=False)
    parser.add_argument('-jackknife_iterations', help='Number of Jackknife iterations', type=int, default=100)
    parser.add_argument('-output_name', help='Specify an output directory name', type=str, default=None)


#     parser.add_argument('-covariates', help='Which covariates to consider, separated by comma', type=str, default='Age,Gender')
#     parser.add_argument('-prevalence', help='In case of a case-control trait, what is the prevalence in the general population', type=float, default=None)
    parser.add_argument('-fiesta_iterations', help='Number of iterations to be made by FIESTA', type=int, default=100)

    command_args = parser.parse_args()
    make_dir_if_not_exists(command_args.output_dir)

    if command_args.output_name is not None:
        command_args.output_dir += '/' + command_args.output_name + '/'
        make_dir_if_not_exists(command_args.output_dir)

    if command_args.phenotype not in LEGAL_PHENOTYPES:
        print('phenotypes currently supported are: ' + ','.join(LEGAL_PHENOTYPES))
        return

    command_args.random = _convert_comma_separated_to_list(command_args.random)
    for grm in command_args.random:
        if grm not in LEGAL_GRMs:
            print('GRMs currently supported are: ' + ','.join(LEGAL_GRMs))
            return

    command_args.samples = _convert_comma_separated_to_list(command_args.samples)
    for samps in command_args.samples:
        if samps not in SAMPLES_DICT:
            print('samples currently supported are: ' + ','.join(SAMPLES_DICT.keys()))
            return

    if command_args.use_quantile_normalization:
        command_args.output_dir += '/QN/'
        make_dir_if_not_exists(command_args.output_dir)

    command_args.output_dir += '/' + '-'.join(command_args.samples) + '/'
    make_dir_if_not_exists(command_args.output_dir)
    command_args.output_dir += '/' + command_args.phenotype + '/'
    make_dir_if_not_exists(command_args.output_dir)
    command_args.output_dir += '/' + '+'.join(command_args.random) + '/'
    make_dir_if_not_exists(command_args.output_dir)


    with open(command_args.output_dir + '/args' + str(datetime.now()), 'w') as handle:
        for arg in vars(command_args):
            handle.write(str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')

    command_args.fixed = _convert_comma_separated_to_list(command_args.fixed)

    with qp(jobname = 'LMMs', q=['himem7.q'], mem_def = '1G', trds_def = 1, tryrerun=False,
#     with fakeqp(jobname = 'LMMs', q=['himem7.q'], mem_def = '1G', trds_def = 1, tryrerun=False,
        max_u = 220, delay_batch=15) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_lmm_per_n_phenotypes(q, command_args)

    return
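A hypothetical run, with the comma-separated random and fixed effects spelled out (the script name is a placeholder):

python RunLMM.py Metabolomics_raw MAR17 -random IGC,MPA_species -fixed Age,Gender,Nextera -jackknife_iterations 100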
Example #7
def main():
    print('main')
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument('-model',
                        help='Which prediction model to use',
                        type=str,
                        default='lightgbm')
    parser.add_argument('-n_cols_per_job',
                        help='Number of columns per job',
                        type=int,
                        default=2)
    parser.add_argument('-n_random',
                        help='Number of random samples',
                        type=int,
                        default=20)
    parser.add_argument('-pfilter',
                        help='Threshold for p-value in association test',
                        type=float,
                        default=0.00001)
    parser.add_argument(
        '-path_to_plink_bfiles',
        '--path_to_plink_bfiles',
        help='Path to basename plink data',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Genetics/PNP_autosomal_clean2_nodfukim_norelated_Metabolon'
    )
    parser.add_argument(
        '-path_to_Y',
        help='Path to labels - Y',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/technical_noise/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs_regid.csv'
    )
    parser.add_argument('-k_folds',
                        help='Number of folds',
                        type=int,
                        default=10)
    parser.add_argument('-only_concat',
                        help='Whether to only concatenate the output files',
                        type=bool,
                        default=False)

    parser.add_argument('-mem_def',
                        help='Amount of memory per job',
                        type=int,
                        default=2)
    parser.add_argument('-job_name',
                        help='Job prefix for q',
                        type=str,
                        default='plink-cv')
    command_args = parser.parse_args()

    if not os.path.exists(command_args.path_to_Y):  # (not os.path.exists(command_args.path_to_X)) or
        print("Y doesn't exist!")
        return
    if command_args.n_cols_per_job < 1 or command_args.n_cols_per_job > 1000:
        print("n_cols_per_job must be between 1 and 1000")
        return

    if command_args.only_concat:
        concat_outputs(command_args)
        return

    # if command_args.only_compute_abs_SHAP:
    #     _compute_abs_and_sign_SHAP(command_args.output_dir, command_args.path_to_X);
    #     return

    make_dir_if_not_exists(command_args.output_dir)

    log_run_details(command_args)

    # qp = fakeqp
    with qp(jobname=command_args.job_name,
            q=['himem7.q'],
            mem_def=str(command_args.mem_def) + 'G',
            trds_def=2,
            tryrerun=True,
            max_u=650,
            delay_batch=5) as q:
        os.chdir(
            "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_these_jobs(q, command_args)
Example #8
def main():
    """

    :return:
    """
    print('main')
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument(
        '-path_to_X',
        '--path_to_X',
        help='Path to features data - X',
        type=str,
        default=
        '/home/noamba/Analyses/Noamba/Metabolon/SHAP/dataframes/mar17_phenome.dat'
    )
    parser.add_argument(
        '-path_to_Y',
        help='Path to labels - Y',
        type=str,
        default=
        '/home/noamba/Analyses/Noamba/Metabolon/technical_noise/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs.dat'
    )
    # default='/home/noamba/Analyses/Noamba/Metabolon/SHAP/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs.dat')
    parser.add_argument('-model',
                        help='Which prediction model to use',
                        type=str,
                        default='lightgbm')
    parser.add_argument('-k_folds',
                        help='Number of folds',
                        type=int,
                        default=10)
    parser.add_argument('-only_concat',
                        help='Whether to only concatenate the output files',
                        type=bool,
                        default=False)
    parser.add_argument('-mem_def',
                        help='Amount of memory per job',
                        type=int,
                        default=1)
    parser.add_argument('-n_BS',
                        help='Number of CI permutations',
                        type=int,
                        default=1000)
    parser.add_argument(
        '-multi_features',
        help='Whether to use the set of hyper parameters designed for a large '
        'number of features',
        type=bool,
        default=False)
    parser.add_argument(
        '-bootstrap_negative_estimate',
        help='Whether to run bootstrapping on estimates which are '
        'negative',
        type=bool,
        default=False)
    parser.add_argument('-log_transform',
                        help='Whether to log transform the data',
                        type=bool,
                        default=False)
    parser.add_argument(
        '-rand_folds',
        help='Whether to randomize the folds when bootstrapping',
        type=bool,
        default=False)
    parser.add_argument('-job_name',
                        help='Job prefix for q',
                        type=str,
                        default='')

    command_args = parser.parse_args()
    # check X and y exist

    command_args.Xs = _convert_comma_separated_to_list(command_args.path_to_X)

    for x in command_args.Xs:
        if not os.path.exists(x):
            print(x, 'does not exist.')
            return

    if not os.path.exists(command_args.path_to_Y):
        print("Y doesn't exist!")
        return
    # check model is legal
    if command_args.model not in supported_prediction_models:
        print('chosen model must be one of:',
              ', '.join(supported_prediction_models))
        return
    # only concat results, do not run bootstrapping
    if command_args.only_concat:
        concat_outputs(command_args)
        return

    make_dir_if_not_exists(command_args.output_dir)

    log_run_details(command_args)

    if len(command_args.job_name) == 0:
        job_name = 'bs_' + command_args.output_dir.split('/')[-2]
    else:
        job_name = command_args.job_name
    # with fakeqp(jobname = job_name, q=['himem7.q'], mem_def = '1G',
    with qp(jobname=job_name,
            q=['himem7.q'],
            mem_def='1G',
            trds_def=2,
            tryrerun=True,
            max_u=550,
            delay_batch=5,
            max_r=550) as q:
        os.chdir(
            "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_job_per_y(q, command_args)
def main():
    """
    Compute all correlations between EM genes matrix and metabolites and write to disk.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('-output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument(
        'analysisType',
        help='What type of analysis to perform: single or shuffle',
        type=str,
        default='single')
    parser.add_argument('-data',
                        help='What type of data: EMGenes or KO',
                        type=str,
                        default='EMGenes')
    parser.add_argument('-metabsPath',
                        help='Path to metabolites file',
                        type=str,
                        default=METABOLON_DIR + '/tmp_files/metabs.df')
    parser.add_argument('-ppmetPath',
                        help='Path to metabolite values file',
                        type=str,
                        default=METABOLON_DIR + '/tmp_files/ppmet_483x820.df')
    parser.add_argument('-EMGenesPath',
                        help='Path to EMGenes dataframe',
                        type=str,
                        default=None)
    parser.add_argument('-ko',
                        '--ko_path',
                        help='Path to KOxsample data frame',
                        type=str,
                        default=METABOLON_DIR + '/KO_EMGenes.df')
    parser.add_argument('-f',
                        '--from_file',
                        help='Path to external file holding metabolites IDs',
                        type=str,
                        default=None)
    parser.add_argument('-rnk',
                        '--remove_nonKEGG_metabs',
                        type=bool,
                        help='Whether to remove metabolites without KEGG IDs',
                        default=True)
    parser.add_argument('-rhc',
                        '--remove_high_corr_metabs',
                        type=bool,
                        help='Whether to remove highly correlated metabolites',
                        default=False)
    parser.add_argument('-nc',
                        '--n_cols_per_job',
                        help='Number of columns per job',
                        type=int,
                        default=10000)
    parser.add_argument('-cp',
                        '--minimal_corr_pair',
                        help='Minimal number of pairs for test',
                        type=int,
                        default=10)
    parser.add_argument('-dp',
                        '--directional_pval',
                        type=bool,
                        help='Whether to compute directional p-value',
                        default=True)
    parser.add_argument('-oc',
                        '--only_concat',
                        type=bool,
                        help='Whether to only run the concat function',
                        default=False)
    parser.add_argument('-ns',
                        '--num_of_shuffles',
                        help='Number of shuffles to perform',
                        type=int,
                        default=100)
    parser.add_argument('-wc',
                        '--write_corr',
                        help='Whether to write the correlation files',
                        type=bool,
                        default=False)
    parser.add_argument('-pq',
                        '--pass_path_to_q',
                        help='Whether to pass the big df path to q',
                        type=bool,
                        default=True)
    parser.add_argument('-mem',
                        '--memory_per_job',
                        help='Amount of memory to assign each job',
                        type=str,
                        default='10G')
    command_args = parser.parse_args()

    if command_args.analysisType != 'single' and command_args.analysisType != 'shuffle':
        print(now() +
              " - [main] analysisType must be either single or shuffle.")
        return
    if command_args.data != 'EMGenes' and command_args.data != 'KO':
        print(now() + " - [main] data must be either EMGenes or KO.")
        return
    if not (os.path.exists(command_args.metabsPath)
            and os.path.exists(command_args.ppmetPath)):
        print(now() + " - [main] metabsPath or ppmetPath doesn't exist.")
        return

    if command_args.only_concat:
        concat_temp_corr_dfs(command_args.output_dir,
                             command_args.n_cols_per_job,
                             command_args.write_corr)
        return


#     return #TODO: check if using args works and then remove this line
    from Analyses.MetabolonAnalysis import *
    M = MetabolonEMPairwiseCorrelationWriter()
    if command_args.EMGenesPath is not None:
        M.A = Utils.Load(command_args.ppmetPath)
        M.B = Utils.Load(command_args.EMGenesPath)
    else:
        if command_args.data == 'KO':
            with open(command_args.ko_path, 'rb') as handle:
                M.B = pickle.load(handle)
            with open(command_args.ppmetPath, 'rb') as handle:
                M.A = pickle.load(handle)
        elif command_args.data == 'EMGenes':
            M.run()

    if command_args.output_dir is None and command_args.analysisType == 'single':
        command_args.output_dir = METABOLON_DIR + '/' + command_args.data + '_single_analysis/'
    elif command_args.output_dir is None and command_args.analysisType == 'shuffle':
        command_args.output_dir = METABOLON_DIR + '/' + command_args.data + '_shuffled_analysis/'

    if not os.path.exists(command_args.output_dir):
        os.makedirs(command_args.output_dir)

    with open(command_args.output_dir + '/args' + str(datetime.now()),
              'w') as handle:
        for arg in vars(command_args):
            handle.write(
                str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')

    if command_args.remove_nonKEGG_metabs:
        M.A = remove_non_kegg_metabolites(M.A, command_args.metabsPath)

    if command_args.remove_high_corr_metabs:
        M.A = remove_highly_correlated_metabolites(M.A)

    if command_args.from_file is not None:
        M.A = filter_metabolites_from_file(M.A, command_args.from_file)

    if command_args.analysisType == 'single':
        print(now() + " - uploading jobs to q...")
        with qp(jobname='BldSng',
                q=['himem7.q'],
                mem_def=command_args.memory_per_job,
                trds_def=1,
                max_u=120,
                tryrerun=True,
                deleteCSHwithnoerr=True) as q:
            os.chdir(METABOLON_DIR)
            q.startpermanentrun()
            res = upload_jobs(q, M.A, M.B, command_args.output_dir,
                              command_args.minimal_corr_pair,
                              command_args.directional_pval,
                              command_args.n_cols_per_job,
                              command_args.write_corr)
        concat_temp_corr_dfs(command_args.output_dir,
                             command_args.n_cols_per_job,
                             command_args.write_corr)
    elif command_args.analysisType == 'shuffle':
        with qp(jobname='BldShf',
                q=['himem7.q'],
                mem_def=command_args.memory_per_job,
                trds_def=1,
                max_u=320,
                tryrerun=True,
                deleteCSHwithnoerr=True) as q:
            #             M.B = M.B.iloc[:,0:10000]
            #             M.A = M.A.iloc[:,0:2]
            os.chdir(METABOLON_DIR)
            q.startpermanentrun()
            res = upload_jobs_shuffled(q, M.A, M.B, command_args.output_dir,
                                       command_args.minimal_corr_pair,
                                       command_args.directional_pval,
                                       command_args.num_of_shuffles,
                                       command_args.write_corr,
                                       command_args.pass_path_to_q)

    return
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument(
        '-dim_red_method',
        help='Either PCA, PCoA or None currently only these are supported',
        type=str,
        default='PCA')
    parser.add_argument('-output_df',
                        help='Path to final data frame',
                        type=str,
                        default=None)
    parser.add_argument('-n_runs',
                        help='Number of random predictions to run',
                        type=int,
                        default=10)
    parser.add_argument('-start_from_index',
                        help='Start naming files from this index',
                        type=int,
                        default=0)
    parser.add_argument(
        '-path_to_X',
        help='Path to features data - X',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Cardio/Cardio07112017/EMGenes_binary_joined.csv'
    )
    parser.add_argument(
        '-path_to_Y',
        help='Path to labels - Y',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Cardio/Cardio07112017/PCoA_sites_EMGenes_Y.dat'
    )
    parser.add_argument('-only_concat',
                        help='Whether to only concatenate the temp prediction files',
                        type=bool,
                        default=False)
    parser.add_argument('-use_projection',
                        help='Whether to project the test data in each k fold',
                        type=bool,
                        default=False)
    command_args = parser.parse_args()

    if command_args.output_dir is None:
        return
    if command_args.output_df is None:
        command_args.output_df = command_args.output_dir + '/final_predictions.dat'
    if (not os.path.exists(command_args.path_to_X)) or (not os.path.exists(
            command_args.path_to_Y)):
        return
    if command_args.n_runs < 1 or command_args.n_runs > 100000:
        return
    if command_args.only_concat:
        concat_temp_files(command_args)
        return
    if command_args.dim_red_method not in LEGAL_DIM_REDUCTION_METHODS:
        print "dim_red_method must be one of: " + ', '.join(
            LEGAL_DIM_REDUCTION_METHODS)
        return


#     if (command_args.dim_red_method == 'PCoA') and (not re.match(command_args.dim_red_method + '.+',
#                                                                  command_args.path_to_X.split('/')[-1])):
#         print "dim_red_method doesn't match input X"
#         return
#     if (command_args.dim_red_method == 'PCA') and (re.match('PCoA.+', command_args.path_to_X.split('/')[-1])):
#         print "dim_red_method doesn't match input X"
#         return
#     if (command_args.use_projection is False) and (not re.match(command_args.dim_red_method + '.+', command_args.path_to_X.split('/')[-1])):
#         print "When not using projection X input must start with the dimensionality reduction method"
#         return
#     if command_args.dim_red_method == 'PCoA' and command_args.use_projection:
#         print "Projection of test data using PCoA isn't supported at the moment"
#         return

    if not os.path.exists(command_args.output_dir):
        os.makedirs(command_args.output_dir)
    with open(command_args.output_dir + '/args' + str(datetime.now()),
              'w') as handle:
        for arg in vars(command_args):
            handle.write(
                str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')

    if command_args.use_projection:
        mem_def = '20G'
    else:
        mem_def = '1G'
    with qp(jobname='RandPredict',
            q=['himem7.q'],
            mem_def=mem_def,
            trds_def=1,
            tryrerun=True,
            max_u=110,
            delay_batch=5) as q:
        os.chdir(
            "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        generate_random_predictors(q, command_args)
    concat_temp_files(command_args)
def main():
    """
    Run KEGG rank-sum (GSEA-style) enrichment tests over per-metabolite p-value files.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('work_dir',
                        help='Path to working directory',
                        type=str,
                        default=None)
    parser.add_argument(
        '-analysisType',
        help='What type of analysis to perform: normal or shuffle',
        type=str,
        default='normal')
    parser.add_argument(
        '-keggdb',
        help='Which KEGG DB to consider: reaction, pathway, module, ko, all',
        type=str,
        default='ko')
    parser.add_argument('-plot',
                        help='Whether to call the plotting function',
                        type=bool,
                        default=False)
    parser.add_argument('-gsea',
                        help='Name of GSEA results file in working directory',
                        type=str,
                        default='GSEA')
    parser.add_argument('-metabsPath',
                        help='Path to metabolites file',
                        type=str,
                        default=METABOLON_DIR + '/tmp_files/metabs.df')
    parser.add_argument(
        '-stat_test',
        help='Type of statistical test: mannwhitneyu, directed_mannwhitneyu',
        type=str,
        default='directed_mannwhitneyu')
    parser.add_argument('-dsc',
                        '--down_stream_cols',
                        help='What type of data: bac_genes or ko',
                        type=str,
                        default='bac_genes')
    parser.add_argument('-tkdbff',
                        '--take_kdb_from_file',
                        help='Path to KEGG DB',
                        type=str,
                        default=None)
    parser.add_argument('-tkprff',
                        '--take_metab_kdb_pair_from_file',
                        help='Path to metabolite-KEGGDB pairs file',
                        type=str,
                        default=None)
    parser.add_argument('-min_dsc',
                        '--minimal_downstream_cols',
                        help='Minimal number of downstream columns in test',
                        type=int,
                        default=5)
    parser.add_argument('-max_dsc',
                        '--maximal_downstream_cols',
                        help='Maximal number of downstream columns in test',
                        type=int,
                        default=10000)
    parser.add_argument('-max_kos_db',
                        '--maximal_kos_per_db',
                        help='Maximal number of KOs per KEGG DB',
                        type=int,
                        default=100)
    parser.add_argument('-fpb',
                        '--files_per_batch',
                        help='Number of files to parse in each job',
                        type=int,
                        default=100)
    parser.add_argument('-kpc',
                        '--kos_per_compound',
                        help='Maximal number of KOs per compound',
                        type=int,
                        default=2000)
    parser.add_argument('-cpk',
                        '--compounds_per_ko',
                        help='Maximal number of compounds per KO',
                        type=int,
                        default=500)

    command_args = parser.parse_args()

    if command_args.work_dir is None:
        print(str(datetime.now()) + " - [main] work_dir must be passed.")
        return
    if command_args.analysisType != 'normal' and command_args.analysisType != 'shuffle':
        print(
            str(datetime.now()) +
            " - [main] analysisType must be normal or shuffle.")
        return
    if command_args.down_stream_cols not in POSSIBLE_DOWNSTREAM_COL:
        print(
            str(datetime.now()) +
            " - [main] down_stream_cols must be bac_genes or ko.")
        return
    if command_args.keggdb != 'all' and command_args.keggdb not in POSSIBLE_KEGG_DB:
        print(str(datetime.now()) + " - [main] Illegal KEGG DB.")
        return
    if command_args.stat_test not in POSSIBLE_STAT_TESTS:
        print(str(datetime.now()) + " - [main] Illegal stat test.")
        return

    with open(command_args.work_dir + '/args' + str(datetime.now()),
              'w') as handle:
        for arg in vars(command_args):
            handle.write(
                str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')

#     mkg = MetabolonKEGGGWAS(command_args.work_dir, command_args.down_stream_cols, command_args.gsea,
#                             command_args.metabsPath, command_args.stat_test,
#                             command_args.plot, command_args.take_kdb_from_file,
#                             command_args.take_metab_kdb_pair_from_file)

    if command_args.analysisType == 'normal':
        pval_files = os.listdir(command_args.work_dir)
        pval_files = [
            f for f in pval_files if len(f) > 4 and f[0:5] == 'pvals'
        ]
        pval_files.sort(key=natural_keys)
        if len(pval_files) > command_args.files_per_batch:
            with qp('MetGWAS',
                    delay_sec=1,
                    mem_def='10G',
                    q=['himem7.q'],
                    trds_def=1,
                    max_u=120,
                    tryrerun=True) as q:
                os.chdir(METABOLON_DIR)
                waiton = []
                q.startpermanentrun()
                for i in range(0, len(pval_files),
                               command_args.files_per_batch):
                    print(
                        str(datetime.now()) + " - [main] uploading job " +
                        str(i) + " to q")
                    waiton.append(
                        q.method(
                            divide_jobs,
                            (command_args,
                             pval_files[i:i + command_args.files_per_batch],
                             i)))
                res = q.waitforresults(waiton)
            print(str(datetime.now()) + " - [main] Results are back")
            concat_gsea_files(command_args.work_dir, command_args.gsea)
            return

        else:
            mkg = MetabolonKEGGGWAS(
                command_args.work_dir, command_args.down_stream_cols,
                command_args.gsea, command_args.metabsPath,
                command_args.stat_test, command_args.kos_per_compound,
                command_args.compounds_per_ko, command_args.plot,
                command_args.take_kdb_from_file,
                command_args.take_metab_kdb_pair_from_file)
            for f in pval_files:
                mkg._get_pvals_matrix(command_args.work_dir + '/' + f)
                if command_args.keggdb == 'all':
                    for kdb in POSSIBLE_KEGG_DB:
                        mkg.all_rank_sum_tests(
                            kdb, f, command_args.minimal_downstream_cols,
                            command_args.maximal_downstream_cols,
                            command_args.maximal_kos_per_db)
                else:
                    mkg.all_rank_sum_tests(
                        command_args.keggdb, f,
                        command_args.minimal_downstream_cols,
                        command_args.maximal_downstream_cols,
                        command_args.maximal_kos_per_db)
    elif command_args.analysisType == 'shuffle':
        # not much reason to use this, since shuffling is done beforehand
        pass
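The natural_keys sort key used on the pvals files above is not shown here; the usual natural-sort helper looks like this sketch:

import re

def natural_keys(text):
    # split 'pvals12.df' into ['pvals', 12, '.df'] so numeric chunks sort numerically
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', text)]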