Example #1
0
def copy_plink_files(origin_basename, destination_dir):
    """Copy every PLINK file sharing ``origin_basename`` into ``destination_dir``.

    PLINK datasets are split over several files (.bed/.bim/.fam/...) that share
    a common basename; all files matching ``origin_basename*`` are copied.

    NOTE: as in the original implementation, the process working directory is
    changed to ``destination_dir`` as a side effect — callers may rely on it.
    """
    import glob
    import shutil
    make_dir_if_not_exists(destination_dir)
    os.chdir(destination_dir)
    # Pure-python copy instead of `os.system('cp basename* .')`: the shell
    # call failed silently (ignored exit status), broke on paths containing
    # spaces, and was not portable.  shutil raises on failure instead.
    for src in glob.glob(origin_basename + '*'):
        shutil.copy(src, destination_dir)
    return
Example #2
0
def build_grm(command_args, grm):
    """Load the data set named by ``grm`` and build its relationship matrix.

    ``grm`` is either a plain data-base name (e.g. 'IGC') or a name with a
    multi-component suffix after a dash (e.g. 'IGC-COG'); in the latter case
    the data is divided into components before GRM computation.

    Raises:
        ValueError: if the data-base part of ``grm`` is not recognised.
    """
    command_args.output_dir += '/' + grm + '/'
    make_dir_if_not_exists(command_args.output_dir)
    print ('build_grm')
    samples = get_FD_SPID_list(command_args)
    # split a possible multi-component suffix: '<db>-<component>'
    db = grm.split('-')
    mc = None
    # original tested `len(db) > 2`, which can never be true for two-part
    # names like 'IGC-COG' (split yields 2 elements), so the multi-component
    # branch was unreachable for them; `> 1` picks up the component part.
    if len(db) > 1: mc = db[1]
    db = db[0]
    if db in ['IGC', 'MPA_species', 'MPA_genera', 'KO', 'KEGGBacGenes']:
        data = get_microbiome_df(db, command_args, grm, mc).loc[:, samples].dropna(how='all', axis=1)
        data = remove_rare_elements(data, null=False, rare_def=RARE_ELEMENTS_TH)
    elif db == 'Diet':
        data = Utils.Load(DIET_DF_PATH).loc[samples].fillna(0)
    elif db == 'DietBin':
        data = Utils.Load(DIETBIN_DF_PATH).loc[samples].fillna(0)
    elif db == 'Drugs':
        data = Utils.Load(DRUGS_DF_PATH).loc[samples].fillna(0)
    elif db in ml_cat:
        data = Utils.Load(PNP_FEATURES_PATH).replace({'Gender':{'Female':0, 'Male':1}}).loc[samples, ml_cat[db]] # fillna(0) ?
        data = data.fillna(data.median())
    elif db == 'BLOOD':
        data = Utils.Load(BLOOD_DF_PATH).loc[samples].drop(['CRP (WIDE RANGE)', 'CRP hs'], axis=1) # fillna(0) ?
        data = data.fillna(data.median())
    elif db in BLOOD_DIC:
        data = Utils.Load(BLOOD_DF_PATH).loc[samples, BLOOD_DIC[db]] # fillna(0) ?
        data = data.fillna(data.median())
    elif db == 'TCR0.1':
        data = Utils.Load(TCR_01_DF_PATH).loc[samples].dropna()
    else:
        # The original had a bare `exit` expression here, which is a no-op:
        # execution would continue and crash with NameError on `data` below.
        # Fail loudly with a clear message instead.
        raise ValueError('Unsupported GRM data type: ' + str(db))
    # check for multi-component
    if mc is not None:
        divide_into_multicomponent(data, db, mc, command_args)
    else:
        compute_grm_and_save(data, command_args, db)
    return
Example #3
0
def main():
    """Parse command-line arguments, build the output directory layout,
    log the run arguments, and dispatch GRM-building jobs to the queue."""
    print ('main')
    # argparse's `type=bool` is a pitfall: bool('False') is True, so any
    # non-empty string enables the flag.  Parse booleans explicitly.
    _parse_bool = lambda s: s.lower() in ('1', 'true', 'yes')
    parser = argparse.ArgumentParser()
    parser.add_argument('grm', help='What are the random effects. (IGC, IGC-COG, MPA_species, MPA_species-phyla, KEGGBacGenes, etc.) separated by comma', type=str)
    parser.add_argument('samples', help='What samples to use. (ACS, MAR17, MAY18, etc.) separated by comma', type=str)
    parser.add_argument('-output_dir', help='Path to output directory', type=str, default=GRM_DIR)
    parser.add_argument('-use_quantile_normalization', help='Whether to use quantile normalization over microbiome data', type=_parse_bool, default=False)
    parser.add_argument('-presence_absence_th', help='What should be the presence absence threshold for microbiome abundances', type=float, default=1e-6)
#     parser.add_argument('-IGC_corr', help='Correlation threshold to use for IGC matrix', type=float, default=None)

    command_args = parser.parse_args()

    make_dir_if_not_exists(command_args.output_dir)

    command_args.grm = _convert_comma_separated_to_list(command_args.grm)
    for grm in command_args.grm:
        if grm not in LEGAL_GRMs:
            print ('grm currently supported are: ' + ','.join(LEGAL_GRMs))
            # the original used a bare `exit` expression, which is a no-op;
            # actually abort on an unsupported grm.
            return

    command_args.samples = _convert_comma_separated_to_list(command_args.samples)
    for samps in command_args.samples:
        if samps not in SAMPLES_DICT:
            print ('samples currently supported are: ' + ','.join(SAMPLES_DICT.keys()))
            # same fix as above: bare `exit` did not stop execution.
            return

    if command_args.use_quantile_normalization:
        command_args.output_dir += '/QN/'
        make_dir_if_not_exists(command_args.output_dir)

    command_args.output_dir += '/' + '-'.join(command_args.samples) + '/'

    make_dir_if_not_exists(command_args.output_dir)

    # log the run arguments next to the outputs for reproducibility
    with open(command_args.output_dir + '/args' + str(datetime.now()), 'w') as handle:
        for arg in vars(command_args):
            handle.write(str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')


#     with qp(jobname = 'GRM-build', q=['himem7.q'], mem_def = '30G', trds_def = 1, tryrerun=False,
    with fakeqp(jobname = 'GRM-build', q=['himem7.q'], mem_def = '30G', trds_def = 1, tryrerun=False,
        max_u = 25, delay_batch=15) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/") # TODO: change to your dir
        q.startpermanentrun()
        upload_job_buildGRM(q, command_args)

    return
def main():
    """Parse command-line arguments, validate the inputs, and dispatch
    pathway-classifier training jobs to the queue (or only concatenate /
    predict, per the flags)."""
    print('main')
    # argparse's `type=bool` is a pitfall: bool('False') is True, so any
    # non-empty string would enable a flag.  Parse booleans explicitly.
    _parse_bool = lambda s: s.lower() in ('1', 'true', 'yes')
    parser = argparse.ArgumentParser()
    # NOTE(review): `default` is ignored for a required positional argument.
    parser.add_argument('output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument('-n_cols_per_job',
                        help='Number of columns per job',
                        type=int,
                        default=10)
    parser.add_argument(
        '-path_to_X',
        '--path_to_X',
        help='Path to features data - X, separated by comma',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Paper_v4/unknown_pathway_prediction/metabolomics_levels_mean_null.csv'
    )
    parser.add_argument(
        '-path_to_Y',
        help='Path to labels - Y',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Paper_v4/unknown_pathway_prediction/super_pathway_y.csv'
    )
    parser.add_argument(
        '-names',
        help=
        'names of Xs, separated by comma. Must be same length as the number of Xs.',
        type=str,
        default='levels')
    parser.add_argument('-ntrees',
                        help='The number of trees for training',
                        type=int,
                        default=2000)
    parser.add_argument('-val_size',
                        help='The fraction of validation for the training',
                        type=float,
                        default=0.2)
    parser.add_argument(
        '-early_stopping_rounds',
        help='The number of early stopping rounds for the training',
        type=int,
        default=50)
    parser.add_argument('-over_sample',
                        help='Whether to over sample the samples',
                        type=_parse_bool,
                        default=False)
    parser.add_argument('-only_concat',
                        help='Whether to only concatenate the output files',
                        type=_parse_bool,
                        default=False)
    parser.add_argument('-only_predict_test',
                        help='Whether to only run the predict_test function',
                        type=_parse_bool,
                        default=False)
    parser.add_argument('-mem_def',
                        help='Amount of memory per job',
                        type=int,
                        default=1)
    parser.add_argument('-job_name',
                        help='Job preffix for q',
                        type=str,
                        default='PathwayClassifier')
    command_args = parser.parse_args()

    # validate that all feature files and the label file exist
    command_args.path_to_X = _convert_comma_separated_to_list(
        command_args.path_to_X)
    for x in command_args.path_to_X:
        if not os.path.exists(x):
            print(x, 'does not exist')
            return
    if not os.path.exists(command_args.path_to_Y):
        print(command_args.path_to_Y, 'does not exist!')
        return

    # one display name per feature file
    command_args.names = _convert_comma_separated_to_list(command_args.names)
    assert len(command_args.names) == len(command_args.path_to_X)

    if command_args.n_cols_per_job < 1 or command_args.n_cols_per_job > 1000:
        print("n_cols_per_job must be between 1 and 1000")
        return

    # short-circuit modes: only post-process previously computed outputs
    if command_args.only_concat:
        concat_outputs(command_args)
        return

    if command_args.only_predict_test:
        predict_test(command_args)
        return

    make_dir_if_not_exists(command_args.output_dir)

    log_run_details(command_args)

    # qp = fakeqp
    with qp(jobname=command_args.job_name,
            q=['himem7.q'],
            mem_def=str(command_args.mem_def) + 'G',
            trds_def=2,
            tryrerun=True,
            max_u=650,
            delay_batch=5) as q:
        os.chdir(
            "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_these_jobs(q, command_args)
Example #5
0
def main():
    """Parse command-line arguments, validate phenotype/GRM/sample choices,
    build the nested output directory layout, log the run arguments, and
    dispatch LMM jobs to the queue."""
    # argparse's `type=bool` is a pitfall: bool('False') is True, so any
    # non-empty string enables the flag.  Parse booleans explicitly.
    _parse_bool = lambda s: s.lower() in ('1', 'true', 'yes')
    parser = argparse.ArgumentParser()
    parser.add_argument('phenotype', help='Which phenotype/s to take as y. (Metabolomics_raw, Metabolomics_normed, BMI, Cohort, etc.)', type=str)
    parser.add_argument('samples', help='What samples to use. (ACS, MAR17, MAY18, etc.)', type=str)
    parser.add_argument('-output_dir', help='Path to output directory', type=str, default=LMM_DIR)
    parser.add_argument('-random', help='What are the random effects. (IGC, IGC-COG, MPA_species, MPA_species-phyla, KEGGBacGenes, Diet, etc.) separated by comma', type=str, default='IGC')
    parser.add_argument('-fixed', help='What are the fixed effects. (Age, Gender, BMI, Nextera, etc.) separated by comma', type=str, default='Age,Gender,Nextera')
    parser.add_argument('-n_phenotypes_per_job', help='Number of phenotypes per job', type=int, default=50)
    parser.add_argument('-use_quantile_normalization', help='Whether to use quantile normalization over microbiome data', type=_parse_bool, default=False)
    parser.add_argument('-jackknife_iterations', help='Number of Jackknife iterations', type=int, default=100)
    parser.add_argument('-output_name', help='Specify an output directory name', type=str, default=None)


#     parser.add_argument('-covariates', help='Which covariates to consider, separated by comma', type=str, default='Age,Gender')
#     parser.add_argument('-prevalence', help='In case of a case-control trait, what is the prevalence in the general population', type=float, default=None)
    parser.add_argument('-fiesta_iterations', help='Number of iterations to be made by FIESTA', type=int, default=100)

    command_args = parser.parse_args()
    make_dir_if_not_exists(command_args.output_dir)

    if command_args.output_name is not None:
        command_args.output_dir += '/' + command_args.output_name + '/'
        make_dir_if_not_exists(command_args.output_dir)

    if command_args.phenotype not in LEGAL_PHENOTYPES:
        print ('phenotype currently supported are: ' + ','.join(LEGAL_PHENOTYPES))
        return

    command_args.random = _convert_comma_separated_to_list(command_args.random)
    for grm in command_args.random:
        if grm not in LEGAL_GRMs:
            print ('grm currently supported are: ' + ','.join(LEGAL_GRMs))
            # the original used a bare `exit` expression, which is a no-op;
            # actually abort on an unsupported grm.
            return

    command_args.samples = _convert_comma_separated_to_list(command_args.samples)
    for samps in command_args.samples:
        if samps not in SAMPLES_DICT:
            print ('samples currently supported are: ' + ','.join(SAMPLES_DICT.keys()))
            # same fix as above: bare `exit` did not stop execution.
            return

    if command_args.use_quantile_normalization:
        command_args.output_dir += '/QN/'
        make_dir_if_not_exists(command_args.output_dir)

    # nested layout: <samples>/<phenotype>/<random effects>
    command_args.output_dir += '/' + '-'.join(command_args.samples) + '/'
    make_dir_if_not_exists(command_args.output_dir)
    command_args.output_dir += '/' + command_args.phenotype + '/'
    make_dir_if_not_exists(command_args.output_dir)
    command_args.output_dir += '/' + '+'.join(command_args.random) + '/'
    make_dir_if_not_exists(command_args.output_dir)

    # log the run arguments next to the outputs for reproducibility
    with open(command_args.output_dir + '/args' + str(datetime.now()), 'w') as handle:
        for arg in vars(command_args):
            handle.write(str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')

    command_args.fixed = _convert_comma_separated_to_list(command_args.fixed)

    with qp(jobname = 'LMMs', q=['himem7.q'], mem_def = '1G', trds_def = 1, tryrerun=False,
#     with fakeqp(jobname = 'LMMs', q=['himem7.q'], mem_def = '1G', trds_def = 1, tryrerun=False,
        max_u = 220, delay_batch=15) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_lmm_per_n_phenotypes(q, command_args)

    return
Example #6
0
def main():
    """Parse command-line arguments, validate the inputs, and dispatch
    plink cross-validation jobs to the queue (or only concatenate outputs)."""
    print('main')
    # argparse's `type=bool` is a pitfall: bool('False') is True, so any
    # non-empty string would enable a flag.  Parse booleans explicitly.
    _parse_bool = lambda s: s.lower() in ('1', 'true', 'yes')
    parser = argparse.ArgumentParser()
    # NOTE(review): `default` is ignored for a required positional argument.
    parser.add_argument('output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument('-model',
                        help='Which prediction model to use',
                        type=str,
                        default='lightgbm')
    parser.add_argument('-n_cols_per_job',
                        help='Number of columns per job',
                        type=int,
                        default=2)
    parser.add_argument('-n_random',
                        help='Number of random samples',
                        type=int,
                        default=20)
    parser.add_argument('-pfilter',
                        help='Threshold for p-value in association test',
                        type=float,
                        default=0.00001)
    parser.add_argument(
        '-path_to_plink_bfiles',
        '--path_to_plink_bfiles',
        help='Path to basename plink data',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Genetics/PNP_autosomal_clean2_nodfukim_norelated_Metabolon'
    )
    parser.add_argument(
        '-path_to_Y',
        help='Path to labels - Y',
        type=str,
        default=
        '/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/technical_noise/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs_regid.csv'
    )
    parser.add_argument('-k_folds',
                        help='Number of folds',
                        type=int,
                        default=10)
    parser.add_argument('-only_concat',
                        help='Whether to only concatenate the output files',
                        type=_parse_bool,
                        default=False)

    parser.add_argument('-mem_def',
                        help='Amount of memory per job',
                        type=int,
                        default=2)
    parser.add_argument('-job_name',
                        help='Job preffix for q',
                        type=str,
                        default='plink-cv')
    command_args = parser.parse_args()

    if (not os.path.exists(command_args.path_to_Y)
        ):  # (not os.path.exists(command_args.path_to_X)) or
        print("X or Y doesn't exist!")
        return
    if command_args.n_cols_per_job < 1 or command_args.n_cols_per_job > 1000:
        print("n_cols_per_job must be between 1 and 1000")
        return

    # short-circuit mode: only post-process previously computed outputs
    if command_args.only_concat:
        concat_outputs(command_args)
        return

    # if command_args.only_compute_abs_SHAP:
    #     _compute_abs_and_sign_SHAP(command_args.output_dir, command_args.path_to_X);
    #     return

    make_dir_if_not_exists(command_args.output_dir)

    log_run_details(command_args)

    # qp = fakeqp
    with qp(jobname=command_args.job_name,
            q=['himem7.q'],
            mem_def=str(command_args.mem_def) + 'G',
            trds_def=2,
            tryrerun=True,
            max_u=650,
            delay_batch=5) as q:
        os.chdir(
            "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_these_jobs(q, command_args)
Example #7
0
def main():
    """Parse command-line arguments, validate the feature/label files and
    model choice, and dispatch per-phenotype bootstrap jobs to the queue
    (or only concatenate previously computed outputs).
    """
    print('main')
    # argparse's `type=bool` is a pitfall: bool('False') is True, so any
    # non-empty string would enable a flag.  Parse booleans explicitly.
    _parse_bool = lambda s: s.lower() in ('1', 'true', 'yes')
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir',
                        help='Path to output directory',
                        type=str,
                        default=None)
    parser.add_argument(
        '-path_to_X',
        '--path_to_X',
        help='Path to features data - X',
        type=str,
        default=
        '/home/noamba/Analyses/Noamba/Metabolon/SHAP/dataframes/mar17_phenome.dat'
    )
    parser.add_argument(
        '-path_to_Y',
        help='Path to labels - Y',
        type=str,
        default=
        '/home/noamba/Analyses/Noamba/Metabolon/technical_noise/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs.dat'
    )
    # default='/home/noamba/Analyses/Noamba/Metabolon/SHAP/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs.dat')
    parser.add_argument('-model',
                        help='Which prediction model to use',
                        type=str,
                        default='lightgbm')
    parser.add_argument('-k_folds',
                        help='Number of folds',
                        type=int,
                        default=10)
    parser.add_argument('-only_concat',
                        help='Whether to only concatenate the output files',
                        type=_parse_bool,
                        default=False)
    parser.add_argument('-mem_def',
                        help='Number of folds',
                        type=int,
                        default=1)
    parser.add_argument('-n_BS',
                        help='Number of CI permutations',
                        type=int,
                        default=1000)
    parser.add_argument(
        '-multi_features',
        help='Whether to use the set of hyper parameters designed for a large '
        'number of features',
        type=_parse_bool,
        default=False)
    parser.add_argument(
        '-bootstrap_negative_estimate',
        help='Whether to run bootstrapping on estimates which are '
        'negative',
        type=_parse_bool,
        default=False)
    parser.add_argument('-log_transform',
                        help='Whether to log transform the data',
                        type=_parse_bool,
                        default=False)
    parser.add_argument(
        '-rand_folds',
        help='Whether to randomize the folds when bootstrapping',
        type=_parse_bool,
        default=False)
    parser.add_argument('-job_name',
                        help='Job preffix for q',
                        type=str,
                        default='')

    command_args = parser.parse_args()
    # check X and y exist

    command_args.Xs = _convert_comma_separated_to_list(command_args.path_to_X)

    for x in command_args.Xs:
        if not os.path.exists(x):
            print(x, 'does not exist.')
            return

    if not os.path.exists(command_args.path_to_Y):
        print("Y doesn't exist!")
        return
    # check model is legal
    if command_args.model not in supported_prediction_models:
        print('chosen model must be one of:',
              ', '.join(supported_prediction_models))
        return
    # only concat results, do not run bootstrapping
    if command_args.only_concat:
        concat_outputs(command_args)
        return

    make_dir_if_not_exists(command_args.output_dir)

    log_run_details(command_args)

    # derive a job name from the output directory when none was given
    if len(command_args.job_name) == 0:
        job_name = 'bs_' + command_args.output_dir.split('/')[-2]
    else:
        job_name = command_args.job_name
    # with fakeqp(jobname = job_name, q=['himem7.q'], mem_def = '1G',
    with qp(jobname=job_name,
            q=['himem7.q'],
            mem_def='1G',
            trds_def=2,
            tryrerun=True,
            max_u=550,
            delay_batch=5,
            max_r=550) as q:
        os.chdir(
            "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_job_per_y(q, command_args)