def KEGG_download_and_create_fasta_file_from_genes(genes_path, taxa='Bacteria',
                                                   aa_dna='protein', genes_per_batch=10000):
    KEGG_TEMP_FASTA = KEGG_DB_DIR + '/temp_fasta_' + aa_dna + '/'
    BASE_URL = base_urls[aa_dna]
    if not os.path.exists(KEGG_TEMP_FASTA):
        os.makedirs(KEGG_TEMP_FASTA)
    if not os.path.exists(KEGG_DB_DIR + '/kos_genes_dict.dat'):
        print "KO - genes file doesn't exist. Run: KEGG_build_KO_genes_dict()"
        return
    genes = [gene.split('\n')[0] for gene in open(genes_path, 'r').readlines()]
    # kos_genes_dict = Utils.Load(KEGG_DB_DIR + '/kos_genes_dict.dat')
    with qp(jobname='KGG2fasta', q=['himem7.q'], mem_def='1G', trds_def=1, tryrerun=True,
            max_u=210, delay_batch=15) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_create_fasta_files_jobs(q, KEGG_TEMP_FASTA, genes, BASE_URL, taxa,
                                       genes_per_batch)
    output_fasta = KEGG_DB_DIR + '_'.join(['/KEGG_genes', taxa, aa_dna]) + '.fa'
    temp_files = os.listdir(KEGG_TEMP_FASTA)
    temp_files.sort()
    with open(output_fasta, 'w') as handle:
        for temp_file in temp_files:
            print temp_file
            fasta_content = open(KEGG_TEMP_FASTA + '/' + temp_file, 'r').readlines()
            for line in fasta_content:
                handle.write(line)
            os.remove(KEGG_TEMP_FASTA + '/' + temp_file)
    os.removedirs(KEGG_TEMP_FASTA)
    return
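# Illustrative usage sketch (hedged: 'ko_genes.txt' is a hypothetical input file with one
# KEGG gene ID per line; KEGG_DB_DIR and base_urls are assumed module-level constants,
# with base_urls mapping each aa_dna value, e.g. 'protein', to a KEGG REST endpoint):
#
#   KEGG_download_and_create_fasta_file_from_genes(
#       KEGG_DB_DIR + '/ko_genes.txt', taxa='Bacteria', aa_dna='protein',
#       genes_per_batch=10000)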
def _grouping_with_sescg(mat, k, work_dir, corr_thresh, abscorr, presence_thresh,
                         corr_presence_thresh, first_split_n, split_n, corrmethod,
                         grouping_type):
    print(str(datetime.now()) + " - [_grouping_with_sescg] start - " + k)
    with qp('grping', delay_sec=1, tryrerun=True, mem_def='2G', trds_def=2,
            q=['himem7.q']) as q:
        q.startpermanentrun()
        ex_grps = semi_exahustive_strict_cor_grouping(mat, corr_thresh, abscorr,
                                                      presence_thresh, corr_presence_thresh,
                                                      first_split_n, split_n, corrmethod, q)
    cormet = mat.corr(corrmethod, corr_presence_thresh)

    def select_rep(grp):
        # pick a representative: among the group's columns with the maximal number of
        # non-null samples, take the one whose summed correlation to the others is highest
        if len(grp) == 1:
            return grp[0]
        grp_nnl = mat[grp].notnull().sum()
        max_nnl_idx = grp_nnl[grp_nnl == grp_nnl.max()].index
        return cormet.loc[max_nnl_idx, max_nnl_idx].sum(1).argmax()

    met_lg = concat((mat[select_rep(grp)] for grp in ex_grps), axis=1)
    Utils.Write(work_dir + '/' + k, met_lg)
    print(str(datetime.now()) + " - [_grouping_with_sescg] end - " + k)
    return met_lg
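# Illustrative call sketch (hedged: 'mat' is a samples-by-genes pandas DataFrame, the
# work dir is hypothetical, and the values mirror the defaults of the
# GroupEMGenesByKEGG main() below):
#
#   met_lg = _grouping_with_sescg(mat, 'K00001', '/path/to/work_dir', 0.75, True,
#                                 100, 50, 30, 1, 'spearman', 'rep')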
def main():
    print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] start")
    parser = argparse.ArgumentParser()
    parser.add_argument('-work_dir', help='Path to working directory', type=str, default=None)
    parser.add_argument('-output_name', help='Name of output dataframe file', type=str,
                        default='EMGenes_group.df')
    parser.add_argument('-keggDB', help='Type of KEGG database to group by', type=str,
                        default='ko')
    parser.add_argument('-min_size', help='Minimum size of KEGG DB to perform grouping on',
                        type=int, default=20)
    parser.add_argument('-max_size', help='Maximum size of KEGG DB to perform grouping on',
                        type=int, default=100000)
    parser.add_argument('-method', help='Which algorithm to use for grouping: newalg or sescg',
                        type=str, default='newalg')
    parser.add_argument('-grouping_type',
                        help='Whether to choose a representative or to sum: sum or rep',
                        type=str, default='rep')
    parser.add_argument('-corr_thresh', help='Correlation threshold to group by', type=float,
                        default=0.75)
    parser.add_argument('-abscorr', help='Use absolute value of correlation', type=bool,
                        default=True)
    parser.add_argument('-presence_thresh', help='Demand at least this amount of samples',
                        type=int, default=100)
    parser.add_argument('-corr_presence_thresh', help='Threshold for computing correlation',
                        type=int, default=50)
    parser.add_argument('-first_split_n', help='first split n', type=int, default=30)
    parser.add_argument('-split_n', help='split n', type=int, default=1)
    parser.add_argument('-corrmethod', help='Which correlation method to use', type=str,
                        default='spearman')
    parser.add_argument('-mirror', help='mirror option in newalg', type=bool, default=False)
    parser.add_argument('-just_concat', help='Whether to just read and concat the output',
                        type=bool, default=False)
    command_args = parser.parse_args()

    if command_args.work_dir is None:
        command_args.work_dir = METABOLON_DIR + '/Grouping_by_' + command_args.keggDB
    if not os.path.exists(command_args.work_dir):
        os.makedirs(command_args.work_dir)
    if command_args.keggDB not in POSSIBLE_KEGG_DB:
        print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] keggDB not legal")
        return
    if str(command_args.min_size).isdigit() is False or \
            str(command_args.max_size).isdigit() is False:
        print(str(datetime.now()) +
              " - [GroupEMGenesByKEGG:main] min_size and max_size must be integers")
        return
    if command_args.method not in POSSIBLE_METHODS:
        print("method must be one of: " + ', '.join(POSSIBLE_METHODS))
        return
    if command_args.corr_thresh > 1 or command_args.corr_thresh < -1:
        print(str(datetime.now()) +
              " - [GroupEMGenesByKEGG:main] corr_thresh must be between -1 and 1")
        return
    if command_args.corrmethod not in REASONABLE_CORRELATIONS:
        print("corrmethod must be one of: " + ', '.join(REASONABLE_CORRELATIONS))
        return
    if command_args.grouping_type not in REASONABLE_GROUPING_TYPES:
        print("grouping_type must be one of: " + ', '.join(REASONABLE_GROUPING_TYPES))
        return
    if command_args.just_concat:
        _concat_output(command_args.work_dir, command_args.output_name,
                       command_args.keggDB, False)
        return
    # log the run arguments
    with open(command_args.work_dir + '/args' + str(datetime.now()), 'w') as handle:
        for arg in vars(command_args):
            handle.write(str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')

    print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] build KEGG data holder")
    kegg = KEGG_data_holder()
    print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] Load EMGenes dataframe")
    from Analyses.MetabolonAnalysis import *
    M = MetabolonEMPairwiseCorrelationWriter()
    M.run()
    EMGenes = M.B
    print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] Divide EMGenes by " +
          command_args.keggDB)
    keggDict = {}
    EMGenes_columns_set = set(EMGenes.columns)
    if command_args.keggDB == 'ko':
        for k in kegg.get_dicts()['ko_bacgene'].keys():
            temp_genes = set(kegg.get_dicts()['ko_bacgene'][k])
            valid_genes = list(temp_genes.intersection(EMGenes_columns_set))
            if len(valid_genes) > 0:
                keggDict[k] = EMGenes[valid_genes]
    print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] uploading jobs to q...")
    with qp('m_grping', delay_sec=1, mem_def='1G', q=['himem7.q'], trds_def=1, max_u=420,
            tryrerun=True) as q_m:
        unused_genes = set()
        os.chdir(METABOLON_DIR)
        waiton = []
        q_m.startpermanentrun()
        for k in keggDict.keys():
            # skip KEGG entries outside the allowed size range, but keep their genes
            if keggDict[k].shape[1] < command_args.min_size or \
                    keggDict[k].shape[1] > command_args.max_size:
                unused_genes.update(keggDict[k].columns)
                continue
            if os.path.exists(command_args.work_dir + '/' + k):
                continue
            if keggDict[k].shape[1] > command_args.min_size:
                print k + ' - ' + str(keggDict[k].shape[1])
            if command_args.method == 'newalg':
                waiton.append(q_m.method(_grouping_with_newalg,
                                         (keggDict[k], k, command_args.work_dir,
                                          command_args.corr_thresh, command_args.abscorr,
                                          command_args.corr_presence_thresh,
                                          command_args.corrmethod, command_args.mirror,
                                          command_args.grouping_type)))
            elif command_args.method == 'sescg':
                waiton.append(q_m.method(_grouping_with_sescg,
                                         (keggDict[k], k, command_args.work_dir,
                                          command_args.corr_thresh, command_args.abscorr,
                                          command_args.presence_thresh,
                                          command_args.corr_presence_thresh,
                                          command_args.first_split_n, command_args.split_n,
                                          command_args.corrmethod,
                                          command_args.grouping_type)))
        res = q_m.waitforresults(waiton)
    unused_genes = list(unused_genes)
    Utils.Write(command_args.work_dir + '/ko:rest.df', EMGenes[unused_genes])
    print(str(datetime.now()) +
          " - [GroupEMGenesByKEGG:main] concatenating dataframes into one")
    _concat_output(command_args.work_dir, command_args.output_name, command_args.keggDB, True)
    print(str(datetime.now()) + " - [GroupEMGenesByKEGG:main] end")
    return
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('list_of_directories',
                        help='List of directories to take mapping files from separated with ---',
                        type=str, default=None)
    parser.add_argument('output_dir', help='Path to output directory', type=str, default=None)
    parser.add_argument('-reference', help='Path to DIAMOND reference file', type=str,
                        default=None)
    parser.add_argument('-max_target_seqs', help='--max-target-seqs parameter in DIAMOND',
                        type=int, default=1)
    parser.add_argument('-evalue', help='--evalue parameter in DIAMOND', type=float,
                        default=10.)
    parser.add_argument('-more_sensitive', help='--more-sensitive parameter in DIAMOND',
                        type=bool, default=True)
    parser.add_argument('-take_only_from_file', help='Path to list of samples prefixes to take',
                        type=str, default=None)
    parser.add_argument('-diamond_mapper',
                        help='Which diamond mapper to use, one of blastx, blastp',
                        type=str, default='blastx')
    parser.add_argument('-only_parse', help='Whether to only parse and create results',
                        type=bool, default=False)
    parser.add_argument('-divide_to_ko', help='Whether to divide the mapping counts into KOs',
                        type=bool, default=False)
    parser.add_argument('-divide_to_ko_th', help='Threshold for divide to ko', type=float,
                        default=1e-4)
    command_args = parser.parse_args()

    if command_args.list_of_directories is None or command_args.output_dir is None:
        return
    if command_args.only_parse:
        parse_diamond_output(command_args)
        return
    dirs_list = command_args.list_of_directories.split('---')
    for d in dirs_list:
        if not os.path.exists(d):
            print d + ' does not exist.'
            return
    if not os.path.exists(command_args.output_dir):
        os.makedirs(command_args.output_dir)
    if command_args.max_target_seqs < 1:
        return
    if command_args.take_only_from_file is not None:
        if not os.path.exists(command_args.take_only_from_file):
            return
    with open(command_args.output_dir + '/args' + str(datetime.now()), 'w') as handle:
        for arg in vars(command_args):
            handle.write(str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')
    with qp(jobname='RDiamond', q=['himem7.q'], mem_def='5G', trds_def=1, tryrerun=True,
            max_u=310, delay_batch=15) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_run_diamond_jobs_to_q(q, command_args)
    parse_diamond_output(command_args)
    return
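# Example invocation (illustrative; the script name and all paths are hypothetical, the
# flags mirror the parser above, and '---' separates the input directories):
#
#   python run_diamond.py /path/to/fastq_dir1---/path/to/fastq_dir2 /path/to/output \
#       -reference /path/to/kegg_proteins.dmnd -diamond_mapper blastx -evalue 0.001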
def main():
    print('main')
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir', help='Path to output directory', type=str, default=None)
    parser.add_argument('-n_cols_per_job', help='Number of columns per job', type=int,
                        default=10)
    parser.add_argument('-path_to_X', '--path_to_X',
                        help='Path to features data - X, separated by comma', type=str,
                        default='/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Paper_v4/unknown_pathway_prediction/metabolomics_levels_mean_null.csv')
    parser.add_argument('-path_to_Y', help='Path to labels - Y', type=str,
                        default='/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Paper_v4/unknown_pathway_prediction/super_pathway_y.csv')
    parser.add_argument('-names',
                        help='Names of Xs, separated by comma. Must be the same length as the number of Xs.',
                        type=str, default='levels')
    parser.add_argument('-ntrees', help='The number of trees for training', type=int,
                        default=2000)
    parser.add_argument('-val_size',
                        help='Fraction of the training data held out for validation',
                        type=float, default=0.2)
    parser.add_argument('-early_stopping_rounds',
                        help='The number of early stopping rounds for the training',
                        type=int, default=50)
    parser.add_argument('-over_sample', help='Whether to oversample the samples', type=bool,
                        default=False)
    parser.add_argument('-only_concat', help='Whether to only concatenate the output files',
                        type=bool, default=False)
    parser.add_argument('-only_predict_test',
                        help='Whether to only run the predict_test function', type=bool,
                        default=False)
    parser.add_argument('-mem_def', help='Amount of memory per job', type=int, default=1)
    parser.add_argument('-job_name', help='Job prefix for q', type=str,
                        default='PathwayClassifier')
    command_args = parser.parse_args()

    command_args.path_to_X = _convert_comma_separated_to_list(command_args.path_to_X)
    for x in command_args.path_to_X:
        if not os.path.exists(x):
            print(x, 'does not exist')
            return
    if not os.path.exists(command_args.path_to_Y):
        print(command_args.path_to_Y, 'does not exist!')
        return
    command_args.names = _convert_comma_separated_to_list(command_args.names)
    assert len(command_args.names) == len(command_args.path_to_X)
    if command_args.n_cols_per_job < 1 or command_args.n_cols_per_job > 1000:
        print("n_cols_per_job must be between 1 and 1000")
        return
    if command_args.only_concat:
        concat_outputs(command_args)
        return
    if command_args.only_predict_test:
        predict_test(command_args)
        return
    make_dir_if_not_exists(command_args.output_dir)
    log_run_details(command_args)
    # qp = fakeqp
    with qp(jobname=command_args.job_name, q=['himem7.q'],
            mem_def=str(command_args.mem_def) + 'G', trds_def=2, tryrerun=True, max_u=650,
            delay_batch=5) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_these_jobs(q, command_args)
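# Example invocation (illustrative; the script name is guessed from the default job name
# and the paths are hypothetical). Multiple feature matrices can be passed to -path_to_X
# as a comma-separated list, with a matching comma-separated -names:
#
#   python PathwayClassifier.py /path/to/output \
#       -path_to_X /path/to/levels.csv,/path/to/abundances.csv -names levels,abundances \
#       -ntrees 2000 -val_size 0.2 -early_stopping_rounds 50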
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('phenotype',
                        help='Which phenotype/s to take as y. (Metabolomics_raw, Metabolomics_normed, BMI, Cohort, etc.)',
                        type=str)
    parser.add_argument('samples', help='What samples to use. (ACS, MAR17, MAY18, etc.)',
                        type=str)
    parser.add_argument('-output_dir', help='Path to output directory', type=str,
                        default=LMM_DIR)
    parser.add_argument('-random',
                        help='What are the random effects. (IGC, IGC-COG, MPA_species, MPA_species-phyla, KEGGBacGenes, Diet, etc.) separated by comma',
                        type=str, default='IGC')
    parser.add_argument('-fixed',
                        help='What are the fixed effects. (Age, Gender, BMI, Nextera, etc.) separated by comma',
                        type=str, default='Age,Gender,Nextera')
    parser.add_argument('-n_phenotypes_per_job', help='Number of phenotypes per job',
                        type=int, default=50)
    parser.add_argument('-use_quantile_normalization',
                        help='Whether to use quantile normalization over microbiome data',
                        type=bool, default=False)
    parser.add_argument('-jackknife_iterations', help='Number of Jackknife iterations',
                        type=int, default=100)
    parser.add_argument('-output_name', help='Specify an output directory name', type=str,
                        default=None)
    # parser.add_argument('-covariates', help='Which covariates to consider, separated by comma', type=str, default='Age,Gender')
    # parser.add_argument('-prevalence', help='In case of a case-control trait, what is the prevalence in the general population', type=float, default=None)
    parser.add_argument('-fiesta_iterations', help='Number of iterations to be made by FIESTA',
                        type=int, default=100)
    command_args = parser.parse_args()

    make_dir_if_not_exists(command_args.output_dir)
    if command_args.output_name is not None:
        command_args.output_dir += '/' + command_args.output_name + '/'
        make_dir_if_not_exists(command_args.output_dir)
    if command_args.phenotype not in LEGAL_PHENOTYPES:
        print('phenotypes currently supported are: ' + ','.join(LEGAL_PHENOTYPES))
        return
    command_args.random = _convert_comma_separated_to_list(command_args.random)
    for grm in command_args.random:
        if grm not in LEGAL_GRMs:
            print('GRMs currently supported are: ' + ','.join(LEGAL_GRMs))
            return
    command_args.samples = _convert_comma_separated_to_list(command_args.samples)
    for samps in command_args.samples:
        if samps not in SAMPLES_DICT:
            print('samples currently supported are: ' + ','.join(SAMPLES_DICT.keys()))
            return
    # build the nested output directory: [/QN]/<samples>/<phenotype>/<random effects>/
    if command_args.use_quantile_normalization:
        command_args.output_dir += '/QN/'
        make_dir_if_not_exists(command_args.output_dir)
    command_args.output_dir += '/' + '-'.join(command_args.samples) + '/'
    make_dir_if_not_exists(command_args.output_dir)
    command_args.output_dir += '/' + command_args.phenotype + '/'
    make_dir_if_not_exists(command_args.output_dir)
    command_args.output_dir += '/' + '+'.join(command_args.random) + '/'
    make_dir_if_not_exists(command_args.output_dir)
    with open(command_args.output_dir + '/args' + str(datetime.now()), 'w') as handle:
        for arg in vars(command_args):
            handle.write(str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')
    command_args.fixed = _convert_comma_separated_to_list(command_args.fixed)
    with qp(jobname='LMMs', q=['himem7.q'], mem_def='1G', trds_def=1, tryrerun=False,
            # with fakeqp(jobname='LMMs', q=['himem7.q'], mem_def='1G', trds_def=1, tryrerun=False,
            max_u=220, delay_batch=15) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_lmm_per_n_phenotypes(q, command_args)
    return
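# Example invocation (illustrative; the script name is hypothetical, and the positional
# phenotype/samples values must be members of LEGAL_PHENOTYPES and SAMPLES_DICT):
#
#   python run_lmm.py Metabolomics_raw MAR17 -random IGC,MPA_species \
#       -fixed Age,Gender,Nextera -jackknife_iterations 100 -fiesta_iterations 100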
def main():
    print('main')
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir', help='Path to output directory', type=str, default=None)
    parser.add_argument('-model', help='Which prediction model to use', type=str,
                        default='lightgbm')
    parser.add_argument('-n_cols_per_job', help='Number of columns per job', type=int,
                        default=2)
    parser.add_argument('-n_random', help='Number of random samples', type=int, default=20)
    parser.add_argument('-pfilter', help='Threshold for p-value in association test',
                        type=float, default=0.00001)
    parser.add_argument('-path_to_plink_bfiles', '--path_to_plink_bfiles',
                        help='Path to basename plink data', type=str,
                        default='/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/Genetics/PNP_autosomal_clean2_nodfukim_norelated_Metabolon')
    parser.add_argument('-path_to_Y', help='Path to labels - Y', type=str,
                        default='/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Metabolon/technical_noise/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs_regid.csv')
    parser.add_argument('-k_folds', help='Number of folds', type=int, default=10)
    parser.add_argument('-only_concat', help='Whether to only concatenate the output files',
                        type=bool, default=False)
    parser.add_argument('-mem_def', help='Amount of memory per job', type=int, default=2)
    parser.add_argument('-job_name', help='Job prefix for q', type=str, default='plink-cv')
    command_args = parser.parse_args()

    if not os.path.exists(command_args.path_to_Y):  # (not os.path.exists(command_args.path_to_X)) or
        print("X or Y doesn't exist!")
        return
    if command_args.n_cols_per_job < 1 or command_args.n_cols_per_job > 1000:
        print("n_cols_per_job must be between 1 and 1000")
        return
    if command_args.only_concat:
        concat_outputs(command_args)
        return
    # if command_args.only_compute_abs_SHAP:
    #     _compute_abs_and_sign_SHAP(command_args.output_dir, command_args.path_to_X)
    #     return
    make_dir_if_not_exists(command_args.output_dir)
    log_run_details(command_args)
    # qp = fakeqp
    with qp(jobname=command_args.job_name, q=['himem7.q'],
            mem_def=str(command_args.mem_def) + 'G', trds_def=2, tryrerun=True, max_u=650,
            delay_batch=5) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        upload_these_jobs(q, command_args)
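# Example invocation (illustrative; the script name is guessed from the default job name
# and the output path is hypothetical; -pfilter is the association-test p-value threshold
# per its help string):
#
#   python plink_cv.py /path/to/output -model lightgbm -n_cols_per_job 2 \
#       -n_random 20 -pfilter 0.00001 -k_folds 10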
def main(): """ :return: """ print('main') parser = argparse.ArgumentParser() parser.add_argument('output_dir', help='Path to output directory', type=str, default=None) parser.add_argument( '-path_to_X', '--path_to_X', help='Path to features data - X', type=str, default= '/home/noamba/Analyses/Noamba/Metabolon/SHAP/dataframes/mar17_phenome.dat' ) parser.add_argument( '-path_to_Y', help='Path to labels - Y', type=str, default= '/home/noamba/Analyses/Noamba/Metabolon/technical_noise/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs.dat' ) # default='/home/noamba/Analyses/Noamba/Metabolon/SHAP/dataframes/mar17_metabolomics_grouped085_unnormed_fillna_min_dayfromfirstsample_regressed_rzs.dat') parser.add_argument('-model', help='Which prediction model to use', type=str, default='lightgbm') parser.add_argument('-k_folds', help='Number of folds', type=int, default=10) parser.add_argument('-only_concat', help='Whether to only concatenate the output files', type=bool, default=False) parser.add_argument('-mem_def', help='Number of folds', type=int, default=1) parser.add_argument('-n_BS', help='Number of CI permutations', type=int, default=1000) parser.add_argument( '-multi_features', help='Whether to use the set of hyper parameters designed for a large ' 'number of features', type=bool, default=False) parser.add_argument( '-bootstrap_negative_estimate', help='Whether to run bootstrapping on estimates which are ' 'negative', type=bool, default=False) parser.add_argument('-log_transform', help='Whether to log transform the data', type=bool, default=False) parser.add_argument( '-rand_folds', help='Whether to randomize the folds when bootstrapping', type=bool, default=False) parser.add_argument('-job_name', help='Job preffix for q', type=str, default='') command_args = parser.parse_args() # check X and y exist command_args.Xs = _convert_comma_separated_to_list(command_args.path_to_X) for x in command_args.Xs: if not os.path.exists(x): print(x, 'does not exist.') return if not os.path.exists(command_args.path_to_Y): print("Y doesn't exist!") return # check model is legal if command_args.model not in supported_prediction_models: print('chosen model must be one of:', ', '.join(supported_prediction_models)) return # only concat results, do not run bootstrapping if command_args.only_concat: concat_outputs(command_args) return make_dir_if_not_exists(command_args.output_dir) log_run_details(command_args) if len(command_args.job_name) == 0: job_name = 'bs_' + command_args.output_dir.split('/')[-2] else: job_name = command_args.job_name # with fakeqp(jobname = job_name, q=['himem7.q'], mem_def = '1G', with qp(jobname=job_name, q=['himem7.q'], mem_def='1G', trds_def=2, tryrerun=True, max_u=550, delay_batch=5, max_r=550) as q: os.chdir( "/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/") q.startpermanentrun() upload_job_per_y(q, command_args)
def main(): """ Compute all correlations between EM genes matrix and metabolites and write to disk. """ parser = argparse.ArgumentParser() parser.add_argument('-output_dir', help='Path to output directory', type=str, default=None) parser.add_argument( 'analysisType', help='What type of analysis to perform: single or shuffle', type=str, default='single') parser.add_argument('-data', help='What type of data: EMGenes or KO', type=str, default='EMGenes') parser.add_argument('-metabsPath', help='Path to metabolites file', type=str, default=METABOLON_DIR + '/tmp_files/metabs.df') parser.add_argument('-ppmetPath', help='Path to metabolite values file', type=str, default=METABOLON_DIR + '/tmp_files/ppmet_483x820.df') parser.add_argument('-EMGenesPath', help='Path to EMGenes dataframe', type=str, default=None) parser.add_argument('-ko', '--ko_path', help='Path to KOxsample data frame', type=str, default=METABOLON_DIR + 'KO_EMGenes.df') parser.add_argument('-f', '--from_file', help='Path to external file holding metabolites IDs', type=str, default=None) parser.add_argument('-rnk', '--remove_nonKEGG_metabs', type=bool, help='Whether to remove metabolits without KEGG IDs', default=True) parser.add_argument('-rhc', '--remove_high_corr_metabs', type=bool, help='Whether to remove highly correlated metabolits', default=False) parser.add_argument('-nc', '--n_cols_per_job', help='Number of columns per job', type=int, default=10000) parser.add_argument('-cp', '--minimal_corr_pair', help='Minimal number of pairs for test', type=int, default=10) parser.add_argument('-dp', '--directional_pval', type=bool, help='Whether to compute directional p-value', default=True) parser.add_argument('-oc', '--only_concat', type=bool, help='Whether to only run the concat function', default=False) parser.add_argument('-ns', '--num_of_shuffles', help='Number of shuffles to perform', type=int, default=100) parser.add_argument('-wc', '--write_corr', help='Whether to write the correlation files', type=bool, default=False) parser.add_argument('-pq', '--pass_path_to_q', help='Whether to pass the big df path to q', type=bool, default=True) parser.add_argument('-mem', '--memory_per_job', help='Amount of memory to assign each job', type=str, default='10G') command_args = parser.parse_args() if command_args.analysisType != 'single' and command_args.analysisType != 'shuffle': print(now() + " - [main] analysisType must be either single or shuffle.") return if command_args.data != 'EMGenes' and command_args.data != 'KO': print(now() + " - [main] data must be either EMGenes or KO.") return if not (os.path.exists(command_args.metabsPath) or os.path.exists(command_args.ppmetPath)): print(now() + " - [main] metabsPath or ppmetPath doesn't exist.") return if command_args.only_concat: concat_temp_corr_dfs(command_args.output_dir, command_args.n_cols_per_job, command_args.write_corr) return # return #TODO: check if using args works and then remove this line from Analyses.MetabolonAnalysis import * M = MetabolonEMPairwiseCorrelationWriter() if command_args.EMGenesPath is not None: M.A = Utils.Load(command_args.ppmetPath) M.B = Utils.Load(command_args.EMGenesPath) else: if command_args.data == 'KO': with open(command_args.ko_path, 'rb') as handle: M.B = pickle.load(handle) with open(command_args.ppmetPath, 'rb') as handle: M.A = pickle.load(handle) elif command_args.data == 'EMGenes': M.run() if command_args.output_dir is None and command_args.analysisType == 'single': command_args.output_dir = METABOLON_DIR + '/' + command_args.data + 
'_single_analysis/' elif command_args.output_dir is None and command_args.analysisType == 'shuffle': command_args.output_dir = METABOLON_DIR + '/' + command_args.data + '_shuffled_analysis/' if not os.path.exists(command_args.output_dir): os.makedirs(command_args.output_dir) with open(command_args.output_dir + '/args' + str(datetime.now()), 'w') as handle: for arg in vars(command_args): handle.write( str(arg) + '\t' + str(getattr(command_args, arg)) + '\n') if command_args.remove_nonKEGG_metabs: M.A = remove_non_kegg_metabolites(M.A, command_args.metabsPath) if command_args.remove_high_corr_metabs: M.A = remove_highly_correlated_metabolites(M.A) if command_args.from_file is not None: M.A = filter_metabolites_from_file(M.A, command_args.from_file) if command_args.analysisType == 'single': print(now() + " - uploading jobs to q...") with qp(jobname='BldSng', q=['himem7.q'], mem_def=command_args.memory_per_job, trds_def=1, max_u=120, tryrerun=True, deleteCSHwithnoerr=True) as q: os.chdir(METABOLON_DIR) q.startpermanentrun() res = upload_jobs(q, M.A, M.B, command_args.output_dir, command_args.minimal_corr_pair, command_args.directional_pval, command_args.n_cols_per_job, command_args.write_corr) concat_temp_corr_dfs(command_args.output_dir, command_args.n_cols_per_job, command_args.write_corr) elif command_args.analysisType == 'shuffle': with qp(jobname='BldShf', q=['himem7.q'], mem_def=command_args.memory_per_job, trds_def=1, max_u=320, tryrerun=True, deleteCSHwithnoerr=True) as q: # M.B = M.B.iloc[:,0:10000] # M.A = M.A.iloc[:,0:2] os.chdir(METABOLON_DIR) q.startpermanentrun() res = upload_jobs_shuffled(q, M.A, M.B, command_args.output_dir, command_args.minimal_corr_pair, command_args.directional_pval, command_args.num_of_shuffles, command_args.write_corr, command_args.pass_path_to_q) return
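# Example invocation (illustrative; the script name is hypothetical). 'single' computes
# the correlations once; 'shuffle' repeats them num_of_shuffles times on shuffled data:
#
#   python em_metabolite_corrs.py single -data EMGenes -nc 10000 -cp 10 -mem 10G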
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir', help='Path to output directory', type=str, default=None)
    parser.add_argument('-dim_red_method',
                        help='Dimensionality reduction method: PCA, PCoA or None (currently only these are supported)',
                        type=str, default='PCA')
    parser.add_argument('-output_df', help='Path to final data frame', type=str, default=None)
    parser.add_argument('-n_runs', help='Number of random predictions to run', type=int,
                        default=10)
    parser.add_argument('-start_from_index', help='Start naming files from this index',
                        type=int, default=0)
    parser.add_argument('-path_to_X', help='Path to features data - X', type=str,
                        default='/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Cardio/Cardio07112017/EMGenes_binary_joined.csv')
    parser.add_argument('-path_to_Y', help='Path to labels - Y', type=str,
                        default='/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/Cardio/Cardio07112017/PCoA_sites_EMGenes_Y.dat')
    parser.add_argument('-only_concat',
                        help='Whether to only concatenate the temporary output files',
                        type=bool, default=False)
    parser.add_argument('-use_projection',
                        help='Whether to project the test data in each k fold', type=bool,
                        default=False)
    command_args = parser.parse_args()

    if command_args.output_dir is None:
        return
    if command_args.output_df is None:
        command_args.output_df = command_args.output_dir + '/final_predictions.dat'
    if (not os.path.exists(command_args.path_to_X)) or \
            (not os.path.exists(command_args.path_to_Y)):
        return
    if command_args.n_runs < 1 or command_args.n_runs > 100000:
        return
    if command_args.only_concat:
        concat_temp_files(command_args)
        return
    if command_args.dim_red_method not in LEGAL_DIM_REDUCTION_METHODS:
        print "dim_red_method must be one of: " + ', '.join(LEGAL_DIM_REDUCTION_METHODS)
        return
    # if (command_args.dim_red_method == 'PCoA') and (not re.match(command_args.dim_red_method + '.+',
    #                                                              command_args.path_to_X.split('/')[-1])):
    #     print "dim_red_method doesn't match input X"
    #     return
    # if (command_args.dim_red_method == 'PCA') and (re.match('PCoA.+', command_args.path_to_X.split('/')[-1])):
    #     print "dim_red_method doesn't match input X"
    #     return
    # if (command_args.use_projection is False) and (not re.match(command_args.dim_red_method + '.+', command_args.path_to_X.split('/')[-1])):
    #     print "When not using projection X input must start with the dimensionality reduction method"
    #     return
    # if command_args.dim_red_method == 'PCoA' and command_args.use_projection:
    #     print "Projection of test data using PCoA isn't supported at the moment"
    #     return
    if not os.path.exists(command_args.output_dir):
        os.makedirs(command_args.output_dir)
    with open(command_args.output_dir + '/args' + str(datetime.now()), 'w') as handle:
        for arg in vars(command_args):
            handle.write(str(arg) + '\t' + str(getattr(command_args, arg)) + '\n')
    # projecting the test data in each fold is memory intensive
    if command_args.use_projection:
        mem_def = '20G'
    else:
        mem_def = '1G'
    with qp(jobname='RandPredict', q=['himem7.q'], mem_def=mem_def, trds_def=1,
            tryrerun=True, max_u=110, delay_batch=5) as q:
        os.chdir("/net/mraid08/export/jafar/Microbiome/Analyses/Noamba/temp_q_dir/")
        q.startpermanentrun()
        generate_random_predictors(q, command_args)
    concat_temp_files(command_args)
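# Example invocation (illustrative; the script name and output path are hypothetical,
# with X and Y falling back to the parser defaults above):
#
#   python random_predictors.py /path/to/output -dim_red_method PCA -n_runs 10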
def main(): """ Compute all correlations between EM genes matrix and metabolites and write to disk. """ parser = argparse.ArgumentParser() parser.add_argument('work_dir', help='Path to working directory', type=str, default=None) parser.add_argument( '-analysisType', help='What type of analysis to perform: normal or shuffle', type=str, default='normal') parser.add_argument( '-keggdb', help='Which KEGG DB to consider: reaction, pathway, module, ko, all', type=str, default='ko') parser.add_argument('-plot', help='Whether to call the plotting function', type=bool, default=False) parser.add_argument('-gsea', help='Name of GSEA results file in working directory', type=str, default='GSEA') parser.add_argument('-metabsPath', help='Path to metabolites file', type=str, default=METABOLON_DIR + '/tmp_files/metabs.df') parser.add_argument( '-stat_test', help='Type of statistical test: mannwhitneyu, directed_mannwhitneyu', type=str, default='directed_mannwhitneyu') parser.add_argument('-dsc', '--down_stream_cols', help='What type of data: bac_genes or ko', type=str, default='bac_genes') parser.add_argument('-tkdbff', '--take_kdb_from_file', help='Path to KEGG DB', type=str, default=None) parser.add_argument('-tkprff', '--take_metab_kdb_pair_from_file', help='Path to metabolite-KEGGDB pairs file', type=str, default=None) parser.add_argument('-min_dsc', '--minimal_downstream_cols', help='Minimal number downstream columns in test', type=int, default=5) parser.add_argument('-max_dsc', '--maximal_downstream_cols', help='Maximal number downstream columns in test', type=int, default=10000) parser.add_argument('-max_kos_db', '--maximal_kos_per_db', help='Maximal number KOs per KEGG DB', type=int, default=100) parser.add_argument('-fpb', '--files_per_batch', help='Number of files to parse in each job', type=int, default=100) parser.add_argument('-kpc', '--kos_per_compound', help='Maximal number of KOs per compound', type=int, default=2000) parser.add_argument('-cpk', '--compounds_per_ko', help='Maximal number of compound per KO', type=int, default=500) command_args = parser.parse_args() if command_args.work_dir is None: print(str(datetime.now()) + " - [main] work_dir must be passed.") return if command_args.analysisType != 'normal' and command_args.analysisType != 'shuffle': print( str(datetime.now()) + " - [main] analysisType must be normal or shuffle.") return if command_args.down_stream_cols not in POSSIBLE_DOWNSTREAM_COL: print( str(datetime.now()) + " - [main] down_stream_cols must be bac_genes or ko.") return if command_args.keggdb != 'all' and command_args.keggdb not in POSSIBLE_KEGG_DB: print(str(datetime.now()) + " - [main] Illegal KEGG DB.") return if command_args.stat_test not in POSSIBLE_STAT_TESTS: print(str(datetime.now()) + " - [main] Illegal stat test.") return with open(command_args.work_dir + '/args' + str(datetime.now()), 'w') as handle: for arg in vars(command_args): handle.write( str(arg) + '\t' + str(getattr(command_args, arg)) + '\n') # mkg = MetabolonKEGGGWAS(command_args.work_dir, command_args.down_stream_cols, command_args.gsea, # command_args.metabsPath, command_args.stat_test, # command_args.plot, command_args.take_kdb_from_file, # command_args.take_metab_kdb_pair_from_file) if command_args.analysisType == 'normal': pval_files = os.listdir(command_args.work_dir) pval_files = [ f for f in pval_files if len(f) > 4 and f[0:5] == 'pvals' ] pval_files.sort(key=natural_keys) if len(pval_files) > command_args.files_per_batch: with qp('MetGWAS', delay_sec=1, mem_def='10G', q=['himem7.q'], 
trds_def=1, max_u=120, tryrerun=True) as q: os.chdir(METABOLON_DIR) waiton = [] q.startpermanentrun() for i in range(0, len(pval_files), command_args.files_per_batch): print( str(datetime.now()) + " - [main] uploading job " + str(i) + " to q") waiton.append( q.method( divide_jobs, (command_args, pval_files[i:i + command_args.files_per_batch], i))) res = q.waitforresults(waiton) print(str(datetime.now()) + " - [main] Results are back") concat_gsea_files(command_args.work_dir, command_args.gsea) return else: mkg = MetabolonKEGGGWAS( command_args.work_dir, command_args.down_stream_cols, command_args.gsea, command_args.metabsPath, command_args.stat_test, command_args.kos_per_compound, command_args.compounds_per_ko, command_args.plot, command_args.take_kdb_from_file, command_args.take_metab_kdb_pair_from_file) for f in pval_files: mkg._get_pvals_matrix(command_args.work_dir + '/' + f) if command_args.keggdb == 'all': for kdb in POSSIBLE_KEGG_DB: mkg.all_rank_sum_tests( kdb, f, command_args.minimal_downstream_cols, command_args.maximal_downstream_cols, command_args.maximal_kos_per_db) else: mkg.all_rank_sum_tests( command_args.keggdb, f, command_args.minimal_downstream_cols, command_args.maximal_downstream_cols, command_args.maximal_kos_per_db) elif command_args.analysisType == 'shuffle': # no much reason to use this, since shuffling is done before hand pass
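# Example invocation (illustrative; the script name is hypothetical). work_dir must
# already contain the 'pvals*' files the script scans for:
#
#   python metabolon_kegg_gwas.py /path/to/work_dir -keggdb ko \
#       -stat_test directed_mannwhitneyu -min_dsc 5 -max_kos_db 100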