import os
import glob
import random
import itertools

import pandas as pd
from os.path import basename, isdir, isfile, splitext

# NOTE: helpers such as get_case, get_new_meta_pd, read_meta_pd and the
# write_*_cmd functions are package-local and assumed importable from the
# surrounding pipeline modules.


def run_single_phate(dat: str, odir: str, tsv: str, meta_pd: pd.DataFrame,
                     case_var: str, phate_labels: list, phate_params: dict,
                     run_params: dict, case_vals_list: list, cur_sh: str,
                     cur_import_sh: str, force: bool, filt: str,
                     cur_raref: str, fp: str, fa: str) -> dict:
    remove = True
    qza = '%s.qza' % splitext(tsv)[0]
    cases = {}
    with open(cur_sh, 'w') as cur_sh_o, \
            open(cur_import_sh, 'w') as cur_import_sh_o:
        for case_vals in case_vals_list:
            case = get_case(case_vals, '', case_var)
            cur_rad = '%s/%s_%s%s' % (odir, case.strip('_'), filt, cur_raref)
            if not isdir(cur_rad):
                os.makedirs(cur_rad)
            new_meta = '%s/meta.tsv' % cur_rad
            new_qza = '%s/tab.qza' % cur_rad
            new_tsv = '%s/tab.tsv' % cur_rad
            phate_html = '%s/phate_%s_%s_%s.html' % (cur_rad, dat, filt, case)
            phate_tsv = '%s_xphate.tsv' % splitext(phate_html)[0]
            if len(glob.glob('%s/TOO_FEW.*' % cur_rad)):
                continue
            cases[case] = phate_tsv
            if force or not isfile(phate_html) or not isfile(phate_tsv):
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                new_meta_pd.reset_index().to_csv(new_meta, index=False,
                                                 sep='\t')
                write_phate_cmd(qza, new_qza, new_tsv, new_meta, fp, fa,
                                phate_html, phate_labels, phate_params,
                                run_params["n_nodes"], run_params["n_procs"],
                                cur_sh_o, cur_import_sh_o)
                remove = False
    # assumed closer: the otherwise-unused `remove` flag and the `-> dict`
    # annotation imply that an empty script is dropped and the collected
    # outputs returned
    if remove:
        os.remove(cur_sh)
    return cases
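# Hedged usage sketch for `run_single_phate` (all values below are
# hypothetical, not from the pipeline): the function only writes the
# qiime2/PHATE commands into `cur_sh`/`cur_import_sh` and returns
# {case: xphate_tsv} for downstream DOC runs.
#
#   cases = run_single_phate(
#       dat='16S', odir='analysis/phate/16S', tsv='data/tab_16S.tsv',
#       meta_pd=meta_pd, case_var='ALL', phate_labels=[], phate_params={},
#       run_params={'n_nodes': '1', 'n_procs': '4'}, case_vals_list=[[]],
#       cur_sh='run_phate.sh', cur_import_sh='import_phate.sh', force=False,
#       filt='0.1_0001', cur_raref='', fp='', fa='')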
def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, split: bool,
                       run_params: dict, filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):
    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)
    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in \
                    metrics_groups_metas_qzas_dms_trees.items():
                for group, metas_qzas_mat_qzas_trees in \
                        groups_metas_qzas_dms_trees.items():
                    for (meta, qza, mat_qza, tree) in \
                            metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(case_vals,
                                                case_var).replace(' ', '_')
                                cur_sh = '%s/run_decay_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_raref, metric,
                                    group, case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh,
                                    mat_qza, case, modes, force,
                                    run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
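# Shape of the `betas` input, as implied by the nested loops above (a
# sketch, not an authoritative schema):
#
#   betas = {
#       dat: [                  # one entry per rarefaction depth
#           {metric: {group: [(meta, qza, mat_qza, tree), ...]}},
#           ...
#       ],
#   }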
def merge_subsets_apply(self):
    subsets_fp = [
        [pair, var, subset, get_case(subset, var)]
        for var, subsets in self.subsets.items()
        for subset in subsets
        for pair in self.mmvecs.pair.unique()]
    if subsets_fp:
        subsets = pd.DataFrame(
            subsets_fp,
            columns=['pair', 'variable', 'factors', 'subset'])
        self.mmvecs = self.mmvecs.merge(subsets, on=['pair'], how='outer')
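def _demo_merge_subsets_pattern():
    # Illustrative only (toy data, hypothetical values): the method above
    # builds every (pair x subset) combination as rows, then outer-merges on
    # 'pair' so each mmvec row is replicated once per subset.
    mmvecs = pd.DataFrame({'pair': ['p1', 'p2'], 'omic': ['16S', 'MG']})
    subsets = pd.DataFrame(
        [[pair, var, facts, case]
         for var, facts, case in [('sex', ['F'], 'sex_F'),
                                  ('sex', ['M'], 'sex_M')]
         for pair in mmvecs.pair.unique()],
        columns=['pair', 'variable', 'factors', 'subset'])
    merged = mmvecs.merge(subsets, on=['pair'], how='outer')
    print(merged)  # one row per (pair, subset) combination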
def run_single_deicode(odir: str, tsv: str, meta_pd: pd.DataFrame,
                       case_var: str, case_vals_list: list, cur_sh: str,
                       force: bool) -> None:
    """
    Performs a robust centered log-ratio transform and robust PCA, and ranks
    the features by the loadings of the resulting SVD.
    https://library.qiime2.org/plugins/deicode/19/
    (in-loop function).

    :param odir: output analysis directory.
    :param tsv: features table input to the beta diversity matrix.
    :param meta_pd: metadata table.
    :param case_var: metadata variable to make groups from.
    :param case_vals_list: groups for the metadata variable.
    :param cur_sh: input bash script file.
    :param force: Force the re-writing of scripts for all commands.
    """
    remove = True
    qza = '%s.qza' % splitext(tsv)[0]
    with open(cur_sh, 'w') as cur_sh_o:
        for case_vals in case_vals_list:
            case = get_case(case_vals, '', case_var)
            cur_rad = odir + '/' + basename(tsv).replace('.tsv', '_%s' % case)
            new_meta = '%s.meta' % cur_rad
            new_mat_qza = '%s_DM.qza' % cur_rad
            new_qza = '%s.qza' % cur_rad
            ordi_qza = '%s_deicode_ordination.qza' % cur_rad
            ordi_qzv = '%s_deicode_ordination_biplot.qzv' % cur_rad
            if force or not isfile(ordi_qzv):
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                if new_meta_pd.shape[0] < 10:
                    # too few samples in this subset to be worth running
                    continue
                new_meta_pd.reset_index().to_csv(new_meta, index=False,
                                                 sep='\t')
                write_deicode_biplot(qza, new_meta, new_qza, ordi_qza,
                                     new_mat_qza, ordi_qzv, cur_sh_o)
                remove = False
    # assumed closer (implied by the otherwise-unused `remove` flag):
    # drop the script if no command was written into it
    if remove:
        os.remove(cur_sh)
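# Hedged usage sketch for `run_single_deicode` (paths below are
# hypothetical, not from the pipeline): the function only writes the DEICODE
# biplot commands into `cur_sh`; the script is executed later by the
# scheduler.
#
#   run_single_deicode(
#       odir='analysis/deicode/16S', tsv='data/tab_16S.tsv',
#       meta_pd=meta_pd, case_var='ALL', case_vals_list=[[]],
#       cur_sh='run_deicode.sh', force=False)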
def merge_subsets_apply(self):
    subsets_fp = [
        [dataset, var, subset, get_case(subset, var), '']
        for var, subsets in self.songbird_subsets.items()
        for subset in subsets
        for dataset in self.songbirds.dataset.unique()]
    if subsets_fp:
        subsets = pd.DataFrame(
            subsets_fp,
            columns=['dataset', 'variable', 'factors', 'subset', 'pair'])
        self.songbirds = self.songbirds.merge(
            subsets, on=['dataset'], how='outer')
def run_multi_kw(odir: str, meta_pd: pd.DataFrame, div_qza: str,
                 case_vals_list: list, case_var: str, cur_sh: str,
                 force: bool) -> None:
    """
    Run alpha-group-significance: Alpha diversity comparisons.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-group-significance/
    (in-loop function).

    :param odir: output analysis directory.
    :param meta_pd: metadata table.
    :param div_qza: alpha diversity vector artifact.
    :param case_vals_list: groups for the metadata variable.
    :param case_var: metadata variable to make groups from.
    :param cur_sh: input bash script file.
    :param force: Force the re-writing of scripts for all commands.
    """
    remove = True
    with open(cur_sh, 'w') as cur_sh_o:
        for case_vals in case_vals_list:
            case = get_case(case_vals, case_var)
            cur_rad = odir + '/' + basename(div_qza).replace(
                '.qza', '_%s' % case)
            new_qzv = '%s_kruskal-wallis.qzv' % cur_rad
            if force or not isfile(new_qzv):
                new_meta = '%s.meta' % cur_rad
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                new_meta_pd.reset_index().to_csv(new_meta, index=False,
                                                 sep='\t')
                new_div = get_new_alpha_div(case, div_qza, cur_rad,
                                            new_meta_pd, cur_sh_o)
                write_alpha_group_significance_cmd(new_div, new_meta,
                                                   new_qzv, cur_sh_o)
                remove = False
    # assumed closer (implied by the otherwise-unused `remove` flag)
    if remove:
        os.remove(cur_sh)
def run_single_adonis(odir: str, subset: str, case_vals_list: list,
                      metric: str, case_var: str, form: str, formula: str,
                      qza: str, mat_qza: str, meta_pd: pd.DataFrame,
                      cur_sh: str, force: bool) -> None:
    """
    Run adonis: adonis PERMANOVA test for beta group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/adonis/
    (in-loop function).

    :param odir: output analysis directory.
    :param subset: name of the samples subset (may be empty).
    :param case_vals_list: groups for the metadata variable.
    :param metric: beta diversity metric.
    :param case_var: metadata variable to make groups from.
    :param form: name for the formula (used in the case label).
    :param formula: adonis formula.
    :param qza: features table input to the beta diversity matrix.
    :param mat_qza: beta diversity distance matrix artifact.
    :param meta_pd: metadata table.
    :param cur_sh: input bash script file.
    :param force: Force the re-writing of scripts for all commands.
    """
    remove = True
    with open(cur_sh, 'w') as cur_sh_o:
        for case_vals in case_vals_list:
            case = '%s__%s' % (metric, get_case(case_vals, case_var, form))
            if subset:
                cur_rad = '%s/%s_%s_%s' % (odir, splitext(basename(qza))[0],
                                           subset, case)
            else:
                cur_rad = '%s/%s_%s' % (odir, splitext(basename(qza))[0],
                                        case)
            new_meta = '%s.meta' % cur_rad
            new_qzv = '%s_adonis.qzv' % cur_rad
            new_mat_qza = '%s/%s' % (odir, basename(mat_qza).replace(
                '.qza', '_%s.qza' % case))
            new_meta_pd = get_new_meta_pd(meta_pd, case, case_var, case_vals)
            new_meta_pd.reset_index().to_csv(new_meta, index=False, sep='\t')
            if force or not isfile(new_qzv):
                write_diversity_adonis(new_meta, mat_qza, new_mat_qza,
                                       formula, new_qzv, cur_sh_o)
                remove = False
    # assumed closer (implied by the otherwise-unused `remove` flag)
    if remove:
        os.remove(cur_sh)
def get_common_datasets(i_datasets_folder: str, mmvec_pairs: dict,
                        filtering: dict, filt_datasets: dict,
                        common_datasets_done: dict, input_to_filtered: dict,
                        train_test_dict: dict, force: bool,
                        subsets: dict) -> (dict, list):
    """
    :param i_datasets_folder: path to the folder containing the
        data/metadata subfolders.
    :param mmvec_pairs: pairs of datasets to co-occur with mmvec.
    :param filtering: filtering thresholds per pair.
    :param filt_datasets: filtered datasets (tables, metadata, samples).
    :param common_datasets_done: common datasets already processed, per pair.
    :param input_to_filtered: mapping from input to filtered dataset names.
    :param train_test_dict: train/test columns specification.
    :param force: Force the re-writing of scripts for all commands.
    :param subsets: samples subsets.
    :return: common datasets per pair, and the filtering/export commands.
    """
    common_jobs = []
    common_datasets = {}
    for pair, pair_datasets in mmvec_pairs.items():
        (omic1_, bool1), (omic2_, bool2) = pair_datasets
        if omic1_ not in input_to_filtered or omic2_ not in input_to_filtered:
            continue
        omic1 = input_to_filtered[omic1_]
        omic2 = input_to_filtered[omic2_]
        if (omic1, bool1) not in filt_datasets or \
                (omic2, bool2) not in filt_datasets:
            continue
        pair_filtering = filtering[pair]
        for case_var, case_vals_list in subsets.items():
            for case_vals in case_vals_list:
                case = get_case(case_vals, case_var)
                data_dir = get_analysis_folder(
                    i_datasets_folder,
                    'mmvec/common/data/%s/%s' % (pair, case))
                meta_dir = get_analysis_folder(
                    i_datasets_folder,
                    'mmvec/common/metadata/%s/%s' % (pair, case))
                for preval_abund, preval_abund_dats in sorted(
                        pair_filtering.items()):
                    preval_filt1, abund_filter1 = preval_abund_dats[
                        (omic1_, bool1)]
                    preval_filt2, abund_filter2 = preval_abund_dats[
                        (omic2_, bool2)]
                    filt1 = '%s_%s' % (preval_filt1, abund_filter1)
                    filt2 = '%s_%s' % (preval_filt2, abund_filter2)
                    if (case, preval_abund) not in filt_datasets[
                            (omic1, bool1)]:
                        continue
                    if (case, preval_abund) not in filt_datasets[
                            (omic2, bool2)]:
                        continue
                    tsv1, qza1, meta1, meta_pd1, sams1 = filt_datasets[
                        (omic1, bool1)][(case, preval_abund)]
                    tsv2, qza2, meta2, meta_pd2, sams2 = filt_datasets[
                        (omic2, bool2)][(case, preval_abund)]
                    common_sams = sorted(set(sams1) & set(sams2))
                    len_common_sams = len(common_sams)
                    if len_common_sams < 10:
                        print('Not enough samples: %s (%s) vs %s (%s) '
                              '-> skipping' % (omic1, filt1, omic2, filt2))
                        continue
                    meta_fp = '%s/meta_%s_%s_%s__%s_%s_%s__%s_%ss.tsv' % (
                        meta_dir, omic1, preval_filt1, abund_filter1,
                        omic2, preval_filt2, abund_filter2, pair,
                        len_common_sams)
                    new_tsv1 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic1, preval_filt1, abund_filter1,
                        pair, len_common_sams)
                    new_qza1 = '%s.qza' % splitext(new_tsv1)[0]
                    new_tsv2 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic2, preval_filt2, abund_filter2,
                        pair, len_common_sams)
                    new_qza2 = '%s.qza' % splitext(new_tsv2)[0]
                    common_datasets.setdefault(pair, []).append([
                        meta_fp, omic1, omic2, filt1, filt2, new_tsv1,
                        new_tsv2, new_qza1, new_qza2, len_common_sams, case])
                    meta_subset1 = get_meta_common_sorted(meta_pd1,
                                                          common_sams)
                    meta_subset2 = get_meta_common_sorted(meta_pd2,
                                                          common_sams)
                    merge_and_write_metas(meta_subset1, meta_subset2,
                                          meta_fp, omic1, omic2,
                                          train_test_dict)
                    if meta_fp in common_datasets_done[pair]:
                        print('\t\t\t* [DONE]', pair, ':',
                              omic1, filt1, omic2, filt2)
                        continue
                    if force or not isfile(new_qza1):
                        cmd = filter_feature_table(qza1, new_qza1, meta_fp)
                        common_jobs.append(cmd)
                    if force or not isfile(new_tsv1):
                        cmd = run_export(new_qza1, new_tsv1, 'FeatureTable')
                        common_jobs.append(cmd)
                    if force or not isfile(new_qza2):
                        cmd = filter_feature_table(qza2, new_qza2, meta_fp)
                        common_jobs.append(cmd)
                    if force or not isfile(new_tsv2):
                        cmd = run_export(new_qza2, new_tsv2, 'FeatureTable')
                        common_jobs.append(cmd)
                    print('\t\t\t* [TODO]', pair, ':', omic1,
                          '[%s: %s]' % (filt1, meta_subset1.shape[0]), omic2,
                          '[%s: %s]' % (filt2, meta_subset2.shape[0]))
    # assumed closer (implied by the `-> (dict, list)` annotation)
    return common_datasets, common_jobs
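# Shape of `filt_datasets`, as implied by the unpacking above (a sketch,
# not an authoritative schema):
#
#   filt_datasets = {
#       (omic, bool): {
#           (case, preval_abund): (tsv, qza, meta, meta_pd, sams),
#       },
#   }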
def check_common_datasets(i_datasets_folder: str, mmvec_pairs: dict,
                          mmvec_filtering: dict, filt_datasets_pass: dict,
                          input_to_filtered: dict,
                          mmvec_subsets: dict) -> (dict, list):
    """
    :param i_datasets_folder: path to the folder containing the
        data/metadata subfolders.
    :param mmvec_pairs: pairs of datasets to co-occur with mmvec.
    :param mmvec_filtering: filtering thresholds per pair.
    :param filt_datasets_pass: filtered datasets that passed the checks.
    :param input_to_filtered: mapping from input to filtered dataset names.
    :param mmvec_subsets: samples subsets.
    :return: per-pair metadata file paths of common datasets already on disk.
    """
    common_datasets_pass = {}
    for pair, pair_datasets in mmvec_pairs.items():
        pair_filtering = mmvec_filtering[pair]
        common_datasets_pass[pair] = []
        data_dir_ = get_analysis_folder(i_datasets_folder,
                                        'mmvec/common/data/%s' % pair)
        meta_dir_ = get_analysis_folder(i_datasets_folder,
                                        'mmvec/common/metadata/%s' % pair)
        (omic1_, bool1), (omic2_, bool2) = pair_datasets
        if omic1_ not in input_to_filtered or omic2_ not in input_to_filtered:
            continue
        omic1 = input_to_filtered[omic1_]
        omic2 = input_to_filtered[omic2_]
        if (omic1, bool1) not in filt_datasets_pass or \
                (omic2, bool2) not in filt_datasets_pass:
            continue
        for case_var, case_vals_list in mmvec_subsets.items():
            for case_vals in case_vals_list:
                case = get_case(case_vals, case_var)
                data_dir = data_dir_ + '/' + case
                meta_dir = meta_dir_ + '/' + case
                for preval_abund in sorted(pair_filtering):
                    preval_filt1, abund_filter1 = pair_filtering[
                        preval_abund][(omic1_, bool1)]
                    preval_filt2, abund_filter2 = pair_filtering[
                        preval_abund][(omic2_, bool2)]
                    if not filt_datasets_pass[(omic1, bool1)][
                            (case, preval_abund)]:
                        continue
                    if not filt_datasets_pass[(omic2, bool2)][
                            (case, preval_abund)]:
                        continue
                    filt1 = '_'.join([preval_filt1, abund_filter1])
                    filt2 = '_'.join([preval_filt2, abund_filter2])
                    tsv1, qza1, meta1, meta_pd1, sams1 = filt_datasets_pass[
                        (omic1, bool1)][(case, preval_abund)]
                    tsv2, qza2, meta2, meta_pd2, sams2 = filt_datasets_pass[
                        (omic2, bool2)][(case, preval_abund)]
                    common_sams = sorted(set(sams1) & set(sams2))
                    if len(common_sams) < 10:
                        print('Not enough samples: %s (%s) vs %s (%s) '
                              '-> skipping' % (omic1, filt1, omic2, filt2))
                        continue
                    meta_fp = '%s/meta_%s_%s_%s__%s_%s_%s__%s_%ss.tsv' % (
                        meta_dir, omic1, preval_filt1, abund_filter1,
                        omic2, preval_filt2, abund_filter2, pair,
                        len(common_sams))
                    new_tsv1 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic1, preval_filt1, abund_filter1,
                        pair, len(common_sams))
                    new_qza1 = '%s.qza' % splitext(new_tsv1)[0]
                    new_tsv2 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic2, preval_filt2, abund_filter2,
                        pair, len(common_sams))
                    new_qza2 = '%s.qza' % splitext(new_tsv2)[0]
                    if isfile(meta_fp) and isfile(new_qza1) and \
                            isfile(new_qza2):
                        common_datasets_pass[pair].append(meta_fp)
    # assumed closer (the function builds only this mapping)
    return common_datasets_pass
def run_single_sourcetracking(
        odir: str, tsv: str, meta_pd: pd.DataFrame, case_var: str,
        sourcetracking_params: dict, method: str, imports: set,
        sourcetracking_sourcesink: dict, case_vals_list: list, cur_sh: str,
        cur_import_sh: str, force: bool, filt: str, cur_raref: str,
        fp: str, fa: str, n_nodes: str, n_procs: str) -> list:
    cases = []
    remove = True
    qza = '%s.qza' % splitext(tsv)[0]
    with open(cur_sh, 'w') as cur_sh_o, \
            open(cur_import_sh, 'w') as cur_import_sh_o:
        for case_vals in case_vals_list:
            case = get_case(case_vals, '', case_var)
            for sourcesink_name, sourcesink_d in \
                    sourcetracking_sourcesink.items():
                column = sourcesink_d['column']
                sink = sourcesink_d['sink']
                sources = ['']
                if 'source' in sourcesink_d:
                    sources = sourcesink_d['source']
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                if column not in new_meta_pd.columns:
                    raise IOError('"%s" not in metadata...' % column)
                if sink not in set(new_meta_pd[column].unique()):
                    raise IOError('Sink "%s" not in metadata column "%s"'
                                  % (sink, column))
                if sources != [''] and not set(sources).issubset(
                        set(new_meta_pd[column].unique())):
                    raise IOError('Not all sources "%s" in metadata column '
                                  '"%s"' % (sources, column))
                cur_rad = '%s/%s_%s%s/%s' % (odir, case.strip('_'), filt,
                                             cur_raref, sourcesink_name)
                if not isdir(cur_rad):
                    os.makedirs(cur_rad)
                replacements = {
                    sink: sink.replace('/', '').replace('(', '').replace(
                        ')', '').replace(' ', '_')}
                for source in sources:
                    replacements.update({
                        source: source.replace('/', '').replace(
                            '(', '').replace(')', '').replace(' ', '_')})
                sink = replacements[sink]
                sources = [replacements[source] for source in sources]
                folder = '%s/%s-%s' % (cur_rad, column, sink)
                if sources != ['']:
                    folder = '%s_%s' % (folder, '_'.join(sources))
                new_meta = '%s/meta.tsv' % cur_rad
                new_qza = '%s/tab.qza' % cur_rad
                new_tsv = '%s/tab.tsv' % cur_rad
                new_meta_pd = new_meta_pd[[column]].reset_index()
                new_meta_pd.replace({column: replacements}, inplace=True)
                new_meta_pd.to_csv(new_meta, index=False, sep='\t')
                loo = False
                missing = False
                folder_method = folder + '/' + method
                if method == 'q2':
                    for root, dirs, files in os.walk(folder_method):
                        if len(root.split(folder_method)[-1].split('/')) == 4:
                            print(method,
                                  root.split(folder_method)[-1].split('/'))
                            if 'predictions.tsv' not in files:
                                print('\n'.join(files))
                                missing = True
                    outs = folder_method + '/t0/r*/*/predictions.tsv'
                elif method == 'feast':
                    outs = folder_method + '/t0/out.r0*'
                elif method == 'sourcetracker':
                    if 'loo' in sourcetracking_params and \
                            sourcetracking_params['loo']:
                        loo = True
                        outs = folder_method + \
                            '/t0/loo/mixing_proportions.txt'
                    else:
                        outs = folder_method + \
                            '/t0/r0/mixing_proportions.txt'
                    for root, dirs, files in os.walk(folder_method):
                        if len(root.split(folder_method)[-1].split('/')) == 3:
                            print(method,
                                  root.split(folder_method)[-1].split('/'))
                            if 'mixing_proportions.txt' not in files:
                                print('\n'.join(files))
                                missing = True
                if force or not len(glob.glob(outs)) or missing:
                    write_sourcetracking(
                        qza, new_qza, new_tsv, new_meta, method, fp, fa,
                        cur_rad, column, sink, sources,
                        sourcetracking_params, loo, n_nodes, n_procs,
                        cur_sh_o, cur_import_sh_o, imports)
                    cur_sh_o.write('echo "sh %s/cmd_%s.sh"\n'
                                   % (folder_method, method))
                    cur_sh_o.write('sh %s/cmd_%s.sh\n\n\n'
                                   % (folder_method, method))
                    remove = False
    # assumed closer (implied by the unused `remove` flag and the `-> list`
    # annotation)
    if remove:
        os.remove(cur_sh)
    return cases
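def _demo_label_sanitization():
    # Runnable sketch of the `replacements` mapping used above (toy value):
    # strip characters that would break file paths before using metadata
    # labels in folder names.
    label = 'gut (left)/distal'
    clean = label.replace('/', '').replace('(', '').replace(
        ')', '').replace(' ', '_')
    print(clean)  # gut_leftdistal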
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str,
                   jobs: bool, chunkit: int) -> (dict, list, dict):
    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')
    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print('Must provide the path to the Nestedness software '
              '(containing bin/Autocorrelation.jar)')
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and \
            isfile(nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print('Must provide the path to the Nestedness software '
                  '(containing bin/Autocorrelation.jar)')
            return {}
    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)
    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)
        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in \
                    metrics_groups_metas_qzas_dms_trees.items():
                for group, metas_qzas_mat_qzas_trees in \
                        groups_metas_qzas_dms_trees.items():
                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            all_sh_pbs.setdefault(
                                (dat, out_sh), []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd,
                                nodfs, nulls, modes, cur_sh, qza, case,
                                case_var, case_vals, binary, params, force)
                            nodfs_fps.setdefault(
                                stats_tax_dat, []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                # only the first metric is processed (the key is unused)
                break
            nestedness_res[dat].append(nestedness_raref)
def run_single_doc(i_dataset_folder: str, odir: str, tsv: str,
                   meta_pd: pd.DataFrame, case_var: str, doc_params: dict,
                   case_vals_list: list, cur_sh: str, cur_import_sh: str,
                   force: bool, filt: str, cur_raref: str, fp: str, fa: str,
                   n_nodes: str, n_procs: str, dat_phates: dict,
                   doc_phate: bool, need_to_run_phate: list,
                   need_to_run_less_phate: list) -> list:
    remove = True
    qza = '%s.qza' % splitext(tsv)[0]
    cases = []
    with open(cur_sh, 'w') as cur_sh_o, \
            open(cur_import_sh, 'w') as cur_import_sh_o:
        for case_vals in case_vals_list:
            token = ''.join([str(random.choice(range(100)))
                             for x in range(3)])
            case = get_case(case_vals, '', case_var)
            cur_rad = '%s/%s%s/%s' % (odir, case.strip('_'), cur_raref, filt)
            cases.append(cur_rad)
            cur_rad_r = '%s/R' % cur_rad
            cur_rad_token = '%s/tmp/%s' % (i_dataset_folder, token)
            if not isdir(cur_rad_r):
                os.makedirs(cur_rad_r)
            new_meta = '%s/meta.tsv' % cur_rad
            new_qza = '%s/tab.qza' % cur_rad
            new_tsv = '%s/tab.tsv' % cur_rad
            new_tsv_token = '%s/tab.tsv' % cur_rad_token
            if force or not isfile('%s/DO.tsv' % cur_rad):
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                new_meta_pd.reset_index().to_csv(new_meta, index=False,
                                                 sep='\t')
                write_doc(qza, fp, fa, new_meta, new_qza, new_tsv, cur_rad,
                          new_tsv_token, cur_rad_token, n_nodes, n_procs,
                          doc_params, cur_sh_o, cur_import_sh_o)
                remove = False
            # run DOC on each cluster from PHATE
            if doc_phate and filt in dat_phates and \
                    case_var in dat_phates[filt] and \
                    case in dat_phates[filt][case_var]:
                # get the clusters
                xphate_tsv = dat_phates[filt][case_var][case]
                if not isfile(xphate_tsv):
                    if not need_to_run_phate:
                        print('Unable to run DOC on a set of PHATE '
                              'clusters:\n(Be sure to run PHATE first...)')
                    need_to_run_phate.append(xphate_tsv.replace(
                        '%s/qiime/phate' % i_dataset_folder, '...'))
                    continue
                xphate_pd = pd.read_csv(xphate_tsv, header=0, sep='\t',
                                        dtype={'sample_name': str})
                xphate_pd = xphate_pd.loc[
                    xphate_pd['variable'].str.contains('cluster_k')]
                if len(xphate_pd[['knn', 'decay',
                                  't']].drop_duplicates()) > 5:
                    if not need_to_run_less_phate:
                        print('Warning: PHATE has been run for multiple '
                              'parameter combinations:\n'
                              ' --> It may be unwise to let DOC run on '
                              'every combination...\n'
                              ' --> Be sure to run PHATE using few, '
                              'desired sets of parameters!')
                    need_to_run_less_phate.append(xphate_tsv.replace(
                        '%s/qiime/phate' % i_dataset_folder, '...'))
                cols = ['sample_name', 'knn', 'decay', 't', 'variable',
                        'factor']
                xphate_clusters = dict(xphate_pd[cols].groupby(
                    ['knn', 'decay', 't', 'variable', 'factor']).apply(
                        func=lambda x: x.sample_name.tolist()))
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                # repeat DOC command for the clusters
                cur_rad_phate = '%s/phate' % cur_rad
                if not isdir(cur_rad_phate):
                    os.makedirs(cur_rad_phate)
                doc_phate_processed = []
                for (knn, decay, t, k, cluster), samples_phate in \
                        xphate_clusters.items():
                    if len(samples_phate) < 50:
                        doc_phate_processed.append([
                            knn, decay, t, k, cluster,
                            len(samples_phate), 'TOO FEW'])
                        continue
                    token = ''.join([str(random.choice(range(100)))
                                     for x in range(3)])
                    cur_rad_phate_clust = '%s/%s_%s_%s_k%s_clust%s' % (
                        cur_rad_phate, knn, decay, t, k, cluster)
                    doc_phate_processed.append([
                        knn, decay, t, k, cluster, len(samples_phate),
                        cur_rad_phate_clust])
                    cases.append(cur_rad_phate_clust)
                    cur_rad_phate_clust_r = '%s/R' % cur_rad_phate_clust
                    cur_rad_token = '%s/tmp/%s' % (i_dataset_folder, token)
                    if not isdir(cur_rad_phate_clust_r):
                        os.makedirs(cur_rad_phate_clust_r)
                    new_meta = '%s/meta.tsv' % cur_rad_phate_clust
                    new_qza = '%s/tab.qza' % cur_rad_phate_clust
                    new_tsv = '%s/tab.tsv' % cur_rad_phate_clust
                    # use the per-run temporary folder for the token table
                    # (mirrors the non-cluster branch above; the original
                    # pointed at cur_rad_phate_clust, which looks like a bug)
                    new_tsv_token = '%s/tab.tsv' % cur_rad_token
                    if force or not isfile('%s/DO.tsv'
                                           % cur_rad_phate_clust):
                        new_meta_pd_phate = new_meta_pd.loc[
                            samples_phate, :].copy()
                        new_meta_pd_phate.reset_index().to_csv(
                            new_meta, index=False, sep='\t')
                        write_doc(qza, fp, fa, new_meta, new_qza, new_tsv,
                                  cur_rad_phate_clust, new_tsv_token,
                                  cur_rad_token, n_nodes, n_procs,
                                  doc_params, cur_sh_o, cur_import_sh_o)
                        remove = False
                phate_doc_out = '%s/phate_processed.txt' % cur_rad_phate
                with open(phate_doc_out, 'w') as o:
                    o.write('knn\tdecay\tt\tk\tcluster\tsamples\tfate\n')
                    for doc_phate_proc in doc_phate_processed:
                        o.write('%s\n' % '\t'.join(map(str,
                                                       doc_phate_proc)))
    # assumed closer (implied by the unused `remove` flag and the `-> list`
    # annotation)
    if remove:
        os.remove(cur_sh)
    return cases
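def _demo_phate_cluster_grouping():
    # Mirrors the `xphate_clusters` construction above on toy data: group a
    # long-format PHATE table by its parameter/cluster columns and map each
    # combination to its list of samples.
    xphate_pd = pd.DataFrame({
        'sample_name': ['s1', 's2', 's3', 's4'],
        'knn': [5] * 4, 'decay': [15] * 4, 't': [1] * 4,
        'variable': ['cluster_k2'] * 4, 'factor': [0, 0, 1, 1]})
    clusters = dict(xphate_pd.groupby(
        ['knn', 'decay', 't', 'variable', 'factor']).apply(
            lambda x: x.sample_name.tolist()))
    print(clusters)
    # {(5, 15, 1, 'cluster_k2', 0): ['s1', 's2'],
    #  (5, 15, 1, 'cluster_k2', 1): ['s3', 's4']}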
def run_permanova(i_datasets_folder: str, betas: dict,
                  main_testing_groups: tuple, p_perm_tests_min: int,
                  p_beta_type: tuple, datasets_rarefs: dict,
                  p_perm_groups: str, force: bool, prjct_nm: str,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  split: bool, run_params: dict, filt_raref: str,
                  jobs: bool, chunkit: int) -> dict:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the PERMANOVA tests on beta diversity
    matrices.

    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param betas: beta diversity matrices.
    :param main_testing_groups: groups to test.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default:
        775).
    """
    permanovas = {}
    job_folder2 = get_job_folder(i_datasets_folder, 'permanova/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    npermutations = 999
    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        permanovas[dat] = []
        if not split:
            out_sh = '%s/run_beta_group_significance_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'permanova/%s%s' % (dat, cur_depth))
            for metric, subset_files in \
                    metric_groups_metas_qzas_dms_trees.items():
                permanovas.setdefault(dat, []).append(metric)
                if split:
                    out_sh = '%s/run_beta_group_significance_%s_%s_%s%s.sh' \
                        % (job_folder2, prjct_nm, dat, metric, filt_raref)
                for subset, metas_qzas_mat_qzas_trees in \
                        subset_files.items():
                    (meta, qza, mat_qza, tree) = metas_qzas_mat_qzas_trees[0]
                    if not isfile(mat_qza):
                        if not first_print:
                            print('Beta diversity, distances matrices must '
                                  'be generated already to automatise '
                                  'PERMANOVA\n\t(re-run this after steps '
                                  '"2_run_beta.sh" and '
                                  '"2x_run_beta_export.pbs" are done)')
                            first_print += 1
                        continue
                    if (dat, subset) not in metric_check:
                        meta_pd = read_meta_pd(meta)
                        meta_pd = meta_pd.set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(main_cases_dict),
                            'PERMANOVA')
                        testing_groups = check_metadata_testing_groups(
                            meta, meta_pd, main_testing_groups,
                            p_perm_tests_min, 'PERMANOVA')
                        metric_check.add((dat, subset))
                    for case_var, case_vals_list in cases_dict.items():
                        testing_groups_case_var = list(
                            set(testing_groups + [case_var]))
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            for testing_group in testing_groups_case_var:
                                if testing_group == 'ALL':
                                    continue
                                cur_sh = ('%s/run_beta_group_significance'
                                          '_%s%s_%s_%s_%s_%s%s.sh') % (
                                    job_folder2, dat, cur_depth, metric,
                                    subset, case, testing_group, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                run_single_perm(
                                    odir, subset, meta_pd, cur_sh, metric,
                                    case, testing_group, p_perm_tests_min,
                                    p_beta_type, qza, mat_qza, case_var,
                                    case_vals, npermutations, force)
def run_mantel(i_datasets_folder: str, datasets_filt: dict, p_mantel: str,
               betas: dict, force: bool, prjct_nm: str, qiime_env: str,
               chmod: str, noloc: bool, slurm: bool, split: bool,
               run_params: dict, filt_raref: str, filt_only: bool,
               eval_depths: dict, jobs: bool, chunkit: int) -> None:
    """
    Run Mantel tests between pairs of beta diversity distance matrices.
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        mantel_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths,
                                   key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                mantel_pairs['%s_%s' % (n0, n1)] = [x, y]
        mantel_subsets = {'ALL': [[]]}
    else:
        mantel_pairs, mantel_subsets = get_procrustes_mantel_dicts(p_mantel)
    get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in mantel_pairs.items():
        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt,
                                   filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt,
                                   filt_only)
        if check_dat_exists(betas, dat1, missing_dats) or \
                check_dat_exists(betas, dat2, missing_dats):
            continue
        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]
        job_folder2 = get_job_folder(
            i_datasets_folder,
            'mantel%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_mantel_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)
        for metric, groups_metas_qzas_dms_trees1 in \
                metrics_groups_metas_qzas_dms_trees1.items():
            if split:
                out_sh = '%s/run_mantel_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = \
                metrics_groups_metas_qzas_dms_trees2[metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_
                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]
                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print('[Mantels] One desired rarefaction depth not run '
                          '(pair %s)' % pair)
                    continue
                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(set(meta_pd1.sample_name) &
                                   set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue
                meta_pd = meta_pd1.loc[
                    meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(mantel_subsets), 'mantel')
                odir = get_analysis_folder(
                    i_datasets_folder,
                    'mantel%s/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder,
                    'mantel%s/chunks/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals,
                                         case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_mantel%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)
                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        mantel_out = '%s/mantel%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel(
                            'mantel', odir, dm1, dm2, meta_pd, dm_out1,
                            dm_out2, mantel_out, cur_sh, cur, case_var,
                            case_vals, force)
    job_folder = get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_mantel_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs,
        '%s.mntl%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_mantel and p_mantel != 1:
            if p_mantel.startswith('/panfs'):
                p_mantel = p_mantel.replace(os.getcwd(), '')
            print('# Mantels (pairs and samples subsets config in %s)'
                  % p_mantel)
        else:
            print('# Mantels')
        print_message('', 'sh', main_sh, jobs)
def run_procrustes(i_datasets_folder: str, datasets_filt: dict,
                   p_procrustes: str, betas: dict, force: bool,
                   prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                   slurm: bool, split: bool, run_params: dict,
                   filt_raref: str, filt_only: bool, eval_depths: dict,
                   jobs: bool, chunkit: int) -> None:
    """
    Run Procrustes analyses between pairs of beta diversity distance
    matrices, and collect the protest statistics in R.
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        procrustes_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths,
                                   key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                procrustes_pairs['%s_%s' % (n0, n1)] = [x, y]
        procrustes_subsets = {'ALL': [[]]}
    else:
        procrustes_pairs, procrustes_subsets = get_procrustes_mantel_dicts(
            p_procrustes)
    get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    dms_tab = []
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in procrustes_pairs.items():
        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt,
                                   filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt,
                                   filt_only)
        if check_dat_exists(betas, dat1, missing_dats) or \
                check_dat_exists(betas, dat2, missing_dats):
            continue
        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]
        job_folder2 = get_job_folder(
            i_datasets_folder,
            'procrustes%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_procrustes_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)
        for metric, groups_metas_qzas_dms_trees1 in \
                metrics_groups_metas_qzas_dms_trees1.items():
            if split:
                out_sh = '%s/run_procrustes_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = \
                metrics_groups_metas_qzas_dms_trees2[metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_
                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]
                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print('[Procrustes] One desired rarefaction depth not '
                          'run (pair %s)' % pair)
                    continue
                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(set(meta_pd1.sample_name) &
                                   set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue
                meta_pd = meta_pd1.loc[
                    meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(procrustes_subsets), 'procrustes')
                odir = get_analysis_folder(
                    i_datasets_folder,
                    'procrustes%s/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder,
                    'procrustes%s/chunks/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals,
                                         case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_procrustes%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)
                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        dm_out1_tsv = '%s.tsv' % splitext(dm_out1)[0]
                        dm_out2_tsv = '%s.tsv' % splitext(dm_out2)[0]
                        biplot = '%s/procrustes%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel(
                            'procrustes', odir, dm1, dm2, meta_pd, dm_out1,
                            dm_out2, biplot, cur_sh, cur, case_var,
                            case_vals, force)
                        dms_tab.append([pair, dat1_, dat2_, group1, group2,
                                        case_, metric, dm_out1_tsv,
                                        dm_out2_tsv])
    job_folder = get_job_folder(i_datasets_folder,
                                'procrustes%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_procrustes_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs,
        '%s.prcst%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_procrustes and p_procrustes != 1:
            if p_procrustes.startswith('/panfs'):
                p_procrustes = p_procrustes.replace(os.getcwd(), '')
            print('# Procrustes (pairs and samples subsets config in %s)'
                  % p_procrustes)
        else:
            print('# Procrustes')
        print_message('', 'sh', main_sh, jobs)
    # column names follow the order of the rows appended to `dms_tab` above
    dms_tab_pd = pd.DataFrame(dms_tab, columns=[
        'pair', 'dat1', 'dat2', 'group1', 'group2', 'case', 'metric',
        'dm_out1', 'dm_out2'])
    odir = get_analysis_folder(i_datasets_folder,
                               'procrustes%s/R' % evaluation)
    out_Rs = glob.glob('%s/pairs_proscrustes_results%s%s*.tsv' % (
        odir, evaluation, filt_raref))
    if len(out_Rs):
        done_R = pd.concat([pd.read_table(x, sep=' ') for x in out_Rs])
        dms_tab_pd = dms_tab_pd.loc[
            ~dms_tab_pd[['dm_out1', 'dm_out2']].sum(1).isin(
                done_R[['f1', 'f2']].sum(1))]
    if dms_tab_pd.shape[0]:
        fp_num = 0
        if len(out_Rs):
            last = sorted(out_Rs, key=lambda fp: int(
                fp.split('.tsv')[0].split('_')[-1]))
            fp_num = int(last[-1].split('.tsv')[0].split('_')[-1]) + 1
        dms_tab_fp = '%s/pairs%s%s_%s.tsv' % (odir, evaluation, filt_raref,
                                              fp_num)
        dms_tab_pd.to_csv(dms_tab_fp, index=False, sep='\t')
        out_R = '%s/pairs_proscrustes_results%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)
        job_folder = get_job_folder(i_datasets_folder, 'procrustes/R')
        R_script = '%s/4_run_procrustes_%s%s.R' % (job_folder, prjct_nm,
                                                   filt_raref)
        with open(R_script, 'w') as o:
            o.write("library(vegan)\n")
            o.write("dms_files <- read.table('%s', h=T)\n" % dms_tab_fp)
            o.write("cols <- c('pair', 'd1', 'd2', 'g1', 'g2', 'case', "
                    "'metric', 'f1', 'f2', 'samples', 'M2', 'p-value')\n")
            o.write("res <- setNames(data.frame(matrix(ncol = 12, "
                    "nrow = 0)), cols)\n")
            o.write("for (i in seq(1, dim(dms_files)[1])) {\n")
            o.write("    row <- as.vector(unlist(dms_files[i,]))\n")
            o.write("    pair <- row[1]\n")
            o.write("    d1 <- row[2]\n")
            o.write("    d2 <- row[3]\n")
            o.write("    group1 <- row[4]\n")
            o.write("    group2 <- row[5]\n")
            o.write("    case <- row[6]\n")
            o.write("    metric <- row[7]\n")
            o.write("    f1 <- row[8]\n")
            o.write("    f2 <- row[9]\n")
            o.write("    if (sum(file.exists(f1, f2)) == 2) {\n")
            o.write("        filin_tsv_pd1 <- read.csv(f1, header = TRUE, "
                    "check.names=FALSE,\n")
            o.write("            row.names = 1, colClasses = 'character', "
                    "sep = '\\t')\n")
            o.write("        filin_tsv_pd2 <- read.csv(f2, header = TRUE, "
                    "check.names=FALSE,\n")
            o.write("            row.names = 1, colClasses = 'character', "
                    "sep = '\\t')\n")
            o.write("        filin_tsv_pd1 <- data.matrix(filin_tsv_pd1)\n")
            o.write("        filin_tsv_pd2 <- data.matrix(filin_tsv_pd2)\n")
            o.write("        filin_tsv_pd1 <- filin_tsv_pd1["
                    "rownames(filin_tsv_pd2), rownames(filin_tsv_pd2)]\n")
            o.write("        # procrustes12 <- procrustes(filin_tsv_pd1, "
                    "filin_tsv_pd2, kind=2, permutations=999)\n")
            o.write("        prtst <- protest(filin_tsv_pd1, "
                    "filin_tsv_pd2, permutations = 999)\n")
            o.write("        n <- dim(filin_tsv_pd1)[1]\n")
            o.write("        res[i,] <- c(pair, d1, d2, group1, group2, "
                    "case, metric, f1, f2, n, prtst$ss, prtst$signif)\n")
            o.write("    }\n")
            o.write("}\n")
            o.write("write.table(x = res, file = '%s')\n" % out_R)
        out_sh = '%s/4_run_procrustes_%s%s_R%s.sh' % (
            job_folder, prjct_nm, evaluation, filt_raref)
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
        with open(out_sh, 'w') as o:
            o.write('R -f %s --vanilla\n' % R_script)
        run_xpbs(
            out_sh, out_pbs,
            '%s.prcrt%s.R%s' % (prjct_nm, evaluation, filt_raref),
            'renv', run_params["time"], run_params["n_nodes"],
            run_params["n_procs"], run_params["mem_num"],
            run_params["mem_dim"], chmod, 1,
            '# Procrustes for stats in R (pairs and samples subsets config '
            'in %s)' % p_procrustes,
            None, False, jobs)
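def _demo_next_result_number():
    # Runnable sketch of the incremental file-numbering used above (toy
    # filenames, not pipeline outputs): sort existing result files by their
    # trailing integer suffix and pick the next number.
    out_Rs = ['pairs_results_0.tsv', 'pairs_results_2.tsv',
              'pairs_results_1.tsv']
    last = sorted(out_Rs,
                  key=lambda fp: int(fp.split('.tsv')[0].split('_')[-1]))
    fp_num = int(last[-1].split('.tsv')[0].split('_')[-1]) + 1
    print(fp_num)  # 3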