import os
import glob
import itertools
import multiprocessing
from os.path import isfile, splitext

import numpy as np
import pandas as pd

# The helper functions used below (get_job_folder, get_analysis_folder,
# read_yaml_file, read_meta_pd, write_main_sh, run_xpbs, print_message, the
# get_*_config/get_*_dict readers and the run_single_* script writers) are
# provided elsewhere in this package; their import lines are not shown in
# this excerpt.


def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, split: bool,
                       run_params: dict, filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):
    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)

    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(
                i_datasets_folder, 'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in (
                    metrics_groups_metas_qzas_dms_trees.items()):
                for group, metas_qzas_mat_qzas_trees in (
                        groups_metas_qzas_dms_trees.items()):
                    for (meta, qza, mat_qza, tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(
                                    case_vals, case_var).replace(' ', '_')
                                cur_sh = '%s/run_decay_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_raref, metric,
                                    group, case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh, mat_qza,
                                    case, modes, force,
                                    run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
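
# Expected nesting of `betas`, as unpacked by run_distance_decay() above
# (a sketch inferred from the loops, not an authoritative schema; tuple
# members are named after the code's own variables):
#
# betas = {
#     '<dataset>': [                      # one entry per rarefaction
#         {'<beta metric>': {
#             '<samples group>': [
#                 ('<meta>', '<qza>', '<mat_qza>', '<tree>'),
#                 ...,
#             ],
#         }},
#         ...,
#     ],
# }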

def run_sourcetracking(i_datasets_folder: str, datasets: dict,
                       p_sourcetracking_config: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, run_params: dict,
                       filt_raref: str, split: bool, jobs: bool,
                       chunkit: int) -> None:
    job_folder2 = get_job_folder(i_datasets_folder, 'sourcetracking/chunks')
    sourcetracking_dicts = get_sourcetracking_config(p_sourcetracking_config)
    sourcetracking_sourcesink = sourcetracking_dicts[0]
    sourcetracking_filtering = sourcetracking_dicts[1]
    sourcetracking_params = sourcetracking_dicts[2]
    main_cases_dict = sourcetracking_dicts[3]

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        if dat in sourcetracking_filtering:
            filters = sourcetracking_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'sourcetracking')
            cur_raref = datasets_rarefs[dat][idx]
            out_import_sh = '%s/run_import_sourcetracking_%s_%s%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            imports = set()
            odir = get_analysis_folder(i_datasets_folder,
                                       'sourcetracking/%s' % dat)
            for method in sourcetracking_params['method']:
                out_sh = '%s/run_sourcetracking_%s_%s%s%s_%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref, method)
                for case_var, case_vals_list in cases_dict.items():
                    for filt, (fp, fa) in filters.items():
                        cur_sh = '%s/run_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_sh = cur_sh.replace(' ', '-')
                        cur_import_sh = '%s/run_import_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_import_sh = cur_import_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                        all_import_sh_pbs.setdefault(
                            (dat, out_import_sh), []).append(cur_import_sh)
                        run_single_sourcetracking(
                            odir, tsv, meta_pd, case_var,
                            sourcetracking_params, method, imports,
                            sourcetracking_sourcesink, case_vals_list,
                            cur_sh, cur_import_sh, force, filt, cur_raref,
                            fp, fa, run_params["n_nodes"],
                            run_params["n_procs"])

    job_folder = get_job_folder(i_datasets_folder, 'sourcetracking')
    main_sh = write_main_sh(
        job_folder,
        '3_run_import_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mpt.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# import sourcetracking (groups config in %s)'
                  % p_sourcetracking_config)
        else:
            print('# import sourcetracking')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(
        job_folder, '3_run_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# sourcetracking (groups config in %s)'
                  % p_sourcetracking_config)
        else:
            print('# sourcetracking')
        print_message('', 'sh', main_sh, jobs)
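
# Inputs consumed by run_sourcetracking() above, inferred from the indexing
# (a sketch; the per-position meanings follow the local variable names):
#   datasets = {'<dataset>': [('<table.tsv>', '<meta.tsv>'), ...]}
#                             # one (tsv, meta) pair per rarefaction
# get_sourcetracking_config() is unpacked positionally:
#   [0] source/sink definitions
#   [1] per-dataset filtering, e.g. {'<dataset>': {'<filt>': ['<fp>', '<fa>']}}
#       (fp/fa are presumably prevalence/abundance thresholds)
#   [2] parameters, which must provide a 'method' list
#   [3] metadata subsets ("cases")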

def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str,
                   jobs: bool, chunkit: int) -> (dict, list, dict):
    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')
    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print('Must provide the path to the Nestedness software folder '
              '(containing bin/Autocorrelation.jar)')
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and isfile(
            nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print('Must provide the path to the Nestedness software folder '
                  '(containing bin/Autocorrelation.jar)')
            return {}

    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)

    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)
        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(
                i_datasets_folder, 'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in (
                    metrics_groups_metas_qzas_dms_trees.items()):
                for group, metas_qzas_mat_qzas_trees in (
                        groups_metas_qzas_dms_trees.items()):
                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(
                                case_vals, case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            all_sh_pbs.setdefault(
                                (dat, out_sh), []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd,
                                nodfs, nulls, modes, cur_sh, qza, case,
                                case_var, case_vals, binary, params, force)
                            nodfs_fps.setdefault(
                                stats_tax_dat, []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                # the metric key is unused here (`_`): the group/meta/qza
                # inputs repeat across metrics, so one pass suffices
                break
            nestedness_res[dat].append(nestedness_raref)
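
# A minimal sketch of the YAML expected at `p_nestedness_groups`. Only the
# 'soft' key is confirmed by the checks above; the remaining keys are
# assumptions mirroring the names returned by get_nestedness_config():
#
#   soft: /path/to/nestedness_software   # folder with bin/Autocorrelation.jar
#   subsets: {...}    # metadata subsets ("cases")
#   nodfs: [...]      # metadata columns for NODF statistics
#   colors: [...]
#   nulls: [...]      # null models
#   modes: [...]
#   params: {...}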

def run_doc(i_datasets_folder: str, datasets: dict, p_doc_config: str,
            datasets_rarefs: dict, force: bool, prjct_nm: str,
            qiime_env: str, chmod: str, noloc: bool, slurm: bool,
            run_params: dict, filt_raref: str, phates: dict, doc_phate: bool,
            split: bool, jobs: bool, chunkit: int) -> None:
    job_folder2 = get_job_folder(i_datasets_folder, 'doc/chunks')
    doc_filtering, doc_params, main_cases_dict = get_doc_config(p_doc_config)

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    dat_cases_tabs = {}
    need_to_run_phate = []
    need_to_run_less_phate = []
    for dat, tsv_meta_pds_ in datasets.items():
        dat_cases_tabs[dat] = {}
        if dat in doc_filtering:
            filters = doc_filtering[dat]
        else:
            filters = {'0-0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            dat_phates = []
            if dat in phates:
                dat_phates = phates[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'DOC')
            cur_raref = datasets_rarefs[dat][idx]
            dat_cases_tabs[dat][cur_raref] = {}
            if not split:
                out_sh = '%s/run_doc_%s%s%s.sh' % (
                    job_folder2, dat, filt_raref, cur_raref)
                out_import_sh = '%s/run_import_doc_%s%s%s.sh' % (
                    job_folder2, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
            for filt, (fp, fa) in filters.items():
                if split:
                    out_sh = '%s/run_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                    out_import_sh = '%s/run_import_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault(
                        (dat, out_import_sh), []).append(cur_import_sh)
                    cases = run_single_doc(
                        i_datasets_folder, odir, tsv, meta_pd, case_var,
                        doc_params, case_vals_list, cur_sh, cur_import_sh,
                        force, filt, cur_raref, fp, fa,
                        run_params["n_nodes"], run_params["n_procs"],
                        dat_phates, doc_phate, need_to_run_phate,
                        need_to_run_less_phate)
                    dat_cases_tabs[dat][cur_raref].setdefault(
                        case_var, []).extend(cases)

    for need_to_run in need_to_run_phate:
        print(' -', need_to_run)

    job_folder = get_job_folder(i_datasets_folder, 'doc')
    main_sh = write_main_sh(
        job_folder, '3_run_import_doc%s' % filt_raref, all_import_sh_pbs,
        '%s.doc.mpt%s' % (prjct_nm, filt_raref),
        "4", "1", "1", "500", "mb",
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# Import for DOC (groups config in %s)' % p_doc_config)
        else:
            print('# Import for DOC')
        print_message('', 'sh', main_sh, jobs)

    # argument order fixed to `noloc, slurm, jobs` to match the other
    # write_main_sh() call sites (the flattened source had them swapped)
    main_sh = write_main_sh(
        job_folder, '3_run_doc%s' % filt_raref, all_sh_pbs,
        '%s.doc%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# DOC (groups config in %s)' % p_doc_config)
        else:
            print('# DOC')
        print_message('', 'sh', main_sh, jobs)

    do_r = 1
    if do_r:
        job_folder = get_job_folder(i_datasets_folder, 'doc/R')
        job_folder2 = get_job_folder(i_datasets_folder, 'doc/R/chunks')
        main_written = 0
        main_sh = '%s/run_R_doc%s.sh' % (job_folder, filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, raref_case_var_cases in dat_cases_tabs.items():
                shs = []
                written = 0
                odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
                log_error = '%s/log.error' % odir
                for raref, case_var_cases in raref_case_var_cases.items():
                    for case_var, cases in case_var_cases.items():
                        for cdx, case in enumerate(cases):
                            plot = '%s_%s_%s_%s' % (dat, raref, case_var, cdx)
                            case_r = '%s/R' % case
                            pdf = '%s/plot.pdf' % case_r
                            do = '%s/DO.tsv' % case_r
                            if not isfile(pdf):
                                cur_r = '%s/run_R_doc_%s_%s_%s_vanilla.R' % (
                                    job_folder2, dat, case_var, cdx)
                                cur_sh = 'echo "*** %s" >> %s\n' % (
                                    plot, log_error)
                                cur_sh += 'R -f %s --vanilla 2>> %s\n' % (
                                    cur_r, log_error)
                                cur_sh += 'echo "end" >> %s\n' % log_error
                                shs.append(cur_sh)
                                with open(cur_r, 'w') as o:
                                    o.write("library(DOC)\n")
                                    o.write("library(ggplot2)\n")
                                    if not isfile(do):
                                        o.write("otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, nrows=2)\n" % case)
                                        o.write("index_name <- colnames(otu)[1]\n")
                                        o.write("otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, row.names=index_name)\n" % case)
                                        o.write("if (dim(otu)[1] > 100) {\n")
                                        o.write("    res <- DOC(otu)\n")
                                        o.write("    res.null <- DOC.null(otu)\n")
                                        o.write("    write.table(x=res$DO, file='%s/DO.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res$LME, file='%s/LME.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    colnames(res$NEG) <- c('Neg_Slope', 'Data')\n")
                                        o.write("    write.table(x=res$NEG, file='%s/NEG.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res$FNS, file='%s/FNS.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res$BOOT, file='%s/BOOT.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res$CI, file='%s/CI.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res.null$DO, file='%s/null_DO.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res.null$LME, file='%s/null_LME.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    colnames(res.null$NEG) <- c('Neg_Slope', 'Data')\n")
                                        o.write("    write.table(x=res.null$NEG, file='%s/null_NEG.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res.null$FNS, file='%s/null_FNS.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res.null$BOOT, file='%s/null_BOOT.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("    write.table(x=res.null$CI, file='%s/null_CI.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("}\n")
                                    o.write("res = list(BOOT=read.table('%s/BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/CI.tsv', h=T, sep='\\t'), DO=read.table('%s/DO.tsv', h=T, sep='\\t'), LME=read.table('%s/LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/NEG.tsv', h=T, sep='\\t'))\n" % (case_r, case_r, case_r, case_r, case_r, case_r))
                                    o.write("res.null = list(BOOT=read.table('%s/null_BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/null_CI.tsv', h=T, sep='\\t'), DO=read.table('%s/null_DO.tsv', h=T, sep='\\t'), LME=read.table('%s/null_LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/null_FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/null_NEG.tsv', h=T, sep='\\t'))\n" % (case_r, case_r, case_r, case_r, case_r, case_r))
                                    o.write("colnames(res$NEG) <- c('Neg.Slope', 'Data')\n")
                                    o.write("colnames(res.null$NEG) <- c('Neg.Slope', 'Data')\n")
                                    o.write("res$DO <- res$DO[which(res$DO$Overlap <= 1),]\n")
                                    o.write("res.null$DO <- res.null$DO[which(res.null$DO$Overlap <= 1),]\n")
                                    o.write("pdf('%s')\n" % pdf)
                                    o.write("merged <- DOC.merge(list(s_%s = res, s_%s=res.null))\n" % (plot, plot))
                                    o.write("plot(merged)\n")
                                    o.write("dev.off()\n")
                                main_written += 1
                                written += 1
                if written:
                    # `elif` so a chunkit-based chunking is not overwritten
                    # (the flattened source chained two independent `if`s)
                    if chunkit and len(shs) >= chunkit:
                        chunks = [list(x) for x in
                                  np.array_split(np.array(shs), chunkit)]
                    elif split and len(shs) >= 3:
                        chunks = [list(x) for x in
                                  np.array_split(np.array(shs), 3)]
                    else:
                        chunks = [shs]
                    for cdx, chunk in enumerate(chunks):
                        out_sh = '%s/run_R_doc_%s%s_%s.sh' % (
                            job_folder2, dat, filt_raref, cdx)
                        out_pbs = '%s.pbs' % splitext(out_sh)[0]
                        with open(out_sh, 'w') as o:
                            for c in chunk:
                                o.write('echo "%s"\n\n' % c)
                                o.write('%s\n\n' % c)
                        run_xpbs(
                            out_sh, out_pbs,
                            '%s.doc.R.%s%s_%s' % (
                                prjct_nm, dat, filt_raref, cdx),
                            'xdoc', run_params["time"],
                            run_params["n_nodes"], run_params["n_procs"],
                            run_params["mem_num"], run_params["mem_dim"],
                            chmod, written, 'single', main_o, noloc, slurm,
                            jobs)
        if main_written:
            print_message('# DOC (R)', 'sh', main_sh, jobs)
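
# For reference, the R script generated above leaves, per case folder
# '<case>/R/': DO.tsv, LME.tsv, NEG.tsv, FNS.tsv, BOOT.tsv and CI.tsv for
# the observed DOC() run, the same six files prefixed 'null_' for
# DOC.null(), and plot.pdf with the merged observed-vs-null curves. The
# DOC computation itself only runs when the table has more than 100 rows;
# otherwise the script falls through to re-reading previously written
# tables.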

def run_adonis(p_formulas: str, i_datasets_folder: str, betas: dict,
               datasets_rarefs: dict, p_perm_groups: str, force: bool,
               prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
               slurm: bool, split: bool, run_params: dict, filt_raref: str,
               jobs: bool, chunkit: int) -> None:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the ADONIS tests on beta diversity matrices.

    :param p_formulas: formulas to test.
    :param i_datasets_folder: Path to the folder containing the data/metadata
                              subfolders.
    :param betas: beta diversity matrices.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder2 = get_job_folder(i_datasets_folder, 'adonis/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    formulas = get_formulas_dict(p_formulas)

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        if dat not in formulas:
            continue
        if not split:
            out_sh = '%s/run_adonis_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(
                i_datasets_folder, 'adonis/%s%s' % (dat, cur_depth))
            for metric, subset_files in (
                    metric_groups_metas_qzas_dms_trees.items()):
                if split:
                    out_sh = '%s/run_adonis_%s_%s_%s%s.sh' % (
                        job_folder2, prjct_nm, dat, metric, filt_raref)
                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    for meta, qza, mat_qza, tree in metas_qzas_mat_qzas_trees:
                        if not isfile(mat_qza):
                            if not first_print:
                                print('Beta diversity, distances matrices '
                                      'must be generated already to '
                                      'automatise PERMANOVA\n'
                                      '\t(re-run this after steps '
                                      '"2_run_beta.sh" and '
                                      '"2x_run_beta_export.pbs" are done)')
                                first_print += 1
                            continue
                        if (dat, subset) not in metric_check:
                            meta_pd = read_meta_pd(meta).set_index(
                                'sample_name')
                            cases_dict = check_metadata_cases_dict(
                                meta, meta_pd, dict(main_cases_dict),
                                'ADONIS')
                            # keep the per-dataset keying: formulas[dat] is
                            # looked up below (the flattened source rebound
                            # the whole `formulas` dict here)
                            formulas[dat] = check_metadata_formulas(
                                meta, meta_pd, formulas[dat], 'ADONIS')
                            metric_check.add((dat, subset))
                        for fdx, form in enumerate(formulas[dat].keys()):
                            formula = formulas[dat][form]
                            for cdx, case_var in enumerate(cases_dict.keys()):
                                case_vals_list = cases_dict[case_var]
                                cur_sh = '%s/run_adonis_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric,
                                    fdx, cdx, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                run_single_adonis(
                                    odir, subset, case_vals_list, metric,
                                    case_var, form, formula, qza, mat_qza,
                                    meta_pd, cur_sh, force)

    job_folder = get_job_folder(i_datasets_folder, 'adonis')
    main_sh = write_main_sh(
        job_folder, '3_run_adonis_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.dns%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# Run Adonis (groups config in %s)" % p_perm_groups)
        else:
            print("# Run Adonis")
        print_message('', 'sh', main_sh, jobs)
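
# Shapes consumed by run_adonis() above, inferred from the lookups (a
# sketch; the concrete values are hypothetical):
#   formulas   = {'datA': {'site_sex': 'site + sex'}}   # per-dataset formulas
#   cases_dict = {'site': [['site_a', 'site_b'], ...]}  # subsets per column
# Each (formula, case) pair yields one run_single_adonis() script.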

def run_deicode(i_datasets_folder: str, datasets: dict,
                datasets_rarefs: dict, p_perm_groups: str, force: bool,
                prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                slurm: bool, run_params: dict, filt_raref: str, jobs: bool,
                chunkit: int) -> None:
    """
    Performs robust center log-ratio transform robust PCA and ranks the
    features by the loadings of the resulting SVD.
    https://library.qiime2.org/plugins/deicode/19/
    Main per-dataset looper for the DEICODE robust PCA analyses.

    :param i_datasets_folder: Path to the folder containing the data/metadata
                              subfolders.
    :param datasets: list of datasets.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder2 = get_job_folder(i_datasets_folder, 'deicode/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)

    all_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        out_sh = '%s/run_deicode_%s_%s%s.sh' % (
            job_folder2, prjct_nm, dat, filt_raref)
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            cur_raref = datasets_rarefs[dat][idx]
            tsv, meta = tsv_meta_pds
            # prefer metadata already enriched with alpha diversity columns
            meta_alphas = meta.replace('.tsv', '_alphas.tsv')
            meta_alphas_full = meta.replace('.tsv', '_alphas_full.tsv')
            if isfile(meta_alphas_full):
                meta = meta_alphas_full
            elif isfile(meta_alphas):
                meta = meta_alphas
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'DEICODE')
            odir = get_analysis_folder(
                i_datasets_folder, 'deicode/%s%s' % (dat, cur_raref))
            for case_var, case_vals_list in cases_dict.items():
                cur_sh = '%s/run_beta_deicode_%s_%s%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, case_var,
                    filt_raref)
                cur_sh = cur_sh.replace(' ', '-')
                all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                run_single_deicode(odir, tsv, meta_pd, case_var,
                                   case_vals_list, cur_sh, force)

    job_folder = get_job_folder(i_datasets_folder, 'deicode')
    # job name ordered (prjct_nm, filt_raref) to match the other
    # write_main_sh() call sites (the flattened source had them swapped)
    main_sh = write_main_sh(
        job_folder, '3_run_beta_deicode_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.dcd%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            if p_perm_groups.startswith('/panfs'):
                p_perm_groups = p_perm_groups.replace(os.getcwd(), '')
            print('# DEICODE (groups config in %s)' % p_perm_groups)
        else:
            print('# DEICODE')
        print_message('', 'sh', main_sh, jobs)
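
# Metadata resolution above prefers tables already merged with alpha
# diversity columns when present (presumably written by an earlier alpha
# step), in this order:
#   <meta>_alphas_full.tsv  >  <meta>_alphas.tsv  >  <meta>.tsv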

def run_permanova(i_datasets_folder: str, betas: dict,
                  main_testing_groups: tuple, p_perm_tests_min: int,
                  p_beta_type: tuple, datasets_rarefs: dict,
                  p_perm_groups: str, force: bool, prjct_nm: str,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  split: bool, run_params: dict, filt_raref: str,
                  jobs: bool, chunkit: int) -> dict:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the PERMANOVA tests on beta diversity
    matrices.

    :param i_datasets_folder: Path to the folder containing the data/metadata
                              subfolders.
    :param betas: beta diversity matrices.
    :param main_testing_groups: groups to test.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    permanovas = {}
    job_folder2 = get_job_folder(i_datasets_folder, 'permanova/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    npermutations = 999

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        permanovas[dat] = []
        if not split:
            out_sh = '%s/run_beta_group_significance_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(
                i_datasets_folder, 'permanova/%s%s' % (dat, cur_depth))
            for metric, subset_files in (
                    metric_groups_metas_qzas_dms_trees.items()):
                permanovas.setdefault(dat, []).append(metric)
                if split:
                    out_sh = '%s/run_beta_group_significance_%s_%s_%s%s.sh' % (
                        job_folder2, prjct_nm, dat, metric, filt_raref)
                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    (meta, qza, mat_qza, tree) = metas_qzas_mat_qzas_trees[0]
                    if not isfile(mat_qza):
                        if not first_print:
                            print('Beta diversity, distances matrices must '
                                  'be generated already to automatise '
                                  'PERMANOVA\n'
                                  '\t(re-run this after steps '
                                  '"2_run_beta.sh" and '
                                  '"2x_run_beta_export.pbs" are done)')
                            first_print += 1
                        continue
                    if (dat, subset) not in metric_check:
                        meta_pd = read_meta_pd(meta)
                        meta_pd = meta_pd.set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(main_cases_dict), 'PERMANOVA')
                        testing_groups = check_metadata_testing_groups(
                            meta, meta_pd, main_testing_groups,
                            p_perm_tests_min, 'PERMANOVA')
                        metric_check.add((dat, subset))
                    for case_var, case_vals_list in cases_dict.items():
                        testing_groups_case_var = list(
                            set(testing_groups + [case_var]))
                        for case_vals in case_vals_list:
                            case = get_case(
                                case_vals, case_var).replace(' ', '_')
                            for testing_group in testing_groups_case_var:
                                if testing_group == 'ALL':
                                    continue
                                cur_sh = ('%s/run_beta_group_significance'
                                          '_%s%s_%s_%s_%s_%s%s.sh') % (
                                    job_folder2, dat, cur_depth, metric,
                                    subset, case, testing_group, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                run_single_perm(
                                    odir, subset, meta_pd, cur_sh, metric,
                                    case, testing_group, p_perm_tests_min,
                                    p_beta_type, qza, mat_qza, case_var,
                                    case_vals, npermutations, force)
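
# Worked example of the testing-group logic above (hypothetical values):
#   testing_groups = ['sex', 'age_group']
#   cases_dict     = {'site': [['site_a', 'site_b']]}
# For the 'site_a'/'site_b' subset, PERMANOVA (999 permutations) is set up
# for 'sex', 'age_group' and 'site' itself, per metric and beta subset;
# the 'ALL' pseudo-group is skipped.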

def run_mantel(i_datasets_folder: str, datasets_filt: dict, p_mantel: str,
               betas: dict, force: bool, prjct_nm: str, qiime_env: str,
               chmod: str, noloc: bool, slurm: bool, split: bool,
               run_params: dict, filt_raref: str, filt_only: bool,
               eval_depths: dict, jobs: bool, chunkit: int) -> None:
    """Run Mantel tests between pairs of beta diversity distance matrices."""
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        mantel_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths,
                                   key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                mantel_pairs['%s_%s' % (n0, n1)] = [x, y]
        mantel_subsets = {'ALL': [[]]}
    else:
        mantel_pairs, mantel_subsets = get_procrustes_mantel_dicts(p_mantel)

    get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)

    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in mantel_pairs.items():
        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt,
                                   filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt,
                                   filt_only)
        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'mantel%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_mantel_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in (
                metrics_groups_metas_qzas_dms_trees1.items()):
            if split:
                out_sh = '%s/run_mantel_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = (
                metrics_groups_metas_qzas_dms_trees2[metric])
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_
                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]
                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(
                            dat1, dm1, meta1, raref1, metric,
                            i_datasets_folder, skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(
                            dat2, dm2, meta2, raref2, metric,
                            i_datasets_folder, skip)
                if skip:
                    print('[Mantels] One desired rarefaction depth not run'
                          ' (pair %s)' % pair)
                    continue
                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue
                meta_pd = meta_pd1.loc[
                    meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(mantel_subsets), 'mantel')
                odir = get_analysis_folder(
                    i_datasets_folder,
                    'mantel%s/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder,
                    'mantel%s/chunks/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(
                            case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_mantel%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault(
                            (pair, out_sh), []).append(cur_sh)
                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        mantel_out = '%s/mantel%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel(
                            'mantel', odir, dm1, dm2, meta_pd, dm_out1,
                            dm_out2, mantel_out, cur_sh, cur, case_var,
                            case_vals, force)

    job_folder = get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_mantel_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.mntl%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_mantel and p_mantel != 1:
            if p_mantel.startswith('/panfs'):
                p_mantel = p_mantel.replace(os.getcwd(), '')
            print('# Mantels (pairs and samples subsets config in %s)'
                  % p_mantel)
        else:
            print('# Mantels')
        print_message('', 'sh', main_sh, jobs)
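
# Worked example of the evaluation-mode pairing above (hypothetical depth
# labels): with
#   eval_depths = {'datA': ['datA_1000', 'datA_5000', 'datA_10000']}
# consecutive depths are paired as
#   mantel_pairs = {'1000_5000': ['datA_1000', 'datA_5000'],
#                   '5000_10000': ['datA_5000', 'datA_10000']}
# Pairs whose matrices share fewer than 3 samples are skipped.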

def run_procrustes(i_datasets_folder: str, datasets_filt: dict,
                   p_procrustes: str, betas: dict, force: bool,
                   prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                   slurm: bool, split: bool, run_params: dict,
                   filt_raref: str, filt_only: bool, eval_depths: dict,
                   jobs: bool, chunkit: int) -> None:
    """Run Procrustes analyses between pairs of beta diversity distance
    matrices, including protest() significance testing in R (vegan)."""
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        procrustes_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths,
                                   key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                procrustes_pairs['%s_%s' % (n0, n1)] = [x, y]
        procrustes_subsets = {'ALL': [[]]}
    else:
        procrustes_pairs, procrustes_subsets = get_procrustes_mantel_dicts(
            p_procrustes)

    get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)

    dms_tab = []
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in procrustes_pairs.items():
        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt,
                                   filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt,
                                   filt_only)
        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'procrustes%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_procrustes_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in (
                metrics_groups_metas_qzas_dms_trees1.items()):
            if split:
                out_sh = '%s/run_procrustes_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = (
                metrics_groups_metas_qzas_dms_trees2[metric])
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_
                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]
                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(
                            dat1, dm1, meta1, raref1, metric,
                            i_datasets_folder, skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(
                            dat2, dm2, meta2, raref2, metric,
                            i_datasets_folder, skip)
                if skip:
                    print('[Procrustes] One desired rarefaction depth not'
                          ' run (pair %s)' % pair)
                    continue
                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue
                meta_pd = meta_pd1.loc[
                    meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(procrustes_subsets), 'procrustes')
                odir = get_analysis_folder(
                    i_datasets_folder,
                    'procrustes%s/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder,
                    'procrustes%s/chunks/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(
                            case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_procrustes%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault(
                            (pair, out_sh), []).append(cur_sh)
                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        dm_out1_tsv = '%s.tsv' % splitext(dm_out1)[0]
                        dm_out2_tsv = '%s.tsv' % splitext(dm_out2)[0]
                        biplot = '%s/procrustes%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel(
                            'procrustes', odir, dm1, dm2, meta_pd, dm_out1,
                            dm_out2, biplot, cur_sh, cur, case_var,
                            case_vals, force)
                        dms_tab.append([pair, dat1_, dat2_, group1, group2,
                                        case_, metric,
                                        dm_out1_tsv, dm_out2_tsv])

    job_folder = get_job_folder(i_datasets_folder,
                                'procrustes%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_procrustes_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.prcst%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_procrustes and p_procrustes != 1:
            if p_procrustes.startswith('/panfs'):
                p_procrustes = p_procrustes.replace(os.getcwd(), '')
            print('# Procrustes (pairs and samples subsets config in %s)'
                  % p_procrustes)
        else:
            print('# Procrustes')
        print_message('', 'sh', main_sh, jobs)

    # column order matches the rows appended to dms_tab above (the flattened
    # source listed 'metric' third, mislabelling the columns; the R script
    # below reads metric from position 7)
    dms_tab_pd = pd.DataFrame(dms_tab, columns=[
        'pair', 'dat1', 'dat2', 'group1', 'group2', 'case', 'metric',
        'dm_out1', 'dm_out2',
    ])

    odir = get_analysis_folder(i_datasets_folder,
                               'procrustes%s/R' % evaluation)
    # note: the 'proscrustes' spelling is kept so previously written result
    # files are still found and resumed
    out_Rs = glob.glob('%s/pairs_proscrustes_results%s%s*.tsv' % (
        odir, evaluation, filt_raref))
    if len(out_Rs):
        done_R = pd.concat([pd.read_table(x, sep=' ') for x in out_Rs])
        dms_tab_pd = dms_tab_pd.loc[
            ~dms_tab_pd[['dm_out1', 'dm_out2']].sum(1).isin(
                done_R[['f1', 'f2']].sum(1))]

    if dms_tab_pd.shape[0]:
        fp_num = 0
        if len(out_Rs):
            last = sorted(out_Rs, key=lambda fp: int(
                fp.split('.tsv')[0].split('_')[-1]))
            fp_num = int(last[-1].split('.tsv')[0].split('_')[-1]) + 1

        dms_tab_fp = '%s/pairs%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)
        dms_tab_pd.to_csv(dms_tab_fp, index=False, sep='\t')
        out_R = '%s/pairs_proscrustes_results%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)

        job_folder = get_job_folder(i_datasets_folder, 'procrustes/R')
        R_script = '%s/4_run_procrustes_%s%s.R' % (
            job_folder, prjct_nm, filt_raref)
        with open(R_script, 'w') as o:
            o.write("library(vegan)\n")
            o.write("dms_files <- read.table('%s', h=T)\n" % dms_tab_fp)
            o.write("cols <- c('pair', 'd1', 'd2', 'g1', 'g2', 'case', 'metric', 'f1', 'f2', 'samples', 'M2', 'p-value')\n")
            o.write("res <- setNames(data.frame(matrix(ncol = 12, nrow = 0)), cols)\n")
            o.write("for (i in seq(1, dim(dms_files)[1])) {\n")
            o.write("    row <- as.vector(unlist(dms_files[i,]))\n")
            o.write("    pair <- row[1]\n")
            o.write("    d1 <- row[2]\n")
            o.write("    d2 <- row[3]\n")
            o.write("    group1 <- row[4]\n")
            o.write("    group2 <- row[5]\n")
            o.write("    case <- row[6]\n")
            o.write("    metric <- row[7]\n")
            o.write("    f1 <- row[8]\n")
            o.write("    f2 <- row[9]\n")
            o.write("    if (sum(file.exists(f1, f2)) == 2) {\n")
            o.write("        filin_tsv_pd1 <- read.csv(f1, header = TRUE, check.names=FALSE,\n")
            o.write("                                  row.names = 1, colClasses = 'character', sep = '\\t')\n")
            o.write("        filin_tsv_pd2 <- read.csv(f2, header = TRUE, check.names=FALSE,\n")
            o.write("                                  row.names = 1, colClasses = 'character', sep = '\\t')\n")
            o.write("        filin_tsv_pd1 <- data.matrix(filin_tsv_pd1)\n")
            o.write("        filin_tsv_pd2 <- data.matrix(filin_tsv_pd2)\n")
            o.write("        filin_tsv_pd1 <- filin_tsv_pd1[rownames(filin_tsv_pd2), rownames(filin_tsv_pd2)]\n")
            o.write("        # procrustes12 <- procrustes(filin_tsv_pd1, filin_tsv_pd2, kind=2, permutations=999)\n")
            o.write("        prtst <- protest(filin_tsv_pd1, filin_tsv_pd2, permutations = 999)\n")
            o.write("        n <- dim(filin_tsv_pd1)[1]\n")
            o.write("        res[i,] <- c(pair, d1, d2, group1, group2, case, metric, f1, f2, n, prtst$ss, prtst$signif)\n")
            o.write("    }\n")
            o.write("}\n")
            o.write("write.table(x = res, file = '%s')\n" % out_R)

        out_sh = '%s/4_run_procrustes_%s%s_R%s.sh' % (
            job_folder, prjct_nm, evaluation, filt_raref)
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
        with open(out_sh, 'w') as o:
            o.write('R -f %s --vanilla\n' % R_script)
        run_xpbs(
            out_sh, out_pbs,
            '%s.prcrt%s.R%s' % (prjct_nm, evaluation, filt_raref), 'renv',
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"],
            chmod, 1,
            '# Procrustes for stats in R (pairs and samples subsets config'
            ' in %s)' % p_procrustes,
            None, False, jobs)
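
# The 'pairs_proscrustes_results*' tables written by the R script above
# have the columns:
#   pair, d1, d2, g1, g2, case, metric, f1, f2, samples, M2, p-value
# where M2 is vegan's protest() sum-of-squares statistic (prtst$ss) and
# p-value its permutation significance (prtst$signif, 999 permutations).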

def run_phate(p_phate_config: str, i_datasets_folder: str, datasets: dict,
              datasets_rarefs: dict, force: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, slurm: bool,
              split: bool, run_params: dict, filt_raref: str, jobs: bool,
              chunkit: int) -> dict:
    job_folder2 = get_job_folder(i_datasets_folder, 'phate/chunks')
    phate_dicts = get_phate_dicts(p_phate_config)
    phate_filtering, phate_labels, phate_params, main_cases_dict = phate_dicts

    phates = {}
    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        phates[dat] = []
        if dat in phate_filtering:
            filters = phate_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'phate')
            cur_raref = datasets_rarefs[dat][idx]
            if not split:
                out_sh = '%s/run_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
                out_import_sh = '%s/run_import_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'phate/%s' % dat)
            raref_phates = {}
            for filt, (fp, fa) in filters.items():
                raref_phates[filt] = {}
                if split:
                    out_sh = '%s/run_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                    out_import_sh = '%s/run_import_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault(
                        (dat, out_import_sh), []).append(cur_import_sh)
                    phate = run_single_phate(
                        dat, odir, tsv, meta_pd, case_var, phate_labels,
                        phate_params, run_params, case_vals_list, cur_sh,
                        cur_import_sh, force, filt, cur_raref, fp, fa)
                    raref_phates[filt][case_var] = phate
            phates[dat].append(raref_phates)

    job_folder = get_job_folder(i_datasets_folder, 'phate')
    main_sh = write_main_sh(
        job_folder, '3_run_import_phate_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mrt.pht%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# Import for PHATE (groups config in %s)'
                  % p_phate_config)
        else:
            print('# Import for PHATE')
        print_message('', 'sh', main_sh, jobs)

    # the PHATE jobs run in the 'xphate' environment rather than qiime_env
    main_sh = write_main_sh(
        job_folder, '3_run_phate_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.pht%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        'xphate', chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# PHATE (groups config in %s)' % p_phate_config)
        else:
            print('# PHATE')
        print_message('', 'sh', main_sh, jobs)
    return phates
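
# Structure of the returned `phates`, as filled above:
# phates = {
#     '<dataset>': [                # one entry per rarefaction
#         {'<filt>': {'<case variable>': <run_single_phate() result>}},
#         ...,
#     ],
# }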

def run_alpha_group_significance(i_datasets_folder: str, datasets: dict,
                                 diversities: dict, datasets_rarefs: dict,
                                 p_perm_groups: str, force: bool,
                                 prjct_nm: str, qiime_env: str, chmod: str,
                                 noloc: bool, slurm: bool, As: tuple,
                                 split: bool, run_params: dict,
                                 filt_raref: str, jobs: bool,
                                 chunkit: int) -> None:
    """
    Run alpha-group-significance: Alpha diversity comparisons.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-group-significance/
    Main per-dataset looper for the Kruskal-Wallis tests on alpha diversity
    vectors.

    :param i_datasets_folder: Path to the folder containing the data/metadata
                              subfolders.
    :param datasets: list of datasets.
    :param diversities: alpha diversity qiime2 Artifacts per dataset.
    :param p_perm_groups: path to the subsets file.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha_group_significance/chunks')
    # alpha_metrics = get_metrics('alpha_metrics', As)
    main_cases_dict = get_main_cases_dict(p_perm_groups)

    # local process list renamed from `jobs` so the boolean `jobs` flag
    # passed to write_main_sh()/print_message() below is not shadowed
    procs = []
    all_sh_pbs = {}
    first_print = 0
    for dat, tsv_meta_pds_ in datasets.items():
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            meta = tsv_meta_pds[1]
            cur_raref = datasets_rarefs[dat][idx]
            raref_diversities = diversities[dat][idx]
            presence_mat = [1 for (qza, metric) in raref_diversities['']
                            if isfile(qza)]
            if not presence_mat:
                if not first_print:
                    print('Alpha diversity must be measured already to '
                          'automatise Kruskal-Wallis tests\n'
                          '\t(re-run this after step "1_run_alpha.sh" is '
                          'done)')
                    first_print += 1
                continue
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict),
                'alpha Kruskal-Wallis')
            odir = get_analysis_folder(
                i_datasets_folder,
                'alpha_group_significance/%s%s' % (dat, cur_raref))
            for (qza, metric) in raref_diversities['']:
                # metric = get_metric(alpha_metrics, qza)
                div_tsv = '%s.tsv' % splitext(qza)[0]
                # check both the artifact and its exported table (the
                # flattened source tested div_tsv twice)
                if not isfile(qza) or not isfile(div_tsv):
                    print(' [KRUSKAL-WALLIS] metric %s not calculated\n'
                          'Skipping it...' % metric)
                    continue
                out_sh = '%s/run_alpha_group_significance_%s%s_%s%s.sh' % (
                    job_folder2, dat, cur_raref, metric, filt_raref)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = ('%s/run_alpha_group_significance'
                              '_%s%s_%s_%s%s.sh') % (
                        job_folder2, dat, cur_raref, metric, case_var,
                        filt_raref)
                    cur_sh = cur_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    p = multiprocessing.Process(
                        target=run_multi_kw,
                        args=(odir, meta_pd, qza, case_vals_list, case_var,
                              cur_sh, force))
                    p.start()
                    procs.append(p)
    for p in procs:
        p.join()

    job_folder = get_job_folder(i_datasets_folder,
                                'alpha_group_significance')
    main_sh = write_main_sh(
        job_folder,
        '6_run_alpha_group_significance_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.kv%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# Kruskal-Wallis on alpha diversity (groups config in %s)"
                  % p_perm_groups)
        else:
            print("# Kruskal-Wallis on alpha diversity")
        print_message('', 'sh', main_sh, jobs)
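
# Expected shape of `diversities`, inferred from the indexing above (a
# sketch): one dict per rarefaction, whose '' key lists the non-subset
# alpha diversity vectors:
#   diversities = {'<dataset>': [{'': [('<alpha.qza>', '<metric>'), ...]},
#                                ...]}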