def edit_taxonomies(i_datasets_folder: str, taxonomies: dict, force: bool,
                    prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                    slurm: bool, run_params: dict, filt_raref: str,
                    jobs: bool, chunkit: int):
    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')
    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_edit_%s%s.sh' % (job_folder, prjct_nm,
                                                  filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, (_, qza, tsv) in taxonomies.items():
            if not isfile(tsv):
                continue
            written = 0
            out_pd = pd.read_csv(tsv, dtype=str, sep='\t')
            taxo = out_pd['Taxon'].tolist()
            taxo_edit = get_taxa_edit(taxo)
            if taxo != taxo_edit:
                out_pd['Taxon'] = taxo_edit
                out_pd.to_csv(tsv, index=False, sep='\t')
                cmd = run_import(tsv, qza, 'FeatureData[Taxonomy]')
                out_sh = '%s/run_taxonomy_edit_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref)
                if slurm:
                    out_pbs = '%s.slm' % splitext(out_sh)[0]
                else:
                    out_pbs = '%s.pbs' % splitext(out_sh)[0]
                with open(out_sh, 'w') as cur_sh:
                    cur_sh.write('echo "%s"\n' % cmd)
                    cur_sh.write('%s\n\n' % cmd)
                main_written += 1
                written += 1
            if written:
                to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(
                        out_sh, out_pbs,
                        '%s.tx.dt.%s%s' % (prjct_nm, dat, filt_raref),
                        qiime_env, run_params["time"],
                        run_params["n_nodes"], run_params["n_procs"],
                        run_params["mem_num"], run_params["mem_dim"],
                        chmod, written, 'single', o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy_edit',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Edit features taxonomy to not contain "," '
                      'characters', 'sh', run_pbs, jobs)

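
# Illustration (not called by the pipeline): `get_taxa_edit` is defined
# elsewhere in the package; judging from the message printed at the end of
# `edit_taxonomies`, it neutralizes "," characters in taxon strings, which
# otherwise break downstream per-feature parsing. A minimal, hypothetical
# stand-in (the replacement character is an assumption):
def _example_strip_commas(taxa):
    # e.g. ['k__Bacteria; g__Foo,bar'] -> ['k__Bacteria; g__Foo_bar']
    return [str(t).replace(',', '_') for t in taxa]
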
def get_jobs_folders(self, analyses_commands):
    for analysis in analyses_commands:
        self.jobs_folders[analysis] = [
            '%s/run_%s_%s%s.sh' % (
                get_job_folder(self.config.i_datasets_folder, analysis),
                analysis, self.prjct_nm, self.filt_raref),
            '%s/run_%s_%s%s' % (
                get_job_folder(self.config.i_datasets_folder,
                               '%s/chunks' % analysis),
                analysis, self.prjct_nm, self.filt_raref),
        ]

def summarize_permanova(i_datasets_folder: str, permanovas: dict,
                        prjct_nm: str, qiime_env: str, chmod: str,
                        noloc: bool, slurm: bool, split: bool,
                        run_params: dict, filt_raref: str, jobs: bool,
                        chunkit: int) -> dict:
    RESOURCES = pkg_resources.resource_filename(
        "routine_qiime2_analyses", "resources")
    summarize_fp = '%s/summarize_permanovas.py' % RESOURCES
    all_sh_pbs = {}
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'permanova_summarize/chunks')
    for dat, metrics in permanovas.items():
        metrics = [x for x in ['aitchison', 'jaccard', 'braycurtis',
                               'unweighted_unifrac', 'weighted_unifrac']
                   if x in metrics]
        permanovas[dat] = []
        out_sh = '%s/run_permanova_summarize_%s%s.sh' % (
            job_folder2, dat, filt_raref)
        out_py = '%s/run_permanova_summarize_%s%s.py' % (
            job_folder2, dat, filt_raref)
        with open(out_py, 'w') as o, open(summarize_fp) as f:
            for line in f:
                line_edit = line
                if 'DATASET' in line:
                    line_edit = line_edit.replace('DATASET', dat)
                if 'ROUTINE_FOLDER' in line:
                    line_edit = line_edit.replace('ROUTINE_FOLDER',
                                                  i_datasets_folder)
                if 'METRICS' in line:
                    line_edit = line_edit.replace('METRICS', str(metrics))
                o.write(line_edit)
        cur_sh = '%s/run_permanova_summarize_%s%s_tmp.sh' % (
            job_folder2, dat, filt_raref)
        with open(cur_sh, 'w') as o:
            o.write('python3 %s\n' % out_py)
        all_sh_pbs[(dat, out_sh)] = [cur_sh]
    job_folder = get_job_folder(i_datasets_folder, 'permanova_summarize')
    main_sh = write_main_sh(
        job_folder, '3_run_permanova_summarize%s' % filt_raref, all_sh_pbs,
        '%s.prm%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        print("# SUMMARIZE PERMANOVAS")
        print_message('', 'sh', main_sh, jobs)
    return permanovas

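
# Sketch of the template-instantiation pattern used above (illustrative):
# the resources/*.py templates carry literal placeholders (DATASET,
# ROUTINE_FOLDER, METRICS) that are string-replaced per dataset before the
# generated script is submitted. The names and values below are made up.
def _example_fill_template():
    template = "dat = 'DATASET'\nmetrics = 'METRICS'\n"
    filled = template.replace('DATASET', 'gut16S')
    filled = filled.replace('METRICS', str(['jaccard', 'braycurtis']))
    return filled  # the per-dataset script body that gets written out
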
def nestedness_nodfs(i_datasets_folder: str, nodfs_fps: dict,
                     collapsed: dict, filt_raref: str, prjct_nm: str,
                     qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                     split: bool, run_params: dict, jobs: bool,
                     chunkit: int) -> None:
    RESOURCES = pkg_resources.resource_filename(
        "routine_qiime2_analyses", "resources")
    nestedness_nodfs_fp = '%s/nestedness_nodfs.py' % RESOURCES
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'nestedness_figures/chunks')
    all_sh_pbs = {}
    for dat, nodfs in nodfs_fps.items():
        out_sh = '%s/run_nestedness_nodfs_%s_%s%s.sh' % (
            job_folder2, prjct_nm, dat, filt_raref)
        out_py = out_sh.replace('.sh', '.py')
        cur_sh = '%s/run_nestedness_nodfs_%s%s_tmp.sh' % (
            job_folder2, dat, filt_raref)
        cur_sh = cur_sh.replace(' ', '-')
        with open(cur_sh, 'w') as o:
            o.write('python3 %s\n' % out_py)
        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
        # value to edit in template
        odir = get_analysis_folder(i_datasets_folder,
                                   'nestedness/%s%s' % (dat, filt_raref))
        with open(out_py, 'w') as o, open(nestedness_nodfs_fp) as f:
            for line in f:
                line_edit = line
                if '<DAT>' in line:
                    line_edit = line_edit.replace('<DAT>', dat)
                if '<ODIR>' in line:
                    line_edit = line_edit.replace('<ODIR>', odir)
                if '<NODFS>' in line:
                    line_edit = line_edit.replace("'<NODFS>'", str(nodfs))
                if '<COLLAPSED>' in line:
                    line_edit = line_edit.replace("'<COLLAPSED>'",
                                                  str(collapsed))
                o.write(line_edit)
    job_folder = get_job_folder(i_datasets_folder, 'nestedness_figures')
    main_sh = write_main_sh(
        job_folder, 'run_nestedness_nodfs%s' % filt_raref, all_sh_pbs,
        '%s.nstd.ndf%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        print("# NESTEDNESS NODFS")
        print_message('', 'sh', main_sh, jobs)

def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str,
                       chmod: str, noloc: bool, slurm: bool, split: bool,
                       run_params: dict, filt_raref: str, jobs: bool,
                       chunkit: int) -> dict:
    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)
    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                  dat, filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in (
                    metrics_groups_metas_qzas_dms_trees.items()):
                for group, metas_qzas_mat_qzas_trees in (
                        groups_metas_qzas_dms_trees.items()):
                    for (meta, qza, mat_qza,
                         tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index(
                            'sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(
                                    case_vals, case_var).replace(' ', '_')
                                cur_sh = ('%s/run_decay_%s%s_%s_%s_%s%s.sh'
                                          % (job_folder2, dat, cur_raref,
                                             metric, group, case,
                                             filt_raref))
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh,
                                    mat_qza, case, modes, force,
                                    run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
    job_folder = get_job_folder(i_datasets_folder, 'decay')
    main_sh = write_main_sh(
        job_folder, '3_run_decay_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.prm%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_distance_decay:
            print("# decay (config in %s)" % p_distance_decay)
        else:
            print("# decay")
        print_message('', 'sh', main_sh, jobs)
    return decay_res

def run_taxonomy(method: str, i_datasets_folder: str, datasets: dict,
                 datasets_read: dict, datasets_phylo: dict,
                 datasets_features: dict, datasets_filt_map: dict,
                 i_classifier: str, taxonomies: dict, force: bool,
                 prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                 slurm: bool, run_params: dict, filt_raref: str,
                 jobs: bool, chunkit: int) -> None:
    """
    Parameters
    ----------
    method
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders.
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path].
    datasets_read : dict
        Mapping dataset name -> [data table, metadata table].
    datasets_phylo : dict
        To be updated with ('tree_to_use', 'corrected_or_not')
        per dataset.
    datasets_features : dict
        Mapping dataset name -> list of feature names in the dataset
        tsv / biom file.
    datasets_filt_map : dict
    i_classifier : str
        Path to the taxonomic classifier.
    taxonomies : dict
        Mapping dataset name -> [method, assignment qza].
    force : bool
        Force the re-writing of scripts for all commands.
    prjct_nm : str
        Short nick name for your project.
    qiime_env : str
        Name of your qiime2 conda environment (e.g. qiime2-2019.10).
    chmod : str
        Whether to change permission of output files (default: 744).
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------
    """
    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')
    amplicon_datasets = [dat for dat, (tree, correction)
                         in datasets_phylo.items() if tree == 'amplicon']
    wol_datasets = [dat for dat, (tree, correction)
                    in datasets_phylo.items() if tree == 'wol']
    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_%s%s.sh' % (job_folder, prjct_nm,
                                             filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets_read.items():
            out_sh = '%s/run_taxonomy_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                     dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            if dat in datasets_filt_map:
                taxonomies[dat] = taxonomies[datasets_filt_map[dat]]
                continue
            written = 0
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    if idx:
                        continue
                    tsv, meta = datasets[dat][idx]
                    if not isinstance(tsv_meta_pds[0], pd.DataFrame) and \
                            tsv_meta_pds[0] == 'raref':
                        if not isfile(tsv):
                            print('Must have run rarefaction to use it '
                                  'further...\nExiting')
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = tsv_meta_pds
                    odir = get_analysis_folder(i_datasets_folder,
                                               'taxonomy/%s' % dat)
                    out_rad = '%s/tax_%s' % (odir, dat)
                    if dat in amplicon_datasets:
                        out_qza = '%s_%s.qza' % (out_rad, method)
                        out_tsv = '%s.tsv' % splitext(out_qza)[0]
                        taxonomies[dat] = [method, out_qza, out_tsv]
                        if not i_classifier:
                            print('No classifier passed for 16S '
                                  'data\nExiting...')
                            continue
                        cmd = run_taxonomy_amplicon(
                            dat, i_datasets_folder, force, tsv_pd, out_qza,
                            out_tsv, i_classifier)
                    else:
                        out_qza = '%s.qza' % out_rad
                        out_tsv = '%s.tsv' % out_rad
                        if dat in wol_datasets:
                            cur_datasets_features = datasets_features[dat]
                            taxonomies[dat] = ['wol', out_qza, out_tsv]
                            cmd = run_taxonomy_wol(
                                force, tsv_pd, out_qza, out_tsv,
                                cur_datasets_features)
                        else:
                            if len([x for x in tsv_pd.index
                                    if str(x).isdigit()]
                                   ) == tsv_pd.shape[0]:
                                continue
                            taxonomies[dat] = ['feat', out_qza, out_tsv]
                            cmd = run_taxonomy_others(
                                force, tsv_pd, out_qza, out_tsv)
                    if cmd:
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n\n' % cmd)
                        main_written += 1
                        written += 1
            if written:
                to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(
                        out_sh, out_pbs,
                        '%s.tx.sklrn.%s%s' % (prjct_nm, dat, filt_raref),
                        qiime_env, run_params["time"],
                        run_params["n_nodes"], run_params["n_procs"],
                        run_params["mem_num"], run_params["mem_dim"],
                        chmod, written, 'single', o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Classify features using classify-sklearn', 'sh',
                      run_pbs, jobs)

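
# Illustration: for amplicon datasets, `run_taxonomy_amplicon` (defined
# elsewhere) is expected to emit a QIIME 2 classify-sklearn call of roughly
# this shape. The helper's exact flags and paths may differ; this builder
# only mirrors the standard `qiime feature-classifier classify-sklearn`
# interface.
def _example_classify_sklearn_cmd(classifier_qza, reads_qza, out_qza):
    # hypothetical command builder; all three paths are placeholders
    return ('qiime feature-classifier classify-sklearn '
            '--i-classifier %s --i-reads %s --o-classification %s'
            % (classifier_qza, reads_qza, out_qza))
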
def run_collapse(i_datasets_folder: str, datasets: dict,
                 datasets_filt: dict, datasets_read: dict,
                 datasets_features: dict, datasets_phylo: dict,
                 split_taxa_pds: dict, taxonomies: dict,
                 p_collapse_taxo: str, datasets_rarefs: dict,
                 datasets_collapsed: dict, datasets_collapsed_map: dict,
                 force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                 noloc: bool, slurm: bool, run_params: dict,
                 filt_raref: str, jobs: bool) -> dict:
    collapse_taxo = get_collapse_taxo(p_collapse_taxo, datasets_filt)
    main_written = 0
    collapsed = {}
    datasets_update = {}
    datasets_read_update = {}
    datasets_features_update = {}
    datasets_phylo_update = {}
    stop_for_collapse = False
    job_folder = get_job_folder(i_datasets_folder, 'collapsed_taxo')
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'collapsed_taxo/chunks')
    run_pbs = '%s/3_run_collapsed_taxo_%s%s.sh' % (job_folder, prjct_nm,
                                                   filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tab_meta_fps in datasets.items():
            if dat not in collapse_taxo:
                continue
            # get the taxonomic levels
            collapse_levels = collapse_taxo[dat]
            split_taxa_pd, split_taxa_fp = split_taxa_pds[dat]
            split_levels, remove_empties = get_split_levels(
                collapse_levels, split_taxa_pd)
            collapsed[dat] = split_levels
            # files that will be collapsed using qiime2
            tax_qza, tax_fp = taxonomies[dat][1:]
            written = 0
            out_sh = '%s/run_collapsed_taxo_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tab_meta_fp in enumerate(tab_meta_fps):
                    tab_fp, meta_fp = tab_meta_fp
                    tab_qza = '%s.qza' % splitext(tab_fp)[0]
                    for tax, level in split_levels.items():
                        coll_paths = collapse_paths(dat, tax, tab_fp,
                                                    meta_fp)
                        dat_tax = coll_paths[0]
                        dat_coll = coll_paths[1]
                        coll_tsv = coll_paths[2]
                        coll_qza = coll_paths[3]
                        coll_meta = coll_paths[4]
                        if isfile(coll_tsv) and isfile(coll_meta):
                            coll_pd = pd.read_csv(coll_tsv, index_col=0,
                                                  header=0, sep='\t')
                            coll_meta_pd = pd.read_csv(
                                coll_meta, header=0, sep='\t',
                                dtype={'sample_name': str})
                            if coll_pd.shape[0] < 5:
                                continue
                            cmd = fix_collapsed_data(
                                remove_empties[tax], coll_pd, coll_tsv,
                                coll_qza, coll_meta)
                            if cmd:
                                cur_sh.write('echo "%s"\n' % cmd)
                                cur_sh.write('%s\n\n' % cmd)
                                main_written += 1
                                written += 1
                            datasets_read_update.setdefault(
                                dat_tax, []).append(
                                    [coll_pd, coll_meta_pd])
                            datasets_collapsed.setdefault(
                                dat, []).append(dat_coll)
                            datasets_collapsed_map[dat_coll] = dat
                            datasets_update.setdefault(
                                dat_tax, []).append([coll_tsv, coll_meta])
                            datasets_rarefs.setdefault(
                                dat_tax, []).append(
                                    datasets_rarefs[dat][idx])
                            datasets_phylo_update[dat_tax] = ('', 0)
                        else:
                            written += 1
                            main_written += 1
                            stop_for_collapse = True
                            cmd = write_collapse_taxo(
                                tab_qza, tax_qza, coll_qza, coll_tsv,
                                meta_fp, coll_meta, level,
                                remove_empties[tax])
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
            if written:
                run_xpbs(
                    out_sh, out_pbs,
                    '%s.cllps.%s%s' % (prjct_nm, dat, filt_raref),
                    qiime_env, run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o,
                    noloc, slurm, jobs)
    if main_written:
        print_message('# Collapse features for taxo levels defined in %s'
                      % p_collapse_taxo, 'sh', run_pbs, jobs)
    if stop_for_collapse:
        print('Stopping here as this collapse must be run first for other '
              'analyses to work')
        sys.exit(0)
    datasets.update(datasets_update)
    datasets_read.update(datasets_read_update)
    datasets_phylo.update(datasets_phylo_update)
    return collapsed

def shear_tree(i_datasets_folder: str, datasets: dict,
               datasets_read: dict, datasets_phylo: dict,
               datasets_features: dict, prjct_nm: str, i_wol_tree: str,
               trees: dict, datasets_rarefs: dict, force: bool,
               qiime_env: str, chmod: str, noloc: bool, slurm: bool,
               run_params: dict, filt_raref: str, jobs: bool) -> None:
    """
    Get the sub-tree from the Web of Life tree that corresponds
    to the gOTUs-labeled features.

    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with
        ('tree_to_use', 'corrected_or_not') per dataset.
    :param datasets_features: dataset -> list of feature names in the
        dataset tsv / biom file.
    :param prjct_nm: Short nick name for your project.
    :param i_wol_tree: default on barnacle
        /projects/wol/profiling/dbs/wol/phylogeny/wol_tree.nwk.
    :param trees: to be updated with the tree to use for a dataset's
        phylogenetic analyses.
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment
        (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files
        (default: 775).
    """
    # check whether there's dataset(s) that may use
    # the Web of Life tree (i.e. features contain gID)
    wol_datasets = [dat for dat, (tree, correction)
                    in datasets_phylo.items() if tree == 'wol']
    if len(wol_datasets):
        job_folder = get_job_folder(i_datasets_folder, 'phylo')
        job_folder2 = get_job_folder(i_datasets_folder, 'phylo/chunks')
        i_wol_tree = get_wol_tree(i_wol_tree)
        wol = TreeNode.read(i_wol_tree)
        main_written = 0
        main_sh = '%s/0_run_import_trees_%s%s.sh' % (job_folder, prjct_nm,
                                                     filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, tsv_metas_fps_ in datasets.items():
                written = 0
                if dat not in wol_datasets:
                    continue
                out_sh = '%s/run_import_tree_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref)
                if slurm:
                    out_pbs = out_sh.replace('.sh', '.slm')
                else:
                    out_pbs = out_sh.replace('.sh', '.pbs')
                with open(out_sh, 'w') as o:
                    for idx, tsv_metas_fps in enumerate(tsv_metas_fps_):
                        tsv, meta = tsv_metas_fps
                        if not isinstance(datasets_read[dat][idx][0],
                                          pd.DataFrame) and \
                                datasets_read[dat][idx][0] == 'raref':
                            if not isfile(tsv):
                                print('Must have run rarefaction to use '
                                      'it further...\nExiting')
                                sys.exit(0)
                            tsv_pd, meta_pd = get_raref_tab_meta_pds(
                                meta, tsv)
                            datasets_read[dat][idx] = [tsv_pd, meta_pd]
                        else:
                            tsv_pd, meta_pd = datasets_read[dat][idx]
                        cur_raref = datasets_rarefs[dat][idx]
                        cur_datasets_features = dict(
                            gid for gid in datasets_features[dat].items()
                            if gid[1] in tsv_pd.index)
                        analysis_folder = get_analysis_folder(
                            i_datasets_folder, 'phylo/%s' % dat)
                        wol_features_fpo = '%s/tree_%s%s.nwk' % (
                            analysis_folder, dat, cur_raref)
                        wol_features_qza = wol_features_fpo.replace(
                            '.nwk', '.qza')
                        # if idx:
                        #     trees[dat].append(('', wol_features_qza))
                        # else:
                        #     trees[dat] = [('', wol_features_qza)]
                        if not idx:
                            trees[dat] = ('', wol_features_qza)
                        if force or not isfile(wol_features_qza):
                            wol_features = wol.shear(
                                list(cur_datasets_features.keys()))
                            # rename the tips per the features names
                            # associated with each gID
                            for tip in wol_features.tips():
                                tip.name = cur_datasets_features[tip.name]
                            wol_features.write(wol_features_fpo)
                            cmd = run_import(wol_features_fpo,
                                             wol_features_qza,
                                             "Phylogeny[Rooted]")
                            o.write("echo '%s'\n" % cmd)
                            o.write('%s\n\n' % cmd)
                            written += 1
                            main_written += 1
                run_xpbs(
                    out_sh, out_pbs,
                    '%s.shr.%s%s' % (prjct_nm, dat, filt_raref),
                    qiime_env, run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single',
                    main_o, noloc, slurm, jobs)
        if main_written:
            print_message("# Shear Web of Life tree to features' genome "
                          "IDs (%s)" % ', '.join(wol_datasets), 'sh',
                          main_sh, jobs)

def run_doc(i_datasets_folder: str, datasets: dict, p_doc_config: str,
            datasets_rarefs: dict, force: bool, prjct_nm: str,
            qiime_env: str, chmod: str, noloc: bool, slurm: bool,
            run_params: dict, filt_raref: str, phates: dict,
            doc_phate: bool, split: bool, jobs: bool,
            chunkit: int) -> None:
    job_folder2 = get_job_folder(i_datasets_folder, 'doc/chunks')
    doc_filtering, doc_params, main_cases_dict = get_doc_config(
        p_doc_config)
    all_sh_pbs = {}
    all_import_sh_pbs = {}
    dat_cases_tabs = {}
    need_to_run_phate = []
    need_to_run_less_phate = []
    for dat, tsv_meta_pds_ in datasets.items():
        dat_cases_tabs[dat] = {}
        if dat in doc_filtering:
            filters = doc_filtering[dat]
        else:
            filters = {'0-0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            dat_phates = []
            if dat in phates:
                dat_phates = phates[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'DOC')
            cur_raref = datasets_rarefs[dat][idx]
            dat_cases_tabs[dat][cur_raref] = {}
            if not split:
                out_sh = '%s/run_doc_%s%s%s.sh' % (
                    job_folder2, dat, filt_raref, cur_raref)
                out_import_sh = '%s/run_import_doc_%s%s%s.sh' % (
                    job_folder2, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
            for filt, (fp, fa) in filters.items():
                if split:
                    out_sh = '%s/run_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                    out_import_sh = '%s/run_import_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault(
                        (dat, out_import_sh), []).append(cur_import_sh)
                    cases = run_single_doc(
                        i_datasets_folder, odir, tsv, meta_pd, case_var,
                        doc_params, case_vals_list, cur_sh, cur_import_sh,
                        force, filt, cur_raref, fp, fa,
                        run_params["n_nodes"], run_params["n_procs"],
                        dat_phates, doc_phate, need_to_run_phate,
                        need_to_run_less_phate)
                    dat_cases_tabs[dat][cur_raref].setdefault(
                        case_var, []).extend(cases)
    for need_to_run in need_to_run_phate:
        print(' -', need_to_run)
    job_folder = get_job_folder(i_datasets_folder, 'doc')
    main_sh = write_main_sh(
        job_folder, '3_run_import_doc%s' % filt_raref, all_import_sh_pbs,
        '%s.doc.mpt%s' % (prjct_nm, filt_raref), "4", "1", "1", "500",
        "mb", qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# Import for DOC (groups config in %s)' % p_doc_config)
        else:
            print('# Import DOC')
        print_message('', 'sh', main_sh, jobs)
    main_sh = write_main_sh(
        job_folder, '3_run_doc%s' % filt_raref, all_sh_pbs,
        '%s.doc%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# DOC (groups config in %s)' % p_doc_config)
        else:
            print('# DOC')
        print_message('', 'sh', main_sh, jobs)

    do_r = 1
    if do_r:
        job_folder = get_job_folder(i_datasets_folder, 'doc/R')
        job_folder2 = get_job_folder(i_datasets_folder, 'doc/R/chunks')
        main_written = 0
        main_sh = '%s/run_R_doc%s.sh' % (job_folder, filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, raref_case_var_cases in dat_cases_tabs.items():
                shs = []
                written = 0
                odir = get_analysis_folder(i_datasets_folder,
                                           'doc/%s' % dat)
                log_error = '%s/log.error' % odir
                for raref, case_var_cases in raref_case_var_cases.items():
                    for case_var, cases in case_var_cases.items():
                        for cdx, case in enumerate(cases):
                            plot = '%s_%s_%s_%s' % (dat, raref, case_var,
                                                    cdx)
                            case_r = '%s/R' % case
                            pdf = '%s/plot.pdf' % case_r
                            do = '%s/DO.tsv' % case_r
                            if not isfile(pdf):
                                cur_r = ('%s/run_R_doc_%s_%s_%s_vanilla.R'
                                         % (job_folder2, dat, case_var,
                                            cdx))
                                cur_sh = 'echo "*** %s" >> %s\n' % (
                                    plot, log_error)
                                cur_sh += 'R -f %s --vanilla 2>> %s\n' % (
                                    cur_r, log_error)
                                cur_sh += 'echo "end" >> %s\n' % log_error
                                shs.append(cur_sh)
                                with open(cur_r, 'w') as o:
                                    o.write("library(DOC)\n")
                                    o.write("library(ggplot2)\n")
                                    if not isfile(do):
                                        o.write("otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, nrows=2)\n" % case)
                                        o.write("index_name <- colnames(otu)[1]\n")
                                        o.write("otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, row.names=index_name)\n" % case)
                                        o.write("if (dim(otu)[1] > 100) {\n")
                                        o.write("  res <- DOC(otu)\n")
                                        o.write("  res.null <- DOC.null(otu)\n")
                                        o.write("  write.table(x=res$DO, file='%s/DO.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res$LME, file='%s/LME.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  colnames(res$NEG) <- c('Neg_Slope', 'Data')\n")
                                        o.write("  write.table(x=res$NEG, file='%s/NEG.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res$FNS, file='%s/FNS.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res$BOOT, file='%s/BOOT.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res$CI, file='%s/CI.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res.null$DO, file='%s/null_DO.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res.null$LME, file='%s/null_LME.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  colnames(res.null$NEG) <- c('Neg_Slope', 'Data')\n")
                                        o.write("  write.table(x=res.null$NEG, file='%s/null_NEG.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res.null$FNS, file='%s/null_FNS.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res.null$BOOT, file='%s/null_BOOT.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("  write.table(x=res.null$CI, file='%s/null_CI.tsv', sep='\\t', quote=F, row.names=F)\n" % case_r)
                                        o.write("}\n")
                                    o.write("res = list(BOOT=read.table('%s/BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/CI.tsv', h=T, sep='\\t'), DO=read.table('%s/DO.tsv', h=T, sep='\\t'), LME=read.table('%s/LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/NEG.tsv', h=T, sep='\\t'))\n" % (case_r, case_r, case_r, case_r, case_r, case_r))
                                    o.write("res.null = list(BOOT=read.table('%s/null_BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/null_CI.tsv', h=T, sep='\\t'), DO=read.table('%s/null_DO.tsv', h=T, sep='\\t'), LME=read.table('%s/null_LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/null_FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/null_NEG.tsv', h=T, sep='\\t'))\n" % (case_r, case_r, case_r, case_r, case_r, case_r))
                                    o.write("colnames(res$NEG) <- c('Neg.Slope', 'Data')\n")
                                    o.write("colnames(res.null$NEG) <- c('Neg.Slope', 'Data')\n")
                                    o.write("res$DO <- res$DO[which(res$DO$Overlap <= 1),]\n")
                                    o.write("res.null$DO <- res.null$DO[which(res.null$DO$Overlap <= 1),]\n")
                                    o.write("pdf('%s')\n" % pdf)
                                    o.write("merged <- DOC.merge(list(s_%s = res, s_%s=res.null))\n" % (plot, plot))
                                    o.write("plot(merged)\n")
                                    o.write("dev.off()\n")
                                main_written += 1
                                written += 1
                if written:
                    if chunkit and len(shs) >= chunkit:
                        chunks = [list(x) for x in
                                  np.array_split(np.array(shs), chunkit)]
                    elif split and len(shs) >= 3:
                        chunks = [list(x) for x in
                                  np.array_split(np.array(shs), 3)]
                    else:
                        chunks = [shs]
                    for cdx, chunk in enumerate(chunks):
                        out_sh = '%s/run_R_doc_%s%s_%s.sh' % (
                            job_folder2, dat, filt_raref, cdx)
                        out_pbs = '%s.pbs' % splitext(out_sh)[0]
                        with open(out_sh, 'w') as o:
                            for c in chunk:
                                o.write('echo "%s"\n\n' % c)
                                o.write('%s\n\n' % c)
                        run_xpbs(
                            out_sh, out_pbs,
                            '%s.doc.R.%s%s_%s' % (prjct_nm, dat,
                                                  filt_raref, cdx),
                            'xdoc', run_params["time"],
                            run_params["n_nodes"], run_params["n_procs"],
                            run_params["mem_num"], run_params["mem_dim"],
                            chmod, written, 'single', main_o, noloc,
                            slurm, jobs)
        if main_written:
            print_message('# DOC (R)', 'sh', main_sh, jobs)

def filter_rare_samples(i_datasets_folder: str, datasets: dict,
                        datasets_read: dict, datasets_features: dict,
                        datasets_rarefs: dict, datasets_filt: dict,
                        datasets_filt_map: dict, datasets_phylo: dict,
                        prjct_nm: str, qiime_env: str,
                        p_filt_threshs: str, chmod: str, noloc: bool,
                        run_params: dict, filt_raref: str, jobs: bool,
                        slurm: bool, chunkit: int) -> None:
    """
    Filter the rare features, keep samples with enough reads/features
    and import to Qiime2.

    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_features: dataset -> list of feature names in the
        dataset tsv / biom file.
    :param datasets_phylo: to be updated with
        ('tree_to_use', 'corrected_or_not') per dataset.
    :param prjct_nm: Short nick name for your project.
    :param qiime_env: name of your qiime2 conda environment
        (e.g. qiime2-2019.10).
    :param p_filt_threshs: config file with the min number of reads per
        sample/feature to keep them.
    :param chmod: whether to change permission of output files
        (default: 775).
    """
    threshs_dats = read_yaml_file(p_filt_threshs)
    written = 0
    datasets_update = {}
    datasets_read_update = {}
    datasets_features_update = {}
    datasets_phylo_update = {}
    job_folder = get_job_folder(i_datasets_folder, 'import_filtered')
    out_sh = '%s/1_run_import_filtered_%s%s.sh' % (job_folder, prjct_nm,
                                                   filt_raref)
    if slurm:
        out_pbs = '%s.slm' % splitext(out_sh)[0]
    else:
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
    to_chunk = []
    with open(out_sh, 'w') as sh:
        for dat, tab_meta_pds_ in datasets_read.items():
            if dat not in threshs_dats:
                continue
            names, thresh_sam, thresh_feat = get_thresholds(
                threshs_dats[dat])
            if no_filtering(dat, thresh_sam, thresh_feat):
                continue
            dat_filt = get_dat_filt(dat, names, thresh_sam, thresh_feat)
            datasets_filt[dat] = dat_filt
            datasets_filt_map[dat_filt] = dat
            datasets_rarefs[dat_filt] = ['']
            tsv_filt, qza_filt, meta_filt = get_fps(i_datasets_folder,
                                                    dat_filt)
            if isfile(qza_filt) and isfile(meta_filt):
                datasets_update[dat_filt] = [[tsv_filt, meta_filt]]
                tab_filt_pd = pd.read_csv(tsv_filt, index_col=0, header=0,
                                          sep='\t')
                with open(meta_filt) as f:
                    for line in f:
                        break
                meta_filt_pd = pd.read_csv(
                    meta_filt, header=0, sep='\t',
                    dtype={line.split('\t')[0]: str}, low_memory=False)
                # datasets_read_update[dat_filt] = [tab_filt_pd,
                #                                   meta_filt_pd]
                datasets_read_update[dat_filt] = [[tab_filt_pd,
                                                   meta_filt_pd]]
                datasets_phylo_update[dat_filt] = datasets_phylo[dat]
                datasets_features_update[dat_filt] = dict(
                    gid_feat for gid_feat
                    in datasets_features[dat].items()
                    if gid_feat[1] in tab_filt_pd.index)
                continue
            for (tab_pd, meta_pd) in tab_meta_pds_:
                tab_filt_pd = filtering_thresholds(
                    names, thresh_sam, thresh_feat, tab_pd)
                if harsh_filtering(dat_filt, tab_filt_pd):
                    continue
                meta_filt_pd = meta_pd.loc[meta_pd.sample_name.isin(
                    tab_filt_pd.columns.tolist())].copy()
                tab_filt_pd.reset_index().to_csv(tsv_filt, index=False,
                                                 sep='\t')
                meta_filt_pd.to_csv(meta_filt, index=False, sep='\t')
                datasets_update[dat_filt] = [[tsv_filt, meta_filt]]
                datasets_read_update[dat_filt] = [[tab_filt_pd,
                                                   meta_filt_pd]]
                datasets_phylo_update[dat_filt] = datasets_phylo[dat]
                datasets_features_update[dat_filt] = dict(
                    gid_feat for gid_feat
                    in datasets_features[dat].items()
                    if gid_feat[1] in tab_filt_pd.index)
                cmd = run_import(tsv_filt, qza_filt,
                                 "FeatureTable[Frequency]")
                sh.write('echo "%s"\n' % cmd)
                sh.write('%s\n' % cmd)
                written += 1
    if written:
        run_xpbs(
            out_sh, out_pbs, '%s.fltr%s' % (prjct_nm, filt_raref),
            qiime_env, run_params["time"], run_params["n_nodes"],
            run_params["n_procs"], run_params["mem_num"],
            run_params["mem_dim"], chmod, written,
            '# Filter samples for a min number of %s reads'
            % p_filt_threshs, None, noloc, slurm, jobs)
    # after this update, the raw datasets remain included
    datasets.update(datasets_update)
    datasets_read.update(datasets_read_update)
    datasets_features.update(datasets_features_update)
    datasets_phylo.update(datasets_phylo_update)

def run_phate(p_phate_config: str, i_datasets_folder: str, datasets: dict,
              datasets_rarefs: dict, force: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, slurm: bool,
              split: bool, run_params: dict, filt_raref: str, jobs: bool,
              chunkit: int) -> dict:
    job_folder2 = get_job_folder(i_datasets_folder, 'phate/chunks')
    phate_dicts = get_phate_dicts(p_phate_config)
    phate_filtering, phate_labels, phate_params, main_cases_dict = \
        phate_dicts
    phates = {}
    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        phates[dat] = []
        if dat in phate_filtering:
            filters = phate_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'phate')
            cur_raref = datasets_rarefs[dat][idx]
            if not split:
                out_sh = '%s/run_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
                out_import_sh = '%s/run_import_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'phate/%s' % dat)
            raref_phates = {}
            for filt, (fp, fa) in filters.items():
                raref_phates[filt] = {}
                if split:
                    out_sh = '%s/run_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                    out_import_sh = ('%s/run_import_phate_%s_%s%s%s%s.sh'
                                     % (job_folder2, prjct_nm, dat,
                                        filt_raref, cur_raref, filt))
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = ('%s/run_import_phate_%s_%s%s%s_%s_%s'
                                     '.sh' % (job_folder2, prjct_nm, dat,
                                              filt_raref, cur_raref,
                                              case_var, filt))
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault(
                        (dat, out_import_sh), []).append(cur_import_sh)
                    phate = run_single_phate(
                        dat, odir, tsv, meta_pd, case_var, phate_labels,
                        phate_params, run_params, case_vals_list, cur_sh,
                        cur_import_sh, force, filt, cur_raref, fp, fa)
                    raref_phates[filt][case_var] = phate
            phates[dat].append(raref_phates)
    job_folder = get_job_folder(i_datasets_folder, 'phate')
    main_sh = write_main_sh(
        job_folder, '3_run_import_phate_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mrt.pht%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# Import for PHATE (groups config in %s)'
                  % p_phate_config)
        else:
            print('# Import for PHATE')
        print_message('', 'sh', main_sh, jobs)
    main_sh = write_main_sh(
        job_folder, '3_run_phate_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.pht%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], 'xphate', chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# PHATE (groups config in %s)' % p_phate_config)
        else:
            print('# PHATE')
        print_message('', 'sh', main_sh, jobs)
    return phates

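
# Illustration: `run_single_phate` (defined elsewhere) drives the PHATE
# embeddings per dataset subset. Assuming it ultimately builds around the
# `phate` package's standard estimator API, the core call looks roughly
# like this (the count matrix below is made up):
def _example_phate_embedding():
    import numpy as np
    import phate
    counts = np.random.RandomState(0).poisson(5, size=(50, 20))
    op = phate.PHATE(n_components=2, random_state=0)
    return op.fit_transform(counts)  # (50, 2) embedding coordinates
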
def run_songbird(p_diff_models: str, i_datasets_folder: str,
                 datasets: dict, datasets_read: dict, datasets_filt: dict,
                 train_test_dict: dict, input_to_filtered: dict,
                 mmvec_outputs: list, force: bool, prjct_nm: str,
                 qiime_env: str, chmod: str, noloc: bool, split: bool,
                 run_params: dict, filt_raref: str, jobs: bool,
                 chunkit: int) -> list:
    """
    Run songbird: Vanilla regression methods for microbiome
    differential abundance analysis.
    https://github.com/biocore/songbird
    Main per-dataset looper for the songbird datasets.

    :param p_diff_models: Formulas for multinomial regression-based
        differential abundance ranking.
    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files
        (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'songbird')
    job_folder2 = get_job_folder(i_datasets_folder, 'songbird/chunks')
    songbird_dicts = get_songbird_dicts(p_diff_models)
    songbird_models = songbird_dicts[0]
    songbird_filtering = songbird_dicts[1]
    unique_filtering = get_unique_filterings(songbird_filtering)
    params = songbird_dicts[2]
    models_baselines = songbird_dicts[3]
    songbird_datasets = songbird_dicts[4]
    songbird_subsets = songbird_dicts[5]
    trains = params['train']
    batches = params['batches']
    learns = params['learns']
    epochs = params['epochs']
    thresh_feats = params['thresh_feats']
    thresh_samples = params['thresh_samples']
    diff_priors = params['diff_priors']
    summary_intervals = params['summary_interval']
    filt_datasets_done, common_datasets_done = \
        check_filtered_and_common_dataset(
            i_datasets_folder, datasets, datasets_filt, songbird_datasets,
            {}, songbird_filtering, unique_filtering, 'songbird',
            input_to_filtered, songbird_subsets)
    already_computed = {}
    filt_datasets, common_datasets = make_filtered_and_common_dataset(
        i_datasets_folder, datasets, datasets_filt, datasets_read,
        songbird_datasets, train_test_dict, {}, songbird_filtering,
        unique_filtering, job_folder, force, prjct_nm, qiime_env, chmod,
        noloc, 'songbird', filt_raref, filt_datasets_done,
        common_datasets_done, input_to_filtered, already_computed,
        songbird_subsets, jobs)
    songbird_models.update(
        dict((input_to_filtered[x], y)
             for x, y in songbird_models.items()
             if x in input_to_filtered))
    songbirds = {}
    for dat, filts_files in filt_datasets.items():
        for (case, filts), files in filts_files.items():
            songbirds.setdefault(dat[0], []).append(
                [case, filts, files[0], files[2], ''])

def nestedness_graphs(i_datasets_folder: str, nestedness_res: dict,
                      datasets: dict, split_taxa_pds: dict,
                      datasets_rarefs: dict, colors: dict,
                      datasets_collapsed_map: dict, collapsed: dict,
                      filt_raref: str, prjct_nm: str, qiime_env: str,
                      chmod: str, noloc: bool, slurm: bool, split: bool,
                      run_params: dict, jobs: bool, chunkit: int):
    RESOURCES = pkg_resources.resource_filename(
        "routine_qiime2_analyses", "resources")
    nestedness_graphs_fp = '%s/nestedness_graphs.py' % RESOURCES
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'nestedness_figures/chunks')
    all_sh_pbs = {}
    for dat, nestedness_rarefs in nestedness_res.items():
        if not split:
            out_sh = '%s/run_nestedness_graphs_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        stats_tax_dat, level = get_stats_tax_dat(dat,
                                                 datasets_collapsed_map)
        for idx, nestedness_raref in enumerate(nestedness_rarefs):
            cur_raref = datasets_rarefs[dat][idx]
            if split:
                out_sh = '%s/run_nestedness_graphs_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            out_py = out_sh.replace('.sh', '.py')
            cur_sh = '%s/run_nestedness_graphs_%s%s%s_tmp.sh' % (
                job_folder2, dat, cur_raref, filt_raref)
            cur_sh = cur_sh.replace(' ', '-')
            with open(cur_sh, 'w') as o:
                o.write('python3 %s\n' % out_py)
            all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
            # value to edit in template
            tab_fp, meta_fp = datasets[dat][idx]
            if stats_tax_dat in split_taxa_pds:
                split_taxa_fp = split_taxa_pds[stats_tax_dat][1]
            else:
                split_taxa_fp = ''
            with open(out_py, 'w') as o, open(nestedness_graphs_fp) as f:
                for line in f:
                    line_edit = line
                    if '<DAT>' in line:
                        line_edit = line_edit.replace('<DAT>', dat)
                    if '<CUR_RAREF>' in line:
                        line_edit = line_edit.replace('<CUR_RAREF>',
                                                      cur_raref)
                    if '<TAB_FP>' in line:
                        line_edit = line_edit.replace('<TAB_FP>', tab_fp)
                    if '<META_FP>' in line:
                        line_edit = line_edit.replace('<META_FP>',
                                                      meta_fp)
                    if '<COLORS_SAMPLE>' in line:
                        line_edit = line_edit.replace(
                            "'<COLORS_SAMPLE>'", str(colors['sample']))
                    if '<COLORS_FEATURE>' in line:
                        line_edit = line_edit.replace(
                            "'<COLORS_FEATURE>'", str(colors['feature']))
                    if '<STATS_TAX_DAT>' in line:
                        line_edit = line_edit.replace('<STATS_TAX_DAT>',
                                                      stats_tax_dat)
                    if '<SPLIT_TAXA_FP>' in line:
                        line_edit = line_edit.replace('<SPLIT_TAXA_FP>',
                                                      split_taxa_fp)
                    if '<LEVEL>' in line:
                        line_edit = line_edit.replace('<LEVEL>', level)
                    if '<COLLAPSED>' in line:
                        line_edit = line_edit.replace("'<COLLAPSED>'",
                                                      str(collapsed))
                    if '<NESTEDNESS_RAREF>' in line:
                        line_edit = line_edit.replace(
                            "'<NESTEDNESS_RAREF>'", str(nestedness_raref))
                    o.write(line_edit)
    job_folder = get_job_folder(i_datasets_folder, 'nestedness_figures')
    main_sh = write_main_sh(
        job_folder, 'run_nestedness_graphs%s' % filt_raref, all_sh_pbs,
        '%s.nstd.grph%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        print("# NESTEDNESS GRAPHS")
        print_message('', 'sh', main_sh, jobs)

                            if testing_group == 'ALL':
                                continue
                            cur_sh = ('%s/run_beta_group_significance'
                                      '_%s%s_%s_%s_%s_%s%s.sh' % (
                                          job_folder2, dat, cur_depth,
                                          metric, subset, case,
                                          testing_group, filt_raref))
                            cur_sh = cur_sh.replace(' ', '-')
                            all_sh_pbs.setdefault(
                                (dat, out_sh), []).append(cur_sh)
                            run_single_perm(
                                odir, subset, meta_pd, cur_sh, metric,
                                case, testing_group, p_perm_tests_min,
                                p_beta_type, qza, mat_qza, case_var,
                                case_vals, npermutations, force)
    job_folder = get_job_folder(i_datasets_folder, 'permanova')
    main_sh = write_main_sh(
        job_folder,
        '3_run_beta_group_significance_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.prm%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# PERMANOVA (groups config in %s)" % p_perm_groups)
        else:
            print("# PERMANOVA")
        print_message('', 'sh', main_sh, jobs)
    return permanovas

def run_rarefy(i_datasets_folder: str, datasets: dict, datasets_read: dict,
               datasets_phylo: dict, datasets_filt_map: dict,
               datasets_rarefs: dict, p_raref_depths: str,
               eval_rarefs: bool, force: bool, prjct_nm: str,
               qiime_env: str, chmod: str, noloc: bool, run_params: dict,
               filt_raref: str, filt_only: bool, jobs: bool, slurm: bool,
               chunkit: int) -> dict:
    """
    Run rarefy: Rarefy table.
    https://docs.qiime2.org/2019.10/plugins/available/feature-table/rarefy/

    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with
        ('tree_to_use', 'corrected_or_not') per dataset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files
        (default: 775).
    :return: rarefaction depths per dataset (for evaluation).
    """
    evaluation = ''
    eval_depths = {}
    datasets_raref_depths, datasets_raref_evals = check_rarefy_need(
        i_datasets_folder, datasets_read, p_raref_depths)
    if eval_rarefs:
        evaluation = '_eval'
    set_filt_rarefy(datasets_raref_depths, datasets_filt_map)
    datasets_update = {}
    datasets_read_update = {}
    datasets_phylo_update = {}
    datasets_append = {}
    main_written = 0
    job_folder = get_job_folder(i_datasets_folder,
                                'rarefy%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'rarefy%s/chunks' % evaluation)
    to_chunk = []
    run_pbs = '%s/1_run_rarefy_%s%s%s.sh' % (job_folder, prjct_nm,
                                             evaluation, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            if dat not in datasets_raref_depths:
                continue
            if filt_only and dat not in datasets_filt_map:
                continue
            odir = get_analysis_folder(
                i_datasets_folder, 'rarefy%s/%s' % (evaluation, dat))
            out_sh = '%s/run_rarefy_%s%s_%s.sh' % (job_folder2, prjct_nm,
                                                   evaluation, dat)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                depths = datasets_raref_depths[dat][1]
                if eval_rarefs:
                    depths = datasets_raref_evals[dat]
                tsv_pd, meta_pd = datasets_read[dat][0]
                tsv_sums = tsv_pd.sum()
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    for depth_ in depths:
                        depth = get_digit_depth(depth_, tsv_sums)
                        dat_raref = '%s_raref%s%s' % (dat, evaluation,
                                                      str(depth))
                        meta_out = '%s/meta_%s.tsv' % (odir, dat_raref)
                        remaining_samples = tsv_sums[
                            tsv_sums >= depth].index.tolist()
                        meta_raref_pd = meta_pd.loc[
                            meta_pd.sample_name.isin(remaining_samples), :]
                        meta_raref_pd.to_csv(meta_out, index=False,
                                             sep='\t')
                        qza = tsv.replace('.tsv', '.qza')
                        qza_out = '%s/tab_%s.qza' % (odir, dat_raref)
                        tsv_out = '%s.tsv' % splitext(qza_out)[0]
                        if force or not os.path.isfile(tsv_out):
                            cmd = write_rarefy(qza, qza_out, depth)
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            cmd = run_export(qza_out, tsv_out,
                                             'FeatureTable[Frequency]')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            main_written += 1
                            written += 1
                        if eval_rarefs:
                            eval_depths.setdefault(dat, []).append(
                                '%s_%s' % (dat, str(depth)))
                            datasets_update['%s_%s' % (dat, str(depth))] = \
                                [[tsv_out, meta_out]]
                            datasets_read_update[
                                '%s_%s' % (dat, str(depth))] = (
                                    'raref', str(depth))
                            datasets_phylo_update[
                                '%s_%s' % (dat, str(depth))] = \
                                datasets_phylo[dat]
                        else:
                            datasets_append.setdefault(dat, []).append(
                                [tsv_out, meta_out])
                            if isfile(tsv_out) and isfile(meta_out):
                                tab_filt_pd = pd.read_csv(
                                    tsv_out, index_col=0, header=0,
                                    sep='\t')
                                with open(meta_out) as f:
                                    for line in f:
                                        break
                                meta_filt_pd = pd.read_csv(
                                    meta_out, header=0, sep='\t',
                                    dtype={line.split('\t')[0]: str},
                                    low_memory=False)
                                datasets_read[dat].append(
                                    [tab_filt_pd, meta_filt_pd])
                            else:
                                datasets_read[dat].append(
                                    ('raref', str(depth)))
                        datasets_rarefs.setdefault(dat, []).append(
                            '_raref%s%s' % (evaluation, str(depth)))
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(
                    out_sh, out_pbs,
                    '%s.bt%s.%s%s' % (prjct_nm, evaluation, dat,
                                      filt_raref),
                    qiime_env, run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o,
                    noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk,
                      'rarefy%s' % evaluation, prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Get rarefied datasets', 'sh', run_pbs, jobs)
    if eval_rarefs:
        datasets.update(datasets_update)
        datasets_read.update(datasets_read_update)
        datasets_phylo.update(datasets_phylo_update)
    else:
        for dat, fps in datasets_append.items():
            datasets[dat].extend(fps)
    return eval_depths

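
# Sketch of how a requested depth is resolved against the per-sample sums.
# `get_digit_depth` (defined elsewhere) is assumed to behave roughly like
# this: integer depths pass through, and fractional depths are taken as a
# quantile of the sample sums. The exact rule used by the helper may
# differ.
def _example_digit_depth(depth_, tsv_sums):
    import numpy as np
    if str(depth_).isdigit():
        return int(depth_)
    # e.g. depth_='0.1' -> the 10th percentile of per-sample read counts
    return int(np.floor(np.quantile(tsv_sums, float(depth_))))
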
def run_mantel(i_datasets_folder: str, datasets_filt: dict, p_mantel: str,
               betas: dict, force: bool, prjct_nm: str, qiime_env: str,
               chmod: str, noloc: bool, slurm: bool, split: bool,
               run_params: dict, filt_raref: str, filt_only: bool,
               eval_depths: dict, jobs: bool, chunkit: int) -> None:
    """
    Run Mantel tests between pairs of beta-diversity distance matrices.
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        mantel_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths,
                                   key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                mantel_pairs['%s_%s' % (n0, n1)] = [x, y]
        mantel_subsets = {'ALL': [[]]}
    else:
        mantel_pairs, mantel_subsets = get_procrustes_mantel_dicts(
            p_mantel)
    get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in mantel_pairs.items():
        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt,
                                   filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt,
                                   filt_only)
        if check_dat_exists(betas, dat1, missing_dats) or \
                check_dat_exists(betas, dat2, missing_dats):
            continue
        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]
        job_folder2 = get_job_folder(
            i_datasets_folder,
            'mantel%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_mantel_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)
        for metric, groups_metas_qzas_dms_trees1 in \
                metrics_groups_metas_qzas_dms_trees1.items():
            if split:
                out_sh = '%s/run_mantel_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = \
                metrics_groups_metas_qzas_dms_trees2[metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(
                    *[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_
                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]
                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(
                            dat1, dm1, meta1, raref1, metric,
                            i_datasets_folder, skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(
                            dat2, dm2, meta2, raref2, metric,
                            i_datasets_folder, skip)
                if skip:
                    print('[Mantels] One desired rarefaction depth not '
                          'run (pair %s)' % pair)
                    continue
                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(set(meta_pd1.sample_name) &
                                   set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue
                meta_pd = meta_pd1.loc[
                    meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(mantel_subsets), 'mantel')
                odir = get_analysis_folder(
                    i_datasets_folder,
                    'mantel%s/%s%s/%s_vs_%s' % (evaluation, pair,
                                                filt_raref, group1,
                                                group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder,
                    'mantel%s/chunks/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals,
                                         case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_mantel%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)
                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_,
                                                           cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_,
                                                           cur)
                        mantel_out = '%s/mantel%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel(
                            'mantel', odir, dm1, dm2, meta_pd, dm_out1,
                            dm_out2, mantel_out, cur_sh, cur, case_var,
                            case_vals, force)
    job_folder = get_job_folder(i_datasets_folder,
                                'mantel%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_mantel_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.mntl%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_mantel and p_mantel != 1:
            if p_mantel.startswith('/panfs'):
                p_mantel = p_mantel.replace(os.getcwd(), '')
            print('# Mantels (pairs and samples subsets config in %s)'
                  % p_mantel)
        else:
            print('# Mantels')
        print_message('', 'sh', main_sh, jobs)

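
# Standalone restatement of the common-samples guard used above: a Mantel
# comparison only proceeds when the two metadata tables share at least 3
# samples. Expects two pandas DataFrames with a 'sample_name' column.
def _example_common_samples(meta_pd1, meta_pd2):
    common = set(meta_pd1.sample_name) & set(meta_pd2.sample_name)
    if len(common) < 3:
        return None  # too few overlapping samples for a Mantel test
    return meta_pd1.loc[meta_pd1.sample_name.isin(common)]
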
def run_procrustes(i_datasets_folder: str, datasets_filt: dict,
                   p_procrustes: str, betas: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str,
                   filt_only: bool, eval_depths: dict, jobs: bool,
                   chunkit: int) -> None:
    """Run Procrustes analyses for each pair of beta diversity distance matrices."""
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        procrustes_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                procrustes_pairs['%s_%s' % (n0, n1)] = [x, y]
        procrustes_subsets = {'ALL': [[]]}
    else:
        procrustes_pairs, procrustes_subsets = get_procrustes_mantel_dicts(
            p_procrustes)
    get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    dms_tab = []
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in procrustes_pairs.items():
        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)
        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue
        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]
        job_folder2 = get_job_folder(
            i_datasets_folder,
            'procrustes%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_procrustes_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)
        for metric, groups_metas_qzas_dms_trees1 in (
                metrics_groups_metas_qzas_dms_trees1.items()):
            if split:
                out_sh = '%s/run_procrustes_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = (
                metrics_groups_metas_qzas_dms_trees2[metric])
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_
                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]
                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(
                            dat1, dm1, meta1, raref1, metric,
                            i_datasets_folder, skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(
                            dat2, dm2, meta2, raref2, metric,
                            i_datasets_folder, skip)
                if skip:
                    print('[Procrustes] One desired rarefaction depth not '
                          'run (pair %s)' % pair)
                    continue
                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue
                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(procrustes_subsets), 'procrustes')
                odir = get_analysis_folder(
                    i_datasets_folder,
                    'procrustes%s/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder,
                    'procrustes%s/chunks/%s%s/%s_vs_%s' % (
                        evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_procrustes%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh), []).append(cur_sh)
                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        dm_out1_tsv = '%s.tsv' % splitext(dm_out1)[0]
                        dm_out2_tsv = '%s.tsv' % splitext(dm_out2)[0]
                        biplot = '%s/procrustes%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel(
                            'procrustes', odir, dm1, dm2, meta_pd, dm_out1,
                            dm_out2, biplot, cur_sh, cur, case_var,
                            case_vals, force)
                        dms_tab.append([
                            pair, dat1_, dat2_, group1, group2, case_,
                            metric, dm_out1_tsv, dm_out2_tsv])
    job_folder = get_job_folder(i_datasets_folder,
                                'procrustes%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_procrustes_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.prcst%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_procrustes and p_procrustes != 1:
            if p_procrustes.startswith('/panfs'):
                p_procrustes = p_procrustes.replace(os.getcwd(), '')
            print('# Procrustes (pairs and samples subsets config in %s)'
                  % p_procrustes)
        else:
            print('# Procrustes')
        print_message('', 'sh', main_sh, jobs)
    # columns are listed in the order in which the rows are appended above
    dms_tab_pd = pd.DataFrame(dms_tab, columns=[
        'pair', 'dat1', 'dat2', 'group1', 'group2',
        'case', 'metric', 'dm_out1', 'dm_out2'])
    odir = get_analysis_folder(i_datasets_folder,
                               'procrustes%s/R' % evaluation)
    out_Rs = glob.glob('%s/pairs_procrustes_results%s%s*.tsv' % (
        odir, evaluation, filt_raref))
    if len(out_Rs):
        done_R = pd.concat([pd.read_table(x, sep=' ') for x in out_Rs])
        dms_tab_pd = dms_tab_pd.loc[
            ~dms_tab_pd[['dm_out1', 'dm_out2']].sum(1).isin(
                done_R[['f1', 'f2']].sum(1))]
    if dms_tab_pd.shape[0]:
        fp_num = 0
        if len(out_Rs):
            last = sorted(
                out_Rs,
                key=lambda fp: int(fp.split('.tsv')[0].split('_')[-1]))
            fp_num = int(last[-1].split('.tsv')[0].split('_')[-1]) + 1
        dms_tab_fp = '%s/pairs%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)
        dms_tab_pd.to_csv(dms_tab_fp, index=False, sep='\t')
        out_R = '%s/pairs_procrustes_results%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)
        job_folder = get_job_folder(i_datasets_folder, 'procrustes/R')
        R_script = '%s/4_run_procrustes_%s%s.R' % (
            job_folder, prjct_nm, filt_raref)
        with open(R_script, 'w') as o:
            o.write("library(vegan)\n")
            o.write("dms_files <- read.table('%s', h=T)\n" % dms_tab_fp)
            o.write("cols <- c('pair', 'd1', 'd2', 'g1', 'g2', 'case', "
                    "'metric', 'f1', 'f2', 'samples', 'M2', 'p-value')\n")
            o.write("res <- setNames(data.frame(matrix(ncol = 12, "
                    "nrow = 0)), cols)\n")
            o.write("for (i in seq(1, dim(dms_files)[1])) {\n")
            o.write(" row <- as.vector(unlist(dms_files[i,]))\n")
            o.write(" pair <- row[1]\n")
            o.write(" d1 <- row[2]\n")
            o.write(" d2 <- row[3]\n")
            o.write(" group1 <- row[4]\n")
            o.write(" group2 <- row[5]\n")
            o.write(" case <- row[6]\n")
            o.write(" metric <- row[7]\n")
            o.write(" f1 <- row[8]\n")
            o.write(" f2 <- row[9]\n")
            o.write(" if (sum(file.exists(f1, f2)) == 2) {\n")
            o.write(" filin_tsv_pd1 <- read.csv(f1, header = TRUE, "
                    "check.names=FALSE,\n")
            o.write(" row.names = 1, colClasses = 'character', "
                    "sep = '\\t')\n")
            o.write(" filin_tsv_pd2 <- read.csv(f2, header = TRUE, "
                    "check.names=FALSE,\n")
            o.write(" row.names = 1, colClasses = 'character', "
                    "sep = '\\t')\n")
            o.write(" filin_tsv_pd1 <- data.matrix(filin_tsv_pd1)\n")
            o.write(" filin_tsv_pd2 <- data.matrix(filin_tsv_pd2)\n")
            o.write(" filin_tsv_pd1 <- filin_tsv_pd1["
                    "rownames(filin_tsv_pd2), rownames(filin_tsv_pd2)]\n")
            o.write(" # procrustes12 <- procrustes(filin_tsv_pd1, "
                    "filin_tsv_pd2, kind=2, permutations=999)\n")
            o.write(" prtst <- protest(filin_tsv_pd1, filin_tsv_pd2, "
                    "permutations = 999)\n")
            o.write(" n <- dim(filin_tsv_pd1)[1]\n")
            o.write(" res[i,] <- c(pair, d1, d2, group1, group2, case, "
                    "metric, f1, f2, n, prtst$ss, prtst$signif)\n")
            o.write(" }\n")
            o.write("}\n")
            o.write("write.table(x = res, file = '%s')\n" % out_R)
        out_sh = '%s/4_run_procrustes_%s%s_R%s.sh' % (
            job_folder, prjct_nm, evaluation, filt_raref)
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
        with open(out_sh, 'w') as o:
            o.write('R -f %s --vanilla\n' % R_script)
        run_xpbs(
            out_sh, out_pbs,
            '%s.prcrt%s.R%s' % (prjct_nm, evaluation, filt_raref), 'renv',
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"], chmod, 1,
            '# Procrustes for stats in R (pairs and samples subsets '
            'config in %s)' % p_procrustes, None, False, jobs)
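# The R step above relies on vegan::protest(). As a minimal, self-contained
# sketch of the same quantity (not part of the pipeline), scipy's procrustes()
# returns the M^2 disparity that the R script stores in the 'M2' column. The
# matrices below are made-up toy coordinates.
def _example_procrustes_disparity():
    import numpy as np
    from scipy.spatial import procrustes
    a = np.array([[0., 0.], [1., 0.], [0., 1.], [1., 1.]])
    b = np.array([[0.1, 0.], [1., 0.2], [0., 0.9], [1.1, 1.]])
    mtx1, mtx2, m2 = procrustes(a, b)  # m2 is analogous to prtst$ss in vegan
    return m2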
def run_barplot(i_datasets_folder: str, datasets: dict, taxonomies: dict,
                force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                noloc: bool, slurm: bool, run_params: dict, filt_raref: str,
                jobs: bool, chunkit: int) -> None:
    """Visualize taxonomy with an interactive bar plot.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    taxonomies : dict
        Mapping dataset name -> [classification method, tax qza path, tax tsv path]
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Short nick name for your project
    qiime_env : str
        Name of a qiime2 conda environment
    chmod : str
        Whether to change permission of output files (default: 744)
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------
    None
    """
    job_folder = get_job_folder(i_datasets_folder, 'barplot')
    job_folder2 = get_job_folder(i_datasets_folder, 'barplot/chunks')
    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_barplot_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            out_sh = '%s/run_barplot_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    if dat not in taxonomies:
                        continue
                    method, tax_qza, tax_tsv = taxonomies[dat]
                    if not method:
                        method = 'taxofromfile'
                    qza = '%s.qza' % splitext(tsv)[0]
                    odir = get_analysis_folder(i_datasets_folder,
                                               'barplot/%s' % dat)
                    out_qzv = '%s/bar_%s_%s.qzv' % (odir, dat, method)
                    if force or not isfile(out_qzv):
                        write_barplots(out_qzv, qza, meta, tax_qza, cur_sh)
                        written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(
                    out_sh, out_pbs,
                    '%s.brplt.%s%s' % (prjct_nm, dat, filt_raref), qiime_env,
                    run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o,
                    noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(
            run_pbs, job_folder2, to_chunk, 'barplot', prjct_nm,
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"],
            qiime_env, chmod, noloc, slurm, jobs, chunkit, None)
    if written:
        print_message('# Make sample compositions barplots',
                      'sh', run_pbs, jobs)
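# Hedged sketch: write_barplots() is defined elsewhere in this package;
# assuming it wraps the QIIME 2 CLI action `qiime taxa barplot`, the command
# written to the chunk script would look roughly like the string below (all
# paths are hypothetical placeholders).
def _example_barplot_cmd(qza='tab_dat.qza', tax_qza='tax_dat.qza',
                         meta='meta_dat.tsv', out_qzv='bar_dat_method.qzv'):
    return ('qiime taxa barplot'
            ' --i-table %s'
            ' --i-taxonomy %s'
            ' --m-metadata-file %s'
            ' --o-visualization %s') % (qza, tax_qza, meta, out_qzv)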
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str,
                   jobs: bool, chunkit: int) -> (dict, list, dict):
    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')
    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print('Must provide the path to the Nestedness soft '
              '(containing bin/Autocorrelation.jar)')
        return {}, [], {}
    if (nestedness_config['soft'].endswith('Autocorrelation.jar')
            and isfile(nestedness_config['soft'])):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print('Must provide the path to the Nestedness soft '
                  '(containing bin/Autocorrelation.jar)')
            return {}, [], {}
    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)
    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)
        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in (
                    metrics_groups_metas_qzas_dms_trees.items()):
                for group, metas_qzas_mat_qzas_trees in (
                        groups_metas_qzas_dms_trees.items()):
                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            all_sh_pbs.setdefault(
                                (dat, out_sh), []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd,
                                nodfs, nulls, modes, cur_sh, qza, case,
                                case_var, case_vals, binary, params, force)
                            nodfs_fps.setdefault(
                                stats_tax_dat, []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                # only the first metric's groups are needed here: the
                # nestedness input is the feature table (qza), not the
                # per-metric distance matrix
                break
            nestedness_res[dat].append(nestedness_raref)
    job_folder = get_job_folder(i_datasets_folder, 'nestedness')
    main_sh = write_main_sh(
        job_folder, '3_run_nestedness_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.prm%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_nestedness_groups:
            print("# nestedness (config in %s)" % p_nestedness_groups)
        else:
            print("# nestedness")
        print_message('', 'sh', main_sh, jobs)
    return nestedness_res, colors, nodfs_fps
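# Tiny usage sketch of the 'soft' resolution logic above: the YAML value may
# point either directly at Autocorrelation.jar or at its installation folder
# (the '/opt/nestedness' path is a hypothetical example).
def _example_resolve_nestedness_binary(soft='/opt/nestedness'):
    from os.path import isfile
    if soft.endswith('Autocorrelation.jar') and isfile(soft):
        return soft
    return '%s/bin/Autocorrelation.jar' % soft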
                    cur_sh = '%s/run_songbird_%s_%s_%s_%s_%s_%s.sh' % (
                        job_folder2, dat_pair, filt, case, modx, mdx, idx)
                    cur_sh = cur_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    diffs, tensor_html = run_single_songbird(
                        odir, odir_base, qza, new_qza, new_meta, cur_sh,
                        force, batch, learn, epoch, diff_prior, thresh_feat,
                        thresh_sample, formula, train_column,
                        summary_interval, metadatas, baselines,
                        model_baseline, baseline_formula)
                    songbird_outputs.append([
                        dat, filt,
                        '%s_%s' % (params.replace('/', '__'), model),
                        case, diffs, model_baseline, tensor_html, pair])
    job_folder = get_job_folder(i_datasets_folder, 'songbird')
    main_sh = write_main_sh(
        job_folder, '2_songbird_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.sngbrd%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, jobs, chunkit)
    if main_sh:
        if p_diff_models.startswith('/panfs'):
            p_diff_models = p_diff_models.replace(os.getcwd(), '')
        print_message("# Songbird (configs in %s)" % p_diff_models,
                      'sh', main_sh, jobs)
    return songbird_outputs
def run_qemistree(i_datasets_folder: str, datasets: dict, prjct_nm: str,
                  i_qemistree: str, taxonomies: dict, force: bool,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> None:
    """
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv table, meta table]
    :param prjct_nm: Short nick name for your project.
    :param i_qemistree: path to qemistree folder (feature-data and tree).
    :param taxonomies: dataset -> [method, tax qza, tax tsv]
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'qemistree')
    job_folder2 = get_job_folder(i_datasets_folder, 'qemistree/chunks')
    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_qemistree_%s%s.sh' % (job_folder, prjct_nm,
                                              filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds in datasets.items():
            feature_data = '%s/feature-data_%s.qza' % (i_qemistree, dat)
            qemistree = '%s/qemistree_%s.qza' % (i_qemistree, dat)
            if not isfile(feature_data) or not isfile(qemistree):
                continue
            out_sh = '%s/run_qemistree_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            odir = get_analysis_folder(i_datasets_folder,
                                       'qemistree/%s' % dat)
            classyfire_qza = '%s/%s-classyfire.qza' % (odir, dat)
            classyfire_tsv = '%s.tsv' % splitext(classyfire_qza)[0]
            with open(out_sh, 'w') as cur_sh:
                if force or not isfile(classyfire_tsv):
                    write_qemistree(feature_data, classyfire_qza,
                                    classyfire_tsv, qemistree, cur_sh)
                    written += 1
            if isfile(classyfire_tsv):
                odir = get_analysis_folder(i_datasets_folder,
                                           'taxonomy/%s' % dat)
                out_rad = '%s/tax_%s' % (odir, dat)
                tax_qza = '%s.qza' % out_rad
                tax_tsv = '%s.tsv' % out_rad
                classyfire_pd = pd.read_csv(classyfire_tsv, header=0,
                                            sep='\t')
                # use a dedicated handle: re-using `o` here would shadow
                # (and then close) the main script's file handle
                with open(tax_tsv, 'w') as o_tax:
                    cols = ['id', 'kingdom', 'superclass', 'class',
                            'subclass', 'direct_parent']
                    o_tax.write('Feature ID\tTaxon\n')
                    for row in classyfire_pd[cols].values:
                        o_tax.write('%s\t%s\n' % (row[0],
                                                  '; '.join(row[1:])))
                run_export(tax_tsv, tax_qza, 'FeatureData[Taxonomy]')
                taxonomies[dat] = ['direct_parent', tax_qza, tax_tsv]
                written += 1
            else:
                print('[Warning] Maybe run qemistree first and then re-run '
                      'the pipeline to have the classyfire taxonomy '
                      'included in the barplots!')
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(
                    out_sh, out_pbs,
                    '%s.qmstr.%s%s' % (prjct_nm, dat, filt_raref), qiime_env,
                    run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o,
                    noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(
            run_pbs, job_folder2, to_chunk, 'qemistree', prjct_nm,
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"],
            qiime_env, chmod, noloc, slurm, jobs, chunkit, None)
    if written:
        print_message('# Make qemistree classyfire classifications',
                      'sh', run_pbs, jobs)
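# Self-contained sketch of the classyfire -> 'Feature ID\tTaxon' reshaping
# done above, run on a two-row stand-in for the exported classyfire table
# (the values are made up for illustration).
def _example_classyfire_to_taxonomy():
    import pandas as pd
    classyfire_pd = pd.DataFrame({
        'id': ['feat1', 'feat2'],
        'kingdom': ['Organic compounds', 'Organic compounds'],
        'superclass': ['Lipids and lipid-like molecules', 'Benzenoids'],
        'class': ['Fatty Acyls', 'Benzene and substituted derivatives'],
        'subclass': ['Fatty acids and conjugates', 'Toluenes'],
        'direct_parent': ['Long-chain fatty acids', 'Toluenes']})
    cols = ['id', 'kingdom', 'superclass', 'class', 'subclass',
            'direct_parent']
    lines = ['Feature ID\tTaxon']
    for row in classyfire_pd[cols].values:
        lines.append('%s\t%s' % (row[0], '; '.join(row[1:])))
    return '\n'.join(lines)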
def run_sepp(i_datasets_folder: str, datasets: dict, datasets_read: dict,
             datasets_phylo: dict, datasets_rarefs: dict, prjct_nm: str,
             i_sepp_tree: str, trees: dict, force: bool, qiime_env: str,
             chmod: str, noloc: bool, slurm: bool, run_params: dict,
             filt_raref: str, jobs: bool) -> None:
    """
    Run SEPP on the datasets composed of 16S deblur sequences
    (e.g. from redbiom/Qiita).

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param prjct_nm: Short nick name for your project.
    :param i_sepp_tree: database to use for sepp phylogeny reads placement.
    :param trees: to be updated with the tree to use for a dataset's phylogenetic analyses.
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    """
    # check whether there's dataset(s) that may use the reference tree
    # (i.e. features are DNA sequences)
    sepp_datasets = [dat for dat, (tree, correction)
                     in datasets_phylo.items() if tree == 'amplicon']
    if len(sepp_datasets):
        ref_tree_qza = get_sepp_tree(i_sepp_tree)
        job_folder = get_job_folder(i_datasets_folder, 'phylo')
        job_folder2 = get_job_folder(i_datasets_folder, 'phylo/chunks')
        main_written = 0
        main_sh = '%s/1_run_sepp_%s%s.sh' % (job_folder, prjct_nm,
                                             filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, tsv_metas_fps_ in datasets.items():
                written = 0
                if dat not in sepp_datasets:
                    continue
                out_sh = '%s/run_sepp_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref)
                if slurm:
                    out_pbs = '%s.slm' % splitext(out_sh)[0]
                else:
                    out_pbs = '%s.pbs' % splitext(out_sh)[0]
                with open(out_sh, 'w') as cur_sh:
                    for idx, tsv_metas_fps in enumerate(tsv_metas_fps_):
                        tsv, meta = tsv_metas_fps
                        if (not isinstance(datasets_read[dat][idx][0],
                                           pd.DataFrame)
                                and datasets_read[dat][idx][0] == 'raref'):
                            qza_raw_in = '%s/data/tab_%s_inTree.qza' % (
                                i_datasets_folder, dat)
                            if isfile(qza_raw_in) and not force:
                                odir_sepp = get_analysis_folder(
                                    i_datasets_folder, 'phylo/%s' % dat)
                                out_fp_sepp_tree = '%s/tree_%s.qza' % (
                                    odir_sepp, dat)
                                # if idx:
                                #     trees[dat].append(
                                #         (qza_raw_in, out_fp_sepp_tree))
                                # else:
                                #     trees[dat] = [
                                #         (qza_raw_in, out_fp_sepp_tree)]
                                if not idx:
                                    trees[dat] = (qza_raw_in,
                                                  out_fp_sepp_tree)
                                print('Using the non rarefied tree '
                                      '(no need to recompute)...')
                                continue
                            elif not isfile(tsv):
                                print('Must have run rarefaction to use it '
                                      'further...\nExiting')
                                sys.exit(0)
                            tsv_pd, meta_pd = get_raref_tab_meta_pds(
                                meta, tsv)
                            datasets_read[dat][idx] = [tsv_pd, meta_pd]
                        else:
                            tsv_pd, meta_pd = datasets_read[dat][idx]
                        qza = '%s.qza' % splitext(tsv)[0]
                        if not isfile(qza):
                            print('Need to first import %s to .qza to do '
                                  'reads placement (see "# Import tables '
                                  'to qiime2")\nExiting...' % tsv)
                            sys.exit(0)
                        cur_raref = datasets_rarefs[dat][idx]
                        qza_in = '%s_inTree%s.qza' % (
                            splitext(tsv)[0], cur_raref)
                        qza_in_tsv = '%s.tsv' % splitext(qza_in)[0]
                        qza_out = '%s_notInTree%s.qza' % (
                            splitext(tsv)[0], cur_raref)
                        odir_seqs = get_analysis_folder(
                            i_datasets_folder, 'seqs/%s' % dat)
                        odir_sepp = get_analysis_folder(
                            i_datasets_folder, 'phylo/%s' % dat)
                        out_fp_seqs_rad = '%s/seq_%s%s' % (
                            odir_seqs, dat, cur_raref)
                        out_fp_seqs_fasta = '%s.fasta' % out_fp_seqs_rad
                        out_fp_seqs_qza = '%s.qza' % out_fp_seqs_rad
                        out_fp_sepp_tree = '%s/tree_%s%s.qza' % (
                            odir_sepp, dat, cur_raref)
                        # if idx:
                        #     trees[dat].append((qza_in, out_fp_sepp_tree))
                        # else:
                        #     trees[dat] = [(qza_in, out_fp_sepp_tree)]
                        if not idx:
                            trees[dat] = (qza_in, out_fp_sepp_tree)
                        written = 0
                        if force or not isfile(out_fp_seqs_qza):
                            cmd = write_seqs_fasta(
                                out_fp_seqs_fasta, out_fp_seqs_qza, tsv_pd)
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            written += 1
                            main_written += 1
                        if (force or not isfile(out_fp_sepp_tree)
                                or not isfile(qza_in_tsv)):
                            cmd = write_fragment_insertion(
                                out_fp_seqs_qza, ref_tree_qza,
                                out_fp_sepp_tree, qza, qza_in, qza_in_tsv,
                                qza_out)
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            written += 1
                            main_written += 1
                run_xpbs(
                    out_sh, out_pbs,
                    '%s.spp.%s%s' % (prjct_nm, dat, filt_raref), qiime_env,
                    run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single',
                    main_o, noloc, slurm, jobs)
        if main_written:
            print_message("# Fragment insertion using SEPP (%s)"
                          % ', '.join(sepp_datasets), 'sh', main_sh, jobs)
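# Hedged sketch: assuming write_fragment_insertion() wraps the
# q2-fragment-insertion plugin, the core placement command would look roughly
# like this (file names are hypothetical placeholders; recent plugin versions
# take the reference database as an artefact, as sketched here).
def _example_sepp_cmd(seqs_qza='seq_dat.qza',
                      ref_db_qza='sepp-refs-gg-13-8.qza',
                      out_tree='tree_dat.qza',
                      out_placements='placements_dat.qza'):
    return ('qiime fragment-insertion sepp'
            ' --i-representative-sequences %s'
            ' --i-reference-database %s'
            ' --o-tree %s'
            ' --o-placements %s') % (seqs_qza, ref_db_qza, out_tree,
                                     out_placements)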
def run_permanova(i_datasets_folder: str, betas: dict,
                  main_testing_groups: tuple, p_perm_tests_min: int,
                  p_beta_type: tuple, datasets_rarefs: dict,
                  p_perm_groups: str, force: bool, prjct_nm: str,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  split: bool, run_params: dict, filt_raref: str,
                  jobs: bool, chunkit: int) -> dict:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the PERMANOVA tests on beta diversity matrices.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param betas: beta diversity matrices.
    :param main_testing_groups: groups to test.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    permanovas = {}
    job_folder2 = get_job_folder(i_datasets_folder, 'permanova/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    npermutations = 999
    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        permanovas[dat] = []
        if not split:
            out_sh = '%s/run_beta_group_significance_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'permanova/%s%s' % (dat, cur_depth))
            for metric, subset_files in (
                    metric_groups_metas_qzas_dms_trees.items()):
                permanovas.setdefault(dat, []).append(metric)
                if split:
                    out_sh = ('%s/run_beta_group_significance'
                              '_%s_%s_%s%s.sh' % (
                                  job_folder2, prjct_nm, dat, metric,
                                  filt_raref))
                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    (meta, qza, mat_qza, tree) = metas_qzas_mat_qzas_trees[0]
                    if not isfile(mat_qza):
                        if not first_print:
                            print('Beta diversity, distances matrices must '
                                  'be generated already to automatise '
                                  'PERMANOVA\n\t(re-run this after steps '
                                  '"2_run_beta.sh" and '
                                  '"2x_run_beta_export.pbs" are done)')
                            first_print += 1
                        continue
                    if (dat, subset) not in metric_check:
                        meta_pd = read_meta_pd(meta)
                        meta_pd = meta_pd.set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(main_cases_dict), 'PERMANOVA')
                        testing_groups = check_metadata_testing_groups(
                            meta, meta_pd, main_testing_groups,
                            p_perm_tests_min, 'PERMANOVA')
                        metric_check.add((dat, subset))
                    for case_var, case_vals_list in cases_dict.items():
                        testing_groups_case_var = list(
                            set(testing_groups + [case_var]))
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            for testing_group in testing_groups_case_var:
                                if testing_group == 'ALL':
                                    continue
                                cur_sh = ('%s/run_beta_group_significance'
                                          '_%s%s_%s_%s_%s_%s%s.sh' % (
                                              job_folder2, dat, cur_depth,
                                              metric, subset, case,
                                              testing_group, filt_raref))
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                run_single_perm(
                                    odir, subset, meta_pd, cur_sh, metric,
                                    case, testing_group, p_perm_tests_min,
                                    p_beta_type, qza, mat_qza, case_var,
                                    case_vals, npermutations, force)
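# Self-contained sketch of the test that run_single_perm() sets up for each
# distance matrix, metadata subset and testing group, here with scikit-bio on
# a toy 4-sample matrix (distances and grouping are made up).
def _example_permanova():
    import numpy as np
    from skbio import DistanceMatrix
    from skbio.stats.distance import permanova
    dm = DistanceMatrix(
        np.array([[0.0, 0.2, 0.6, 0.7],
                  [0.2, 0.0, 0.5, 0.8],
                  [0.6, 0.5, 0.0, 0.3],
                  [0.7, 0.8, 0.3, 0.0]]),
        ids=['s1', 's2', 's3', 's4'])
    return permanova(dm, grouping=['a', 'a', 'b', 'b'], permutations=99)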
def run_deicode(i_datasets_folder: str, datasets: dict,
                datasets_rarefs: dict, p_perm_groups: str, force: bool,
                prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                slurm: bool, run_params: dict, filt_raref: str, jobs: bool,
                chunkit: int) -> None:
    """
    Performs a robust centered log-ratio transform and robust PCA, and ranks
    the features by the loadings of the resulting SVD.
    https://library.qiime2.org/plugins/deicode/19/
    Main per-dataset looper for the DEICODE RPCA on the feature tables.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: list of datasets.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder2 = get_job_folder(i_datasets_folder, 'deicode/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    all_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        out_sh = '%s/run_deicode_%s_%s%s.sh' % (
            job_folder2, prjct_nm, dat, filt_raref)
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            cur_raref = datasets_rarefs[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_alphas = meta.replace('.tsv', '_alphas.tsv')
            meta_alphas_full = meta.replace('.tsv', '_alphas_full.tsv')
            if isfile(meta_alphas_full):
                meta = meta_alphas_full
            elif isfile(meta_alphas):
                meta = meta_alphas
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'DEICODE')
            odir = get_analysis_folder(i_datasets_folder,
                                       'deicode/%s%s' % (dat, cur_raref))
            for case_var, case_vals_list in cases_dict.items():
                cur_sh = '%s/run_beta_deicode_%s_%s%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, case_var,
                    filt_raref)
                cur_sh = cur_sh.replace(' ', '-')
                all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                run_single_deicode(odir, tsv, meta_pd, case_var,
                                   case_vals_list, cur_sh, force)
    job_folder = get_job_folder(i_datasets_folder, 'deicode')
    main_sh = write_main_sh(
        job_folder, '3_run_beta_deicode_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.dcd%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            if p_perm_groups.startswith('/panfs'):
                p_perm_groups = p_perm_groups.replace(os.getcwd(), '')
            print('# DEICODE (groups config in %s)' % p_perm_groups)
        else:
            print('# DEICODE')
        print_message('', 'sh', main_sh, jobs)
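# Minimal numpy sketch of the robust centered log-ratio (rclr) idea behind
# DEICODE: only observed (non-zero) counts are log-transformed and each sample
# is centered on the mean of its observed log values; zeros are left missing
# for the downstream matrix completion (not shown). Toy counts only.
def _example_rclr():
    import numpy as np
    counts = np.array([[10., 0., 30.],
                       [5., 15., 0.]])
    logged = np.full(counts.shape, np.nan)
    mask = counts > 0
    logged[mask] = np.log(counts[mask])
    return logged - np.nanmean(logged, axis=1, keepdims=True)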
def run_sourcetracking(i_datasets_folder: str, datasets: dict,
                       p_sourcetracking_config: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str,
                       chmod: str, noloc: bool, slurm: bool,
                       run_params: dict, filt_raref: str, split: bool,
                       jobs: bool, chunkit: int) -> None:
    job_folder2 = get_job_folder(i_datasets_folder, 'sourcetracking/chunks')
    sourcetracking_dicts = get_sourcetracking_config(p_sourcetracking_config)
    sourcetracking_sourcesink = sourcetracking_dicts[0]
    sourcetracking_filtering = sourcetracking_dicts[1]
    sourcetracking_params = sourcetracking_dicts[2]
    main_cases_dict = sourcetracking_dicts[3]
    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        if dat in sourcetracking_filtering:
            filters = sourcetracking_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'sourcetracking')
            cur_raref = datasets_rarefs[dat][idx]
            out_import_sh = '%s/run_import_sourcetracking_%s_%s%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            imports = set()
            odir = get_analysis_folder(i_datasets_folder,
                                       'sourcetracking/%s' % dat)
            for method in sourcetracking_params['method']:
                out_sh = '%s/run_sourcetracking_%s_%s%s%s_%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                    method)
                for case_var, case_vals_list in cases_dict.items():
                    for filt, (fp, fa) in filters.items():
                        cur_sh = ('%s/run_sourcetracking_%s_%s_%s%s%s'
                                  '_%s_%s.sh' % (
                                      job_folder2, prjct_nm, dat, case_var,
                                      filt_raref, cur_raref, method, filt))
                        cur_sh = cur_sh.replace(' ', '-')
                        cur_import_sh = ('%s/run_import_sourcetracking'
                                         '_%s_%s_%s%s%s_%s_%s.sh' % (
                                             job_folder2, prjct_nm, dat,
                                             case_var, filt_raref,
                                             cur_raref, method, filt))
                        cur_import_sh = cur_import_sh.replace(' ', '-')
                        all_sh_pbs.setdefault(
                            (dat, out_sh), []).append(cur_sh)
                        all_import_sh_pbs.setdefault(
                            (dat, out_import_sh), []).append(cur_import_sh)
                        run_single_sourcetracking(
                            odir, tsv, meta_pd, case_var,
                            sourcetracking_params, method, imports,
                            sourcetracking_sourcesink, case_vals_list,
                            cur_sh, cur_import_sh, force, filt, cur_raref,
                            fp, fa, run_params["n_nodes"],
                            run_params["n_procs"])
    job_folder = get_job_folder(i_datasets_folder, 'sourcetracking')
    main_sh = write_main_sh(
        job_folder,
        '3_run_import_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mpt.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# import sourcetracking (groups config in %s)'
                  % p_sourcetracking_config)
        else:
            print('# import sourcetracking')
        print_message('', 'sh', main_sh, jobs)
    main_sh = write_main_sh(
        job_folder, '3_run_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# sourcetracking (groups config in %s)'
                  % p_sourcetracking_config)
        else:
            print('# sourcetracking')
        print_message('', 'sh', main_sh, jobs)
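# Hedged sketch of what one (case_var, case_vals) pair built by
# check_metadata_cases_dict() amounts to: selecting the metadata rows whose
# value for that variable is among the allowed values (toy metadata below).
def _example_case_subset():
    import pandas as pd
    meta_pd = pd.DataFrame(
        {'sample_name': ['s1', 's2', 's3'],
         'env': ['gut', 'skin', 'gut']}).set_index('sample_name')
    case_var, case_vals = 'env', ['gut']
    return meta_pd.loc[meta_pd[case_var].isin(case_vals)]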
def run_adonis(p_formulas: str, i_datasets_folder: str, betas: dict,
               datasets_rarefs: dict, p_perm_groups: str, force: bool,
               prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
               slurm: bool, split: bool, run_params: dict, filt_raref: str,
               jobs: bool, chunkit: int) -> None:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the ADONIS tests on beta diversity matrices.

    :param p_formulas: formulas to test.
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param betas: beta diversity matrices.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder2 = get_job_folder(i_datasets_folder, 'adonis/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    formulas = get_formulas_dict(p_formulas)
    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        if dat not in formulas:
            continue
        if not split:
            out_sh = '%s/run_adonis_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'adonis/%s%s' % (dat, cur_depth))
            for metric, subset_files in (
                    metric_groups_metas_qzas_dms_trees.items()):
                if split:
                    out_sh = '%s/run_adonis_%s_%s_%s%s.sh' % (
                        job_folder2, prjct_nm, dat, metric, filt_raref)
                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    for meta, qza, mat_qza, tree in metas_qzas_mat_qzas_trees:
                        if not isfile(mat_qza):
                            if not first_print:
                                print('Beta diversity, distances matrices '
                                      'must be generated already to '
                                      'automatise ADONIS\n\t(re-run this '
                                      'after steps "2_run_beta.sh" and '
                                      '"2x_run_beta_export.pbs" are done)')
                                first_print += 1
                            continue
                        if (dat, subset) not in metric_check:
                            meta_pd = read_meta_pd(meta).set_index(
                                'sample_name')
                            cases_dict = check_metadata_cases_dict(
                                meta, meta_pd, dict(main_cases_dict),
                                'ADONIS')
                            formulas[dat] = check_metadata_formulas(
                                meta, meta_pd, formulas[dat], 'ADONIS')
                            metric_check.add((dat, subset))
                        for fdx, form in enumerate(formulas[dat].keys()):
                            formula = formulas[dat][form]
                            for cdx, case_var in enumerate(
                                    cases_dict.keys()):
                                case_vals_list = cases_dict[case_var]
                                cur_sh = ('%s/run_adonis_%s%s_%s_%s'
                                          '_%s%s.sh' % (
                                              job_folder2, dat, cur_depth,
                                              metric, fdx, cdx, filt_raref))
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                run_single_adonis(
                                    odir, subset, case_vals_list, metric,
                                    case_var, form, formula, qza, mat_qza,
                                    meta_pd, cur_sh, force)
    job_folder = get_job_folder(i_datasets_folder, 'adonis')
    main_sh = write_main_sh(
        job_folder, '3_run_adonis_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.dns%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# Run Adonis (groups config in %s)" % p_perm_groups)
        else:
            print("# Run Adonis")
        print_message('', 'sh', main_sh, jobs)
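# Hedged sketch: assuming run_single_adonis() wraps the `qiime diversity
# adonis` action, a per-formula command would look roughly like this (the
# paths and formula are hypothetical placeholders).
def _example_adonis_cmd(mat_qza='dm_braycurtis.qza', meta='meta_dat.tsv',
                        formula='site+depth',
                        out_qzv='adonis_site_depth.qzv'):
    return ('qiime diversity adonis'
            ' --i-distance-matrix %s'
            ' --m-metadata-file %s'
            ' --p-formula "%s"'
            ' --o-visualization %s') % (mat_qza, meta, formula, out_qzv)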
def run_mmvec(p_mmvec_pairs: str, i_datasets_folder: str, datasets: dict,
              datasets_filt: dict, datasets_read: dict,
              train_test_dict: dict, force: bool, gpu: bool,
              standalone: bool, prjct_nm: str, qiime_env: str, chmod: str,
              noloc: bool, split: bool, filt_raref: str, run_params: dict,
              input_to_filtered: dict, jobs: bool, chunkit: int) -> list:
    """Run mmvec: Neural networks for microbe-metabolite interaction analysis.
    https://github.com/biocore/mmvec
    Main two-datasets looper for the mmvec co-occurrences.

    Parameters
    ----------
    p_mmvec_pairs : str
        Pairs of datasets for which to compute co-occurrences probabilities
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    datasets_filt : dict
    datasets_read : dict
        Mapping dataset name -> [tsv table, meta table]
        (here it updates the tsv table after features correction)
    train_test_dict : dict
    force : bool
        Force the re-writing of scripts for all commands
    gpu : bool
        Use GPUs instead of CPUs for MMVEC
    standalone : bool
    prjct_nm : str
        Nick name for your project
    qiime_env : str
        qiime2-xxxx.xx conda environment
    chmod : str
        Whether to change permission of output files (default: 644)
    noloc : bool
    split : bool
    filt_raref : str
    run_params : dict
    input_to_filtered : dict
    jobs : bool
    chunkit : int

    Returns
    -------
    mmvec_outputs : list
    """
    mmvec_dicts = get_mmvec_dicts(p_mmvec_pairs)
    mmvec_pairs = mmvec_dicts[0]
    mmvec_filtering = mmvec_dicts[1]
    mmvec_params = mmvec_dicts[2]
    mmvec_subsets = mmvec_dicts[3]
    unique_datasets = list(
        set([dat for pair_dats in mmvec_pairs.values()
             for dat in pair_dats]))
    unique_filterings = get_unique_mmvec_filtering(mmvec_filtering)
    filt_datasets_done, common_datasets_done = (
        check_filtered_and_common_dataset(
            i_datasets_folder, datasets, datasets_filt, unique_datasets,
            mmvec_pairs, mmvec_filtering, unique_filterings, 'mmvec',
            input_to_filtered, mmvec_subsets))
    already_computed = {}
    job_folder = get_job_folder(i_datasets_folder, 'mmvec')
    filt_datasets, common_datasets = make_filtered_and_common_dataset(
        i_datasets_folder, datasets, datasets_filt, datasets_read,
        unique_datasets, train_test_dict, mmvec_pairs, mmvec_filtering,
        unique_filterings, job_folder, force, prjct_nm, qiime_env, chmod,
        noloc, 'mmvec', filt_raref, filt_datasets_done,
        common_datasets_done, input_to_filtered, already_computed,
        mmvec_subsets, jobs)
    all_sh_pbs = {}
    mmvec_outputs = []
    for pair, pair_data in common_datasets.items():
        job_folder2 = get_job_folder(i_datasets_folder,
                                     'mmvec/chunks/%s' % pair)
        if not split:
            out_sh = '%s/chunks/run_mmvec_%s_%s%s.sh' % (
                job_folder, prjct_nm, pair, filt_raref)
        for (meta_fp, omic1, omic2, filt1, filt2, tsv1, tsv2, qza1, qza2,
                ncommon, case) in pair_data:
            train_columns = mmvec_params['train_column']
            n_examples = mmvec_params['n_examples']
            batches = mmvec_params['batches']
            learns = mmvec_params['learns']
            epochs = mmvec_params['epochs']
            priors = mmvec_params['priors']
            thresh_feats = mmvec_params['thresh_feats']
            latent_dims = mmvec_params['latent_dims']
            if split:
                out_sh = '%s/chunks/run_mmvec_%s_%s_%s_%s_%s_%s_%s%s.sh' % (
                    job_folder, prjct_nm, pair, case, omic1, filt1, omic2,
                    filt2, filt_raref)
            if train_columns != ['None']:
                n_examples = ['']
            for idx, it in enumerate(itertools.product(
                    train_columns, n_examples, batches, learns, epochs,
                    priors, thresh_feats, latent_dims)):
                (train_column, n_example, batch, learn, epoch, prior,
                 thresh_feat, latent_dim) = [str(x) for x in it]
                res_dir = 'b-%s_l-%s_e-%s_p-%s_f-%s_d-%s_t-%s_n-%s_gpu-%s' % (
                    batch, learn, epoch, prior.replace('.', ''),
                    thresh_feat, latent_dim, train_column, n_example,
                    str(gpu)[0])
                # skip a pair if there are not enough samples (for training
                # when a train column is specified)
                if train_columns != ['None']:
                    meta_pd = pd.read_table(
                        meta_fp,
                        usecols=['sample_name', train_column.lower()])
                    ntrain = meta_pd[
                        train_column.lower()].value_counts()['Train']
                    if ncommon < (1.2 * ntrain):
                        print('\t\t--> skipped pair "%s" (too few samples '
                              '[%s samples for %s training samples]):'
                              % (pair, ncommon, ntrain))
                        print('\t\t - %s %s' % (omic1, filt1))
                        print('\t\t - %s %s' % (omic2, filt2))
                        continue
                else:
                    if int(ncommon) < (1.2 * int(n_example)):
                        print('\t\t--> skipped pair "%s" (too few samples '
                              '[%s samples for %s examples]):'
                              % (pair, ncommon, n_example))
                        print('\t\t - %s %s' % (omic1, filt1))
                        print('\t\t - %s %s' % (omic2, filt2))
                        continue
                odir = get_analysis_folder(
                    i_datasets_folder,
                    'mmvec/paired/%s/%s/%s_%s__%s_%s/%s' % (
                        pair, case, omic1, filt1, omic2, filt2, res_dir))
                mmvec_outputs.append([
                    pair, case, omic1, omic2, filt1, filt2, ncommon,
                    meta_fp, tsv1, tsv2, qza1, qza2,
                    'mmvec_out__%s' % res_dir, odir])
                cur_sh = '%s/run_mmvec_%s_%s_%s_%s_%s%s.sh' % (
                    job_folder2, pair, case, filt1, filt2, res_dir,
                    filt_raref)
                all_sh_pbs.setdefault((pair, out_sh), []).append(cur_sh)
                run_single_mmvec(
                    odir, meta_fp, qza1, qza2, res_dir, cur_sh, batch,
                    learn, epoch, prior, thresh_feat, latent_dim,
                    train_column, n_example, gpu, force, standalone,
                    qiime_env)
    main_sh = write_main_sh(
        job_folder, '3_mmvec_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.mmvc%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"],
        qiime_env, chmod, noloc, jobs, chunkit)
    if main_sh:
        if p_mmvec_pairs.startswith('/panfs'):
            p_mmvec_pairs = p_mmvec_pairs.replace(os.getcwd(), '')
        print_message("# MMVEC (datasets pairs in %s)" % p_mmvec_pairs,
                      'sh', main_sh, jobs)
    return mmvec_outputs
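# Self-contained restatement of the sample-size guard above: a pair is kept
# only when the common-sample count reaches 1.2x the training-set size (or
# 1.2x the requested number of examples when no train column is given).
def _example_enough_common_samples(ncommon, ntrain=None, n_example=None):
    if ntrain is not None:
        return ncommon >= 1.2 * ntrain
    return int(ncommon) >= 1.2 * int(n_example)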
def import_datasets(i_datasets_folder: str, datasets: dict,
                    datasets_phylo: dict, force: bool, prjct_nm: str,
                    qiime_env: str, chmod: str, noloc: bool,
                    run_params: dict, filt_raref: str, jobs: bool,
                    slurm: bool, chunkit: int) -> None:
    """Initial imports of the .tsv datasets in to Qiime2 Artefacts

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path]
    datasets_phylo : dict
        Mapping dataset name -> ('tree_to_use', 'corrected_or_not')
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Nick name for the project.
    qiime_env : str
        Name of a qiime2 conda environment where analysis
        tools to be run are installed
    chmod : str
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------
    None
    """
    job_folder = get_job_folder(i_datasets_folder, 'import_tables')
    job_folder2 = get_job_folder(i_datasets_folder, 'import_tables/chunks')
    to_chunk = []
    main_written = 0
    run_pbs = '%s/0_run_import_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            out_sh = '%s/0_run_import_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:  # REMOVE IF FIXED NOT KEPT
                    tsv, meta = tsv_meta_pds
                    qza = '%s.qza' % splitext(tsv)[0]
                    # the two original branches emitted the same import
                    # command; they are merged into a single condition
                    if datasets_phylo[dat][1] or force or not isfile(qza):
                        cmd = run_import(tsv, qza, 'FeatureTable[Frequency]')
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n' % cmd)
                        written += 1
            if written:
                main_written += 1
                to_chunk.append(out_sh)
                if not chunkit:
                    job_name = '%s.mprt.%s%s' % (prjct_nm, dat, filt_raref)
                    run_xpbs(
                        out_sh, out_pbs, job_name, qiime_env,
                        run_params["time"], run_params["n_nodes"],
                        run_params["n_procs"], run_params["mem_num"],
                        run_params["mem_dim"], chmod, written, 'single', o,
                        noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(
            run_pbs, job_folder2, to_chunk, 'imports', prjct_nm,
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"],
            qiime_env, chmod, noloc, slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Import tables to qiime2', 'sh', run_pbs, jobs)
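# Hedged sketch: run_import() is defined elsewhere in this package; assuming
# the usual QIIME 2 route for a TSV feature table (TSV -> BIOM -> artefact),
# the emitted commands would look roughly like these (placeholder paths).
def _example_import_cmds(tsv='tab_dat.tsv', biom='tab_dat.biom',
                         qza='tab_dat.qza'):
    return [
        'biom convert -i %s -o %s --table-type="OTU table" --to-hdf5'
        % (tsv, biom),
        'qiime tools import --input-path %s --output-path %s'
        ' --type "FeatureTable[Frequency]"' % (biom, qza)]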