def mmbird(self, paired_datasets, differentials):
    if not paired_datasets.mmvec_pd.shape[0]:
        print('No mmvec output detected...')
        return None
    self.prep_mmvec(paired_datasets.mmvec_pd)
    if differentials.songbird_pd.shape[0]:
        songbird_pd = self.prep_songbird(differentials.songbird_pd)
        self.merge_mmvec_songbird(songbird_pd)
    self.get_taxo_pds()
    self.get_omics_songbirds_taxa()
    self.get_mmvec_res()
    self.show_mmvec_issues()
    omics_pairs = [tuple(x) for x in self.mmvec_songbird_pd[
        ['omic_subset_filt1', 'omic_subset_filt2']].values.tolist()]
    pc_sb_correlations = self.get_pair_cmds(omics_pairs)
    if len(pc_sb_correlations):
        out_folder = get_analysis_folder(self.config.i_datasets_folder,
                                         'mmbird')
        out_correlations = '%s/pc_vs_songbird_correlations.tsv' % out_folder
        pc_sb_correlations_pd = pd.concat(pc_sb_correlations)
        if pc_sb_correlations_pd.shape[0]:
            pc_sb_correlations_pd.to_csv(out_correlations, index=False,
                                         sep='\t')
            print('\t\t==> Written:', out_correlations)
        else:
            print('\t\t==> No good songbird model to '
                  'make correlations with mmvec PCs...')
    self.register_command('mmbird')
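# 'omics_pairs' above turns two DataFrame columns into a list of
# (omic1, omic2) tuples. A minimal, self-contained sketch of that
# pattern (column names match the code above; the values are
# illustrative only):
import pandas as pd

demo_pd = pd.DataFrame({
    'omic_subset_filt1': ['16S__all__f1', 'MTX__all__f1'],
    'omic_subset_filt2': ['MBX__all__f2', 'MBX__all__f2']})
pairs = [tuple(x) for x in demo_pd[
    ['omic_subset_filt1', 'omic_subset_filt2']].values.tolist()]
# pairs == [('16S__all__f1', 'MBX__all__f2'),
#           ('MTX__all__f1', 'MBX__all__f2')]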
def get_precomputed_taxonomies(i_datasets_folder: str, datasets: dict,
                               datasets_filt_map: dict, taxonomies: dict,
                               method: str) -> None:
    """Update the taxonomies dict with existing precomputed taxonomy files.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    datasets_filt_map : dict
        Mapping filtered dataset name -> raw dataset name
    taxonomies : dict
        Mapping dataset name -> [classification_method, tax_qza, tax_tsv]
    method : str
        Taxonomic classification method (e.g. "sklearn")
    """
    for dat in datasets:
        dat_raw = get_raw_of_filtered_dataset(dat, datasets_filt_map)
        analysis_folder = get_analysis_folder(i_datasets_folder,
                                              'taxonomy/%s' % dat_raw)
        tax_qza = '%s/tax_%s_%s.qza' % (analysis_folder, dat_raw, method)
        tax_tsv = '%s.tsv' % splitext(tax_qza)[0]
        if isfile(tax_tsv):
            taxonomies[dat] = ['', tax_qza, tax_tsv]
        # a method-less taxonomy file takes precedence if also present
        tax_qza = '%s/tax_%s.qza' % (analysis_folder, dat_raw)
        tax_tsv = '%s.tsv' % splitext(tax_qza)[0]
        if isfile(tax_tsv):
            taxonomies[dat] = ['', tax_qza, tax_tsv]
def get_datasets_paths(self):
    paths = []
    if self.songbirds.shape[0]:
        for r, row in self.songbirds.iterrows():
            dataset = row['dataset']
            filter = row['filter']
            subset = row['subset']
            for analysis in ['mmvec', 'songbird']:
                odir = get_analysis_folder(
                    self.config.i_datasets_folder,
                    '%s/datasets/%s/%s' % (analysis, dataset, subset))
                rad = '%s_%s' % (dataset, filter)
                tsv = '%s/tab_%s.tsv' % (odir, rad)
                qza = '%s.qza' % splitext(tsv)[0]
                meta = '%s/meta_%s.tsv' % (odir, rad)
                if isfile(tsv) and isfile(qza) and isfile(meta):
                    paths.append([tsv, qza, meta])
                    break
                elif analysis == 'songbird':
                    paths.append([tsv, qza, meta])
    if paths:
        self.songbirds = pd.concat([
            self.songbirds,
            pd.DataFrame(paths, columns=['tsv', 'qza', 'meta'])
        ], axis=1)
def create_songbird_feature_metadata(i_datasets_folder: str,
                                     taxonomies: dict,
                                     q2_pd: pd.DataFrame):
    q2_pd = q2_pd.loc[(q2_pd.pair == 'no_pair') &
                      (q2_pd.Pseudo_Q_squared > 0)]
    for dat in taxonomies.keys():
        dat_q2_pd = q2_pd.loc[q2_pd.dat.str.contains(dat)]
        dat_sbs = []
        # 'dat_' avoids shadowing the taxonomies key 'dat',
        # which is reused below for the output paths
        for (pair, dat_, dataset_filter, subset, model, songbird_filter,
             parameters, baseline, differentials,
             Pseudo_Q_squared) in dat_q2_pd.values:
            # skip the first row (not a feature)
            sb_pd = pd.read_table(differentials, index_col=0).iloc[1:]
            sb_pd.columns = [
                '%s__%s__%s__%s__%s__%s__%s (Q2=%s): %s' % (
                    dat_, dataset_filter, subset, model, songbird_filter,
                    parameters, baseline, Pseudo_Q_squared, x)
                for x in sb_pd.columns]
            dat_sbs.append(sb_pd)
        if len(dat_sbs):
            dat_sbs_pd = pd.concat(dat_sbs, axis=1, sort=False)
            odir = get_analysis_folder(i_datasets_folder,
                                       'songbird/%s' % dat)
            fpo_tsv = '%s/sb_%s.tsv' % (odir, dat)
            fpo_qza = '%s/sb_%s.qza' % (odir, dat)
            dat_sbs_pd = dat_sbs_pd.reset_index()
            dat_sbs_pd = dat_sbs_pd.rename(columns={
                dat_sbs_pd.columns.tolist()[0]: 'Feature ID'})
            dat_sbs_pd.to_csv(fpo_tsv, index=True, sep='\t')
            run_import(fpo_tsv, fpo_qza, 'FeatureData[Differential]')
def summarize_songbirds(self):
    q2s = []
    songbird = get_analysis_folder(self.config.i_datasets_folder,
                                   'songbird')
    for root, dirs, files in os.walk(songbird):
        for fil in files:
            if fil == 'tensorboard.html':
                path = root + '/' + fil
                diff = '%s/differentials.tsv' % dirname(root)
                root_split = root.split('%s/' % songbird)[-1].split('/')
                d, pr, fr, sb, sr, ps, ml, be = root_split
                with open(path) as f:
                    for line in f:
                        if 'Pseudo Q-squared' in line:
                            ls = line.split(
                                'Pseudo Q-squared:</a></strong> ')
                            q2s.append([
                                pr, d, fr, sb, ml, sr, ps, be, diff,
                                float(ls[-1].split('<')[0])])
    if q2s:
        self.q2s_pd = pd.DataFrame(q2s, columns=[
            'pair', 'dataset', 'filter', 'subset', 'model',
            'songbird_filter', 'parameters', 'baseline',
            'differentials', 'Pseudo_Q_squared'])
        q2s_fp = '%s/songbird_q2.tsv' % songbird
        self.q2s_pd.to_csv(q2s_fp, index=False, sep='\t')
        print('\t\t==> Written:', q2s_fp)
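# The Q2 parsing above relies on songbird's tensorboard.html embedding
# the metric as plain HTML. A minimal, self-contained sketch of that
# string surgery (the sample line below is illustrative, not a verbatim
# songbird output):
sample = '<strong><a href="#">Pseudo Q-squared:</a></strong> 0.42</p>'
if 'Pseudo Q-squared' in sample:
    q2 = float(sample.split('Pseudo Q-squared:</a></strong> ')[-1]
               .split('<')[0])
    # q2 == 0.42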
def run_taxonomy_amplicon(dat: str, i_datasets_folder: str, force: bool,
                          tsv_pd: pd.DataFrame, out_qza: str,
                          out_tsv: str, i_classifier: str) -> str:
    """
    :param dat: Current dataset.
    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param force: Force the re-writing of scripts for all commands.
    :param tsv_pd: Current features table for the current dataset.
    :param out_qza: Taxonomy classification output to generate.
    :param out_tsv: Taxonomy classification output exported.
    :param i_classifier: Path to the taxonomic classifier.
    """
    cmd = ''
    if isfile(out_tsv) and not isfile(out_qza):
        cmd += run_import(out_tsv, out_qza, 'FeatureData[Taxonomy]')
    else:
        ref_classifier_qza = get_taxonomy_classifier(i_classifier)
        odir_seqs = get_analysis_folder(i_datasets_folder, 'seqs/%s' % dat)
        out_fp_seqs_rad = '%s/seq_%s' % (odir_seqs, dat)
        out_fp_seqs_fasta = '%s.fasta' % out_fp_seqs_rad
        out_fp_seqs_qza = '%s.qza' % out_fp_seqs_rad
        if force or not isfile(out_fp_seqs_qza):
            cmd += write_seqs_fasta(out_fp_seqs_fasta, out_fp_seqs_qza,
                                    tsv_pd)
        if force or not isfile(out_qza):
            cmd += write_taxonomy_sklearn(out_qza, out_fp_seqs_qza,
                                          ref_classifier_qza)
            cmd += run_export(out_qza, out_tsv, '')
    return cmd
def get_common_paths(self):
    cmds = {}
    paths = []
    pfs = ['pair', 'filter', 'subset']
    for (pair, filter, subset), mmvec in self.mmvecs.groupby(pfs):
        data_dir = get_analysis_folder(
            self.config.i_datasets_folder,
            'mmvec/common/data/%s/%s' % (pair, subset))
        meta_dir = get_analysis_folder(
            self.config.i_datasets_folder,
            'mmvec/common/metadata/%s/%s' % (pair, subset))
        mmvec_d = mmvec.iloc[0, :].to_dict()
        dat1, dat2 = mmvec_d['dataset1'], mmvec_d['dataset2']
        prev1, prev2 = mmvec_d['prevalence1'], mmvec_d['prevalence2']
        abun1, abun2 = mmvec_d['abundance1'], mmvec_d['abundance2']
        qza1, meta1 = self.get_dataset_path(dat1, filter, subset)
        qza2, meta2 = self.get_dataset_path(dat2, filter, subset)
        if not isfile(meta1) or not isfile(meta2):
            continue
        meta1_pd, meta2_pd = read_meta_pd(meta1), read_meta_pd(meta2)
        sams = set(meta1_pd.sample_name) & set(meta2_pd.sample_name)
        if len(sams) < 10:
            print('Not enough samples in pair %s: %s (%s) vs %s (%s)' % (
                pair, mmvec_d['dataset1'], meta1_pd.shape[0],
                mmvec_d['dataset2'], meta2_pd.shape[0]))
            continue
        meta_fp, new_tsv1, new_qza1, new_tsv2, new_qza2 = self.get_new_fps(
            meta_dir, data_dir, qza1, qza2, dat1, prev1, abun1,
            dat2, prev2, abun2, pair, len(sams), cmds)
        meta_subset = get_meta_subset(meta1_pd, meta2_pd, sams)
        meta_subset.to_csv(meta_fp, index=False, sep='\t')
        paths.append([pair, filter, subset, sams, meta_fp,
                      new_tsv1, new_tsv2, new_qza1, new_qza2])
        print('\t\t\t* [TODO]', pair, filter, subset, ':', dat1, 'vs',
              dat2, '(%s samples)' % meta_subset.shape[0])
    if paths:
        common_paths_pd = pd.DataFrame(paths, columns=(pfs + [
            'common_sams', 'meta_fp', 'new_tsv1', 'new_tsv2',
            'new_qza1', 'new_qza2']))
        self.mmvecs = self.mmvecs.merge(
            common_paths_pd, on=['pair', 'filter', 'subset'])
    self.register_command('mmvec_paired_imports', cmds)
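# Pairing two omics tables hinges on the sample intersection computed
# above. A minimal, self-contained sketch of that check (the column
# name 'sample_name' matches the code above; the data is illustrative):
import pandas as pd

meta1_pd = pd.DataFrame({'sample_name': ['s1', 's2', 's3']})
meta2_pd = pd.DataFrame({'sample_name': ['s2', 's3', 's4']})
sams = set(meta1_pd.sample_name) & set(meta2_pd.sample_name)
if len(sams) < 10:
    # too few shared samples for a meaningful mmvec pairing
    print('Not enough samples: %s shared' % len(sams))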
def set_seqs_paths(self, config):
    for dataset, data in self.datasets.items():
        if data.phylo and data.phylo[0] == 'amplicon':
            odir = get_analysis_folder(config.i_datasets_folder,
                                       'seqs/%s' % dataset)
            seqs_fas = '%s/seq_%s.fasta' % (odir, dataset)
            seqs_qza = '%s.qza' % splitext(seqs_fas)[0]
            data.seqs = (seqs_qza, seqs_fas)
def get_main_dirs(pair_dir, filt, subset, params_dir, model,
                  config) -> tuple:
    datdir = '%s/%s/%s/%s/%s' % (pair_dir, filt, subset, params_dir,
                                 model)
    odir = get_analysis_folder(config.i_datasets_folder,
                               'songbird/%s' % datdir)
    new_qza = '%s/tab.qza' % odir
    new_meta = '%s/metadata.tsv' % odir
    return datdir, odir, new_qza, new_meta
def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str,
                       chmod: str, noloc: bool, slurm: bool, split: bool,
                       run_params: dict, filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):
    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)
    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in \
                    metrics_groups_metas_qzas_dms_trees.items():
                for group, metas_qzas_mat_qzas_trees in \
                        groups_metas_qzas_dms_trees.items():
                    for (meta, qza, mat_qza,
                         tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index(
                            'sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in \
                                cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(
                                    case_vals, case_var).replace(' ', '_')
                                cur_sh = ('%s/run_decay_%s%s_%s_%s_%s%s'
                                          '.sh') % (
                                    job_folder2, dat, cur_raref, metric,
                                    group, case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault(
                                    (dat, out_sh), []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh,
                                    mat_qza, case, modes, force,
                                    run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
def get_precomputed_trees(self, config):
    for dataset_, data in self.datasets.items():
        dataset = self._get_filt_raw(dataset_)
        analysis_folder = get_analysis_folder(config.i_datasets_folder,
                                              'phylo/%s' % dataset)
        tree_qza = '%s/tree_%s.qza' % (analysis_folder, dataset)
        tree_nwk = '%s.nwk' % splitext(tree_qza)[0]
        if isfile(tree_nwk) and isfile(tree_qza):
            data.tree = ('', tree_qza, tree_nwk)
            data.phylo = ('precpu', 0)
def mmvec(self) -> None:
    """Main script for the creation of mmvec jobs.
    It iterates over the rows of the table created upfront and over
    each combination of parameters, and collects the output info for
    potential reuse in figure generation and post-analysis.
    """
    cmds = {}
    mess = set()
    mmvec = []
    params_pd = self.get_params_combinations()
    for r, row in self.mmvecs.iterrows():
        self.process_params_combinations(row, params_pd, mess)
        pair, filter, subset = row['pair'], row['filter'], row['subset']
        d1, p1, a1 = row['dataset1'], row['prevalence1'], row['abundance1']
        d2, p2, a2 = row['dataset2'], row['prevalence2'], row['abundance2']
        for p, params in params_pd.iterrows():
            res_dir = self.get_res_dir(params)
            odir = get_analysis_folder(
                self.config.i_datasets_folder,
                'mmvec/paired/%s/%s/%s_%s-%s__%s_%s-%s/%s' % (
                    pair, subset, d1, p1, a1, d2, p2, a2, res_dir))
            mod_dir, mod_rnk, mod_rdn, mod_stt = self.get_out(
                odir, 'model')
            nul_dir, nul_rnk, nul_rdn, nul_stt = self.get_out(
                odir, 'null')
            summary = '%s/paired-summary.qzv' % odir
            mmvec.append([
                pair, filter, subset, d1, d2, p1, a1, p2, a2,
                len(row['common_sams']), row['meta_fp'],
                row['new_tsv1'], row['new_tsv2'],
                row['new_qza1'], row['new_qza2'],
                'mmvec_out__%s' % res_dir, odir])
            if self.config.force or not isfile(summary):
                cmd = write_mmvec_cmd(
                    row['meta_fp'], row['new_qza1'], row['new_qza2'],
                    res_dir, mod_dir, nul_dir, mod_rnk, mod_rdn, mod_stt,
                    nul_rnk, nul_rdn, nul_stt, summary,
                    params['batches'], params['learns'],
                    params['epochs'], params['input_prior'],
                    params['output_prior'], params['thresh_feats'],
                    params['latent_dims'], params['train_column'],
                    params['n_examples'], params['summary_interval'],
                    self.config.gpu, self.config.standalone,
                    self.config.qiime_env)
                cmds.setdefault(row['pair'], []).append(cmd)
    if mmvec:
        self.get_mmvec_pd(mmvec)
    self.register_command('mmvec', cmds)
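# Each mmvec run is keyed by one row of a parameter-grid DataFrame.
# A minimal, self-contained sketch of the iteration pattern above
# (get_params_combinations is assumed to return such a grid; the
# hyperparameter values and naming scheme below are illustrative):
import itertools
import pandas as pd

grid = {'batches': [2], 'learns': [1e-3], 'epochs': [100, 500]}
params_pd = pd.DataFrame(
    list(itertools.product(*grid.values())),
    columns=list(grid.keys()))
for p, params in params_pd.iterrows():
    res_dir = '_'.join('%s-%s' % (k, v) for k, v in params.items())
    # e.g. 'batches-2_learns-0.001_epochs-100'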
def nestedness_nodfs(i_datasets_folder: str, nodfs_fps: dict,
                     collapsed: dict, filt_raref: str, prjct_nm: str,
                     qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                     split: bool, run_params: dict, jobs: bool,
                     chunkit: int) -> None:
    RESOURCES = pkg_resources.resource_filename(
        "routine_qiime2_analyses", "resources")
    nestedness_nodfs_fp = '%s/nestedness_nodfs.py' % RESOURCES
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'nestedness_figures/chunks')
    all_sh_pbs = {}
    for dat, nodfs in nodfs_fps.items():
        out_sh = '%s/run_nestedness_nodfs_%s_%s%s.sh' % (
            job_folder2, prjct_nm, dat, filt_raref)
        out_py = out_sh.replace('.sh', '.py')
        cur_sh = '%s/run_nestedness_nodfs_%s%s_tmp.sh' % (
            job_folder2, dat, filt_raref)
        cur_sh = cur_sh.replace(' ', '-')
        with open(cur_sh, 'w') as o:
            o.write('python3 %s\n' % out_py)
        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
        # values to edit in the template script
        odir = get_analysis_folder(i_datasets_folder,
                                   'nestedness/%s%s' % (dat, filt_raref))
        with open(out_py, 'w') as o, open(nestedness_nodfs_fp) as f:
            for line in f:
                line_edit = line
                if '<DAT>' in line:
                    line_edit = line_edit.replace('<DAT>', dat)
                if '<ODIR>' in line:
                    line_edit = line_edit.replace('<ODIR>', odir)
                if '<NODFS>' in line:
                    line_edit = line_edit.replace("'<NODFS>'", str(nodfs))
                if '<COLLAPSED>' in line:
                    line_edit = line_edit.replace("'<COLLAPSED>'",
                                                  str(collapsed))
                o.write(line_edit)
    job_folder = get_job_folder(i_datasets_folder, 'nestedness_figures')
    main_sh = write_main_sh(
        job_folder, 'run_nestedness_nodfs%s' % filt_raref, all_sh_pbs,
        '%s.nstd.ndf%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit)
    if main_sh:
        print("# NESTEDNESS NODFS")
        print_message('', 'sh', main_sh, jobs)
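# The figure script above is produced by token substitution into a
# template .py file. A minimal, self-contained sketch of that pattern
# (the template text and substituted values are illustrative):
template = "dat = '<DAT>'\nnodfs = '<NODFS>'\n"
filled = []
for line in template.splitlines(True):
    line = line.replace('<DAT>', 'datA')
    line = line.replace("'<NODFS>'", str(['nodf1.tsv', 'nodf2.tsv']))
    filled.append(line)
script = ''.join(filled)
# script == "dat = 'datA'\nnodfs = ['nodf1.tsv', 'nodf2.tsv']\n"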
def set_tree_paths(self, config):
    for dataset, data in self.datasets.items():
        if dataset in Datasets.filt_raw:
            continue
        if data.phylo:
            odir = get_analysis_folder(config.i_datasets_folder,
                                       'phylo/%s' % dataset)
            tree_nwk = '%s/tree_%s.nwk' % (odir, dataset)
            tree_qza = '%s.qza' % splitext(tree_nwk)[0]
            if data.phylo[0] == 'amplicon':
                intree_qza = '%s_inTree.qza' % splitext(tree_nwk)[0]
                data.tree = (intree_qza, tree_qza, tree_nwk)
            else:
                data.tree = ('', tree_qza, tree_nwk)
def get_precomputed_taxonomy(self, config, method='sklearn'):
    for dataset_, data in self.datasets.items():
        dataset = self._get_filt_raw(dataset_)
        analysis_folder = get_analysis_folder(config.i_datasets_folder,
                                              'taxonomy/%s' % dataset)
        tax_qza = '%s/tax_%s_%s.qza' % (analysis_folder, dataset, method)
        tax_tsv = '%s.tsv' % splitext(tax_qza)[0]
        if isfile(tax_tsv) and isfile(tax_qza):
            data.tax = ['', tax_qza, tax_tsv]
        tax_qza = '%s/tax_%s.qza' % (analysis_folder, dataset)
        tax_tsv = '%s.tsv' % splitext(tax_qza)[0]
        if isfile(tax_tsv) and isfile(tax_qza):
            data.tax = ['', tax_qza, tax_tsv]
def get_dm_meta(dat, dm, meta, raref, metric, i_datasets_folder, skip):
    dm_rgx = '%s%s*/*%s_DM.qza' % (dirname(dm), raref, metric)
    dm_rgx_glob = glob.glob(dm_rgx)
    if len(dm_rgx_glob) >= 1:
        dm = sorted(dm_rgx_glob)[0]
    else:
        skip += 1
    meta_dir = get_analysis_folder(i_datasets_folder, 'rarefy/%s' % dat)
    meta_rgx = '%s/meta_%s%s*tsv' % (meta_dir, dat, raref)
    meta_rgx_glob = glob.glob(meta_rgx)
    if len(meta_rgx_glob) >= 1:
        meta = sorted(meta_rgx_glob)[0]
    else:
        skip += 1
    return dm, meta
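# get_dm_meta resolves files with glob patterns and keeps the
# lexicographically first hit. A minimal, self-contained sketch of that
# lookup (the pattern below is illustrative):
import glob

hits = glob.glob('/tmp/meta_datA_raref*.tsv')
if hits:
    meta = sorted(hits)[0]  # deterministic pick among multiple matches
else:
    meta = None  # caller keeps the path it already had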
def set_taxonomy_paths(self, config, method):
    for dataset_, data in self.datasets.items():
        dataset = self._get_filt_raw(dataset_)
        odir = get_analysis_folder(config.i_datasets_folder,
                                   'taxonomy/%s' % dataset)
        if data.phylo and data.phylo[0] == 'amplicon':
            tax_tsv = '%s/tax_%s_%s.tsv' % (odir, dataset, method)
            meth = method
        else:
            tax_tsv = '%s/tax_%s.tsv' % (odir, dataset)
            if data.phylo and data.phylo[0] == 'wol':
                meth = 'wol'
            else:
                meth = 'feat'
        tax_qza = '%s.qza' % splitext(tax_tsv)[0]
        data.tax = [meth, tax_qza, tax_tsv]
def set_rarefaction_paths(self, config):
    for dataset, data in self.datasets.items():
        if dataset in Datasets.filt_raw:
            data.raref_depths = self.datasets[
                Datasets.filt_raw[dataset]].raref_depths
        if not data.raref_depths:
            continue
        odir = get_analysis_folder(config.i_datasets_folder,
                                   'rarefy/%s' % dataset)
        for depth_ in data.raref_depths[1]:
            depth = '_raref%s' % get_digit_depth(depth_,
                                                 data.data[0].sum())
            data.tsv.append('%s/tab_%s%s.tsv' % (odir, dataset, depth))
            data.qza.append('%s/tab_%s%s.qza' % (odir, dataset, depth))
            data.meta.append('%s/meta_%s%s.tsv' % (odir, dataset, depth))
            data.rarefs.append(depth)
def get_precomputed_trees(i_datasets_folder: str, datasets: dict,
                          datasets_filt_map: dict, datasets_phylo: dict,
                          trees: dict) -> None:
    """
    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_filt_map: mapping filtered dataset name -> raw
        dataset name.
    :param datasets_phylo: to be updated with
        ('tree_to_use', 'corrected_or_not') per dataset.
    :param trees: to be updated with the tree to use for a dataset's
        phylogenetic analyses.
    """
    for dat in datasets:
        if dat in datasets_filt_map:
            dat_tree = datasets_filt_map[dat]
        else:
            dat_tree = dat
        analysis_folder = get_analysis_folder(i_datasets_folder,
                                              'phylo/%s' % dat_tree)
        tree_qza = '%s/tree_%s.qza' % (analysis_folder, dat_tree)
        if isfile(tree_qza):
            trees[dat] = ('', tree_qza)
            datasets_phylo[dat] = ('precpu', 0)
def get_tax_fp(i_datasets_folder: str, omic: str,
               input_to_filtered: dict) -> str:
    tax_dir = get_analysis_folder(i_datasets_folder, 'taxonomy')
    omic_taxs = [x for x, y in input_to_filtered.items() if y == omic]
    if len(omic_taxs):
        omic_tax_ = omic_taxs[0]
        if '__raref' in omic_tax_:
            omic_tax = '__raref'.join(omic_tax_.split('__raref')[:-1])
        else:
            omic_tax = omic_tax_
    else:
        print('\nNo taxonomy file for "%s"' % omic)
        return ''
    omic_tax_fps = glob.glob('%s/%s/tax_%s*.tsv' % (tax_dir, omic_tax,
                                                    omic_tax))
    if len(omic_tax_fps):
        omic_tax_fp = omic_tax_fps[0]
    else:
        omic_tax_fp = ''
    return omic_tax_fp
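# Stripping the trailing '__raref<depth>' token with split/join keeps
# any earlier occurrences intact, which a plain str.replace would not.
# A minimal, self-contained sketch (the dataset name is illustrative):
name = 'datA__raref1000'
if '__raref' in name:
    base = '__raref'.join(name.split('__raref')[:-1])
    # base == 'datA'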
def get_datasets_paths(self):
    datasets_paths = self.mmvecs.copy()
    datasets_paths = datasets_paths.drop(columns=['pair', 'omic'])
    datasets_paths = datasets_paths.loc[
        ~datasets_paths.astype(str).duplicated()]
    paths = []
    for r, row in datasets_paths.iterrows():
        dataset = row['dataset']
        filter = row['filter']
        subset = row['subset']
        odir = get_analysis_folder(
            self.config.i_datasets_folder,
            'mmvec/datasets/%s/%s' % (dataset, subset))
        rad = '%s_%s' % (dataset, filter)
        tsv = '%s/tab_%s.tsv' % (odir, rad)
        qza = '%s.qza' % splitext(tsv)[0]
        meta = '%s/meta_%s.tsv' % (odir, rad)
        paths.append([dataset, filter, subset, tsv, qza, meta])
    datasets_paths = datasets_paths.merge(
        pd.DataFrame(paths, columns=['dataset', 'filter', 'subset',
                                     'tsv', 'qza', 'meta']),
        on=['dataset', 'filter', 'subset'], how='left')
    return datasets_paths
def create_songbird_feature_metadata(self):
    if self.q2s_pd.shape[0]:
        q2_pd = self.q2s_pd.loc[(self.q2s_pd.pair == 'no_pair') &
                                (self.q2s_pd.Pseudo_Q_squared > 0)]
        for dat, dataset_pd in q2_pd.groupby('dataset'):
            dataset_sbs = []
            for r, row in dataset_pd.iterrows():
                pr = 'pair=%s' % row['pair']
                fr = 'filter=%s' % row['filter']
                sb = 'subset=%s' % row['subset']
                ml = 'model=%s' % row['model']
                st = 'sb_filt=%s' % row['songbird_filter']
                ps = 'params=%s' % row['parameters']
                be = 'baseline=%s' % row['baseline']
                q2 = '[Q2=%s]' % row['Pseudo_Q_squared']
                diffs = row['differentials']
                sb_pd = pd.read_csv(diffs, index_col=0, sep='\t')
                sb_pd.columns = ['%s %s: %s' % (
                    '__'.join([dat, pr, fr, sb, ml, st, ps, be]), q2, x)
                    for x in sb_pd.columns]
                dataset_sbs.append(sb_pd)
            if len(dataset_sbs):
                dataset_sbs_pd = pd.concat(dataset_sbs, axis=1,
                                           sort=False)
                odir = get_analysis_folder(self.config.i_datasets_folder,
                                           'songbird/%s' % dat)
                fpo_tsv = '%s/differentials_%s.tsv' % (odir, dat)
                fpo_qza = '%s/differentials_%s.qza' % (odir, dat)
                dataset_sbs_pd = dataset_sbs_pd.reset_index()
                dataset_sbs_pd = dataset_sbs_pd.rename(columns={
                    dataset_sbs_pd.columns.tolist()[0]: 'Feature ID'})
                dataset_sbs_pd.to_csv(fpo_tsv, index=True, sep='\t')
                run_import(fpo_tsv, fpo_qza, 'FeatureData[Differential]')
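# Each differentials table keeps one column per covariate; prefixing the
# columns with the model settings lets them all live side by side in one
# wide table, as done above. A minimal, self-contained sketch (the data
# and model labels are illustrative):
import pandas as pd

sb1 = pd.DataFrame({'var1': [0.1, -0.2]}, index=['featA', 'featB'])
sb2 = pd.DataFrame({'var1': [0.3, 0.0]}, index=['featA', 'featB'])
sb1.columns = ['datA__model=m1 [Q2=0.4]: %s' % x for x in sb1.columns]
sb2.columns = ['datA__model=m2 [Q2=0.2]: %s' % x for x in sb2.columns]
wide = pd.concat([sb1, sb2], axis=1, sort=False)
# 'wide' has two uniquely-named columns indexed by feature ID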
def check_common_datasets(i_datasets_folder: str, mmvec_pairs: dict,
                          mmvec_filtering: dict, filt_datasets_pass: dict,
                          input_to_filtered: dict,
                          mmvec_subsets: dict) -> (dict, list):
    """
    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param mmvec_pairs: pairs of datasets to run mmvec on.
    :param mmvec_filtering: per-pair prevalence/abundance filtering.
    :param filt_datasets_pass: filtered datasets that passed the checks.
    :param input_to_filtered: mapping input dataset name -> filtered name.
    :param mmvec_subsets: metadata subsets to apply per pair.
    """
    common_datasets_pass = {}
    for pair, pair_datasets in mmvec_pairs.items():
        pair_filtering = mmvec_filtering[pair]
        common_datasets_pass[pair] = []
        data_dir_ = get_analysis_folder(i_datasets_folder,
                                        'mmvec/common/data/%s' % pair)
        meta_dir_ = get_analysis_folder(i_datasets_folder,
                                        'mmvec/common/metadata/%s' % pair)
        (omic1_, bool1), (omic2_, bool2) = pair_datasets
        if omic1_ not in input_to_filtered or \
                omic2_ not in input_to_filtered:
            continue
        omic1 = input_to_filtered[omic1_]
        omic2 = input_to_filtered[omic2_]
        if (omic1, bool1) not in filt_datasets_pass or \
                (omic2, bool2) not in filt_datasets_pass:
            continue
        for case_var, case_vals_list in mmvec_subsets.items():
            for case_vals in case_vals_list:
                case = get_case(case_vals, case_var)
                data_dir = data_dir_ + '/' + case
                meta_dir = meta_dir_ + '/' + case
                for preval_abund in sorted(pair_filtering):
                    preval_filt1, abund_filter1 = pair_filtering[
                        preval_abund][(omic1_, bool1)]
                    preval_filt2, abund_filter2 = pair_filtering[
                        preval_abund][(omic2_, bool2)]
                    if not filt_datasets_pass[(omic1, bool1)][
                            (case, preval_abund)]:
                        continue
                    if not filt_datasets_pass[(omic2, bool2)][
                            (case, preval_abund)]:
                        continue
                    filt1 = '_'.join([preval_filt1, abund_filter1])
                    filt2 = '_'.join([preval_filt2, abund_filter2])
                    tsv1, qza1, meta1, meta_pd1, sams1 = \
                        filt_datasets_pass[(omic1, bool1)][
                            (case, preval_abund)]
                    tsv2, qza2, meta2, meta_pd2, sams2 = \
                        filt_datasets_pass[(omic2, bool2)][
                            (case, preval_abund)]
                    common_sams = sorted(set(sams1) & set(sams2))
                    if len(common_sams) < 10:
                        print('Not enough samples: %s (%s) vs %s (%s)'
                              ' -> skipping' % (omic1, filt1,
                                                omic2, filt2))
                        continue
                    meta_fp = '%s/meta_%s_%s_%s__%s_%s_%s__%s_%ss.tsv' % (
                        meta_dir, omic1, preval_filt1, abund_filter1,
                        omic2, preval_filt2, abund_filter2, pair,
                        len(common_sams))
                    new_tsv1 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic1, preval_filt1, abund_filter1,
                        pair, len(common_sams))
                    new_qza1 = '%s.qza' % splitext(new_tsv1)[0]
                    new_tsv2 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic2, preval_filt2, abund_filter2,
                        pair, len(common_sams))
                    new_qza2 = '%s.qza' % splitext(new_tsv2)[0]
                    if isfile(meta_fp) and isfile(new_qza1) and \
                            isfile(new_qza2):
                        common_datasets_pass[pair].append(meta_fp)
def run_qemistree(i_datasets_folder: str, datasets: dict, prjct_nm: str,
                  i_qemistree: str, taxonomies: dict, force: bool,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> None:
    """
    :param i_datasets_folder: Path to the folder containing the
        data/metadata subfolders.
    :param datasets: dataset -> [tsv table, meta table]
    :param prjct_nm: Short nick name for your project.
    :param i_qemistree: path to the qemistree folder
        (feature-data and tree).
    :param taxonomies: dataset -> [method, assignment qza]
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment
        (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files
        (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'qemistree')
    job_folder2 = get_job_folder(i_datasets_folder, 'qemistree/chunks')
    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_qemistree_%s%s.sh' % (job_folder, prjct_nm,
                                              filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds in datasets.items():
            feature_data = '%s/feature-data_%s.qza' % (i_qemistree, dat)
            qemistree = '%s/qemistree_%s.qza' % (i_qemistree, dat)
            if not isfile(feature_data) or not isfile(qemistree):
                continue
            out_sh = '%s/run_qemistree_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            odir = get_analysis_folder(i_datasets_folder,
                                       'qemistree/%s' % dat)
            classyfire_qza = '%s/%s-classyfire.qza' % (odir, dat)
            classyfire_tsv = '%s.tsv' % splitext(classyfire_qza)[0]
            with open(out_sh, 'w') as cur_sh:
                if force or not isfile(classyfire_tsv):
                    write_qemistree(feature_data, classyfire_qza,
                                    classyfire_tsv, qemistree, cur_sh)
                    written += 1
                if isfile(classyfire_tsv):
                    odir = get_analysis_folder(i_datasets_folder,
                                               'taxonomy/%s' % dat)
                    out_rad = '%s/tax_%s' % (odir, dat)
                    tax_qza = '%s.qza' % out_rad
                    tax_tsv = '%s.tsv' % out_rad
                    classyfire_pd = pd.read_csv(classyfire_tsv, header=0,
                                                sep='\t')
                    # distinct handle: 'o' is the main script file above
                    with open(tax_tsv, 'w') as o_tax:
                        cols = ['id', 'kingdom', 'superclass', 'class',
                                'subclass', 'direct_parent']
                        o_tax.write('Feature ID\tTaxon\n')
                        for row in classyfire_pd[cols].values:
                            o_tax.write('%s\t%s\n' % (
                                row[0], '; '.join(row[1:])))
                    run_export(tax_tsv, tax_qza, 'FeatureData[Taxonomy]')
                    taxonomies[dat] = ['direct_parent', tax_qza]
                    written += 1
                else:
                    print('[Warning] Maybe run qemistree first and then '
                          're-run the pipeline to have the classyfire '
                          'taxonomy included in the barplots!')
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.qmstr.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"],
                         run_params["n_nodes"], run_params["n_procs"],
                         run_params["mem_num"], run_params["mem_dim"],
                         chmod, written, 'single', o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'qemistree',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if written:
        print_message('# Make qemistree classyfire classifications',
                      'sh', run_pbs, jobs)
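# The classyfire levels are flattened into QIIME 2's two-column taxonomy
# format ('Feature ID' / 'Taxon', levels joined by '; '), as above.
# A minimal, self-contained sketch of that conversion (the data and
# output path are illustrative):
import pandas as pd

classyfire_pd = pd.DataFrame({
    'id': ['f1'], 'kingdom': ['Organic compounds'],
    'superclass': ['Lipids'], 'class': ['Steroids'],
    'subclass': ['Bile acids'], 'direct_parent': ['Cholic acids']})
cols = ['id', 'kingdom', 'superclass', 'class', 'subclass',
        'direct_parent']
with open('/tmp/tax_demo.tsv', 'w') as o_tax:
    o_tax.write('Feature ID\tTaxon\n')
    for row in classyfire_pd[cols].values:
        o_tax.write('%s\t%s\n' % (row[0], '; '.join(row[1:])))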
def run_sourcetracking(i_datasets_folder: str, datasets: dict,
                       p_sourcetracking_config: str,
                       datasets_rarefs: dict, force: bool, prjct_nm: str,
                       qiime_env: str, chmod: str, noloc: bool,
                       slurm: bool, run_params: dict, filt_raref: str,
                       split: bool, jobs: bool, chunkit: int) -> None:
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'sourcetracking/chunks')
    sourcetracking_dicts = get_sourcetracking_config(
        p_sourcetracking_config)
    sourcetracking_sourcesink = sourcetracking_dicts[0]
    sourcetracking_filtering = sourcetracking_dicts[1]
    sourcetracking_params = sourcetracking_dicts[2]
    main_cases_dict = sourcetracking_dicts[3]
    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        if dat in sourcetracking_filtering:
            filters = sourcetracking_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(
                meta, meta_pd, dict(main_cases_dict), 'sourcetracking')
            cur_raref = datasets_rarefs[dat][idx]
            out_import_sh = ('%s/run_import_sourcetracking_%s_%s%s%s'
                             '.sh') % (job_folder2, prjct_nm, dat,
                                       filt_raref, cur_raref)
            imports = set()
            odir = get_analysis_folder(i_datasets_folder,
                                       'sourcetracking/%s' % dat)
            for method in sourcetracking_params['method']:
                out_sh = '%s/run_sourcetracking_%s_%s%s%s_%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                    method)
                for case_var, case_vals_list in cases_dict.items():
                    for filt, (fp, fa) in filters.items():
                        cur_sh = ('%s/run_sourcetracking_%s_%s_%s%s%s'
                                  '_%s_%s.sh') % (
                            job_folder2, prjct_nm, dat, case_var,
                            filt_raref, cur_raref, method, filt)
                        cur_sh = cur_sh.replace(' ', '-')
                        cur_import_sh = (
                            '%s/run_import_sourcetracking_%s_%s_%s%s%s'
                            '_%s_%s.sh') % (
                            job_folder2, prjct_nm, dat, case_var,
                            filt_raref, cur_raref, method, filt)
                        cur_import_sh = cur_import_sh.replace(' ', '-')
                        all_sh_pbs.setdefault(
                            (dat, out_sh), []).append(cur_sh)
                        all_import_sh_pbs.setdefault(
                            (dat, out_import_sh),
                            []).append(cur_import_sh)
                        run_single_sourcetracking(
                            odir, tsv, meta_pd, case_var,
                            sourcetracking_params, method, imports,
                            sourcetracking_sourcesink, case_vals_list,
                            cur_sh, cur_import_sh, force, filt,
                            cur_raref, fp, fa, run_params["n_nodes"],
                            run_params["n_procs"])
    job_folder = get_job_folder(i_datasets_folder, 'sourcetracking')
    main_sh = write_main_sh(
        job_folder,
        '3_run_import_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mpt.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# import sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# import sourcetracking')
        print_message('', 'sh', main_sh, jobs)
    main_sh = write_main_sh(
        job_folder, '3_run_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod,
        noloc, slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# sourcetracking')
        print_message('', 'sh', main_sh, jobs)
def songbird(self) -> None:
    """Main script for the creation of songbird jobs.
    It iterates over the rows of the table created upfront and over
    each combination of parameters, and collects the output info for
    potential reuse in figure generation and post-analysis.
    """
    cmds = {}
    mess = set()
    songbird = []
    dat_cmds, dat_bcmds = {}, {}
    params_pd = self.get_params_combinations()
    for r, row in self.songbirds.iterrows():
        qza, pair, meta_fp = row['qza'], row['pair'], row['meta']
        dat, filt, subset = row['dataset'], row['filter'], row['subset']
        if dat not in self.songbird_models:
            continue
        dat_pair, pair_dir = self.get_dat_pair_dir(dat, pair)
        meta_pd = read_meta_pd(meta_fp)
        models = self.check_metadata_models(
            meta_fp, meta_pd, self.songbird_models[dat])
        row_params_pd = params_pd.copy()
        self.process_params_combinations(dat, meta_pd, row_params_pd,
                                         mess)
        for p, params in row_params_pd.iterrows():
            params_dir = self.get_params_dir(params)
            baselines, model_baselines = {}, {'1': '1'}
            for modx, model in enumerate(models.keys()):
                formula, meta_vars, drop = models[model]
                datdir, odir, new_qza, new_meta = self.get_main_dirs(
                    pair_dir, filt, subset, params_dir, model,
                    self.config)
                self.write_new_meta(meta_pd, new_meta, meta_vars, drop,
                                    params)
                if dat in self.models_baselines and \
                        model in self.models_baselines[dat]:
                    model_baselines = self.models_baselines[dat][model]
                for mdx, model_baseline in enumerate(model_baselines):
                    bformula = model_baselines[model_baseline]
                    bodir = get_analysis_folder(
                        self.config.i_datasets_folder,
                        'songbird/%s/b-%s' % (datdir, model_baseline))
                    out_paths = self.get_out_paths(
                        odir, bodir, model_baseline, baselines)
                    cmd, bcmd = songbird_cmd(
                        qza, new_qza, new_meta, params, formula,
                        bformula, out_paths)
                    songbird.append([
                        dat, filt,
                        '%s_%s' % (params_dir.replace('/', '__'), model),
                        subset, out_paths['diff'], model_baseline,
                        out_paths['html'], pair])
                    if cmd:
                        dat_cmds.setdefault(dat, []).append(cmd)
                    if bcmd:
                        dat_bcmds.setdefault(dat, []).append(bcmd)
    # first come the scripts generating the (reused) baseline models
    for dat in dat_bcmds:
        if dat_bcmds[dat]:
            cmds.setdefault(dat, []).extend(dat_bcmds[dat])
    # and then the scripts generating the actual models
    for dat in dat_cmds:
        if dat_cmds[dat]:
            cmds.setdefault(dat, []).extend(dat_cmds[dat])
    if songbird:
        self.get_songbird_pd(songbird)
    self.show_models_issues()
    self.register_command('songbird', cmds)
    self.summarize_songbirds()
    self.create_songbird_feature_metadata()
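# Baseline songbird models are shared across models, so their commands
# are queued before the per-model commands that reuse them. A minimal
# sketch of the two-pass merge used above (the command strings are
# illustrative placeholders, not complete songbird invocations):
cmds = {}
dat_bcmds = {'datA': ['songbird ... --formula "1" ...']}
dat_cmds = {'datA': ['songbird ... --formula "var1" ...']}
for dat, bcmds in dat_bcmds.items():
    cmds.setdefault(dat, []).extend(bcmds)
for dat, mcmds in dat_cmds.items():
    cmds.setdefault(dat, []).extend(mcmds)
# cmds['datA'] now runs the baseline before the model that reuses it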
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict,
                   p_nestedness_groups: str, datasets_rarefs: dict,
                   force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                   noloc: bool, slurm: bool, split: bool,
                   run_params: dict, filt_raref: str, jobs: bool,
                   chunkit: int) -> (dict, list, dict):
    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')
    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print('Must provide the path to the Nestedness software '
              '(containing bin/Autocorrelation.jar)')
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and \
            isfile(nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print('Must provide the path to the Nestedness software '
                  '(containing bin/Autocorrelation.jar)')
            return {}
    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)
    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        stats_tax_dat, level = get_stats_tax_dat(dat,
                                                 datasets_collapsed_map)
        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(
                i_datasets_folder, 'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in \
                    metrics_groups_metas_qzas_dms_trees.items():
                for group, metas_qzas_mat_qzas_trees in \
                        groups_metas_qzas_dms_trees.items():
                    meta, qza, mat_qza, tree = \
                        metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(
                                case_vals, case_var).replace(' ', '_')
                            cur_sh = ('%s/run_nestedness_%s%s_%s_%s%s'
                                      '.sh') % (
                                job_folder2, dat, cur_raref, group,
                                case, filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            all_sh_pbs.setdefault(
                                (dat, out_sh), []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd,
                                nodfs, nulls, modes, cur_sh, qza, case,
                                case_var, case_vals, binary, params,
                                force)
                            nodfs_fps.setdefault(
                                stats_tax_dat,
                                []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                break
            nestedness_res[dat].append(nestedness_raref)
def run_taxonomy(method: str, i_datasets_folder: str, datasets: dict,
                 datasets_read: dict, datasets_phylo: dict,
                 datasets_features: dict, datasets_filt_map: dict,
                 i_classifier: str, taxonomies: dict, force: bool,
                 prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                 slurm: bool, run_params: dict, filt_raref: str,
                 jobs: bool, chunkit: int) -> None:
    """
    Parameters
    ----------
    method : str
        Taxonomic classification method.
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders.
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path].
    datasets_read : dict
        Mapping dataset name -> [data table, metadata table]
    datasets_phylo : dict
        To be updated with ('tree_to_use', 'corrected_or_not')
        per dataset.
    datasets_features : dict
        Mapping dataset name -> list of feature names in the
        dataset tsv / biom file.
    datasets_filt_map : dict
        Mapping filtered dataset name -> raw dataset name.
    i_classifier : str
        Path to the taxonomic classifier.
    taxonomies : dict
        Mapping dataset name -> [method, assignment qza]
    force : bool
        Force the re-writing of scripts for all commands.
    prjct_nm : str
        Short nick name for your project.
    qiime_env : str
        Name of your qiime2 conda environment (e.g. qiime2-2019.10).
    chmod : str
        Whether to change permission of output files (default: 744).
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int
    """
    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')
    amplicon_datasets = [
        dat for dat, (tree, correction) in datasets_phylo.items()
        if tree == 'amplicon']
    wol_datasets = [
        dat for dat, (tree, correction) in datasets_phylo.items()
        if tree == 'wol']
    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_%s%s.sh' % (job_folder, prjct_nm,
                                             filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets_read.items():
            out_sh = '%s/run_taxonomy_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            if dat in datasets_filt_map:
                taxonomies[dat] = taxonomies[datasets_filt_map[dat]]
                continue
            written = 0
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    if idx:
                        continue
                    tsv, meta = datasets[dat][idx]
                    if not isinstance(tsv_meta_pds[0], pd.DataFrame) and \
                            tsv_meta_pds[0] == 'raref':
                        if not isfile(tsv):
                            print('Must have run rarefaction to use it '
                                  'further...\nExiting')
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta,
                                                                 tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = tsv_meta_pds
                    odir = get_analysis_folder(i_datasets_folder,
                                               'taxonomy/%s' % dat)
                    out_rad = '%s/tax_%s' % (odir, dat)
                    if dat in amplicon_datasets:
                        out_qza = '%s_%s.qza' % (out_rad, method)
                        out_tsv = '%s.tsv' % splitext(out_qza)[0]
                        taxonomies[dat] = [method, out_qza, out_tsv]
                        if not i_classifier:
                            print('No classifier passed for 16S '
                                  'data\nExiting...')
                            continue
                        cmd = run_taxonomy_amplicon(
                            dat, i_datasets_folder, force, tsv_pd,
                            out_qza, out_tsv, i_classifier)
                    else:
                        out_qza = '%s.qza' % out_rad
                        out_tsv = '%s.tsv' % out_rad
                        if dat in wol_datasets:
                            cur_datasets_features = datasets_features[dat]
                            taxonomies[dat] = ['wol', out_qza, out_tsv]
                            cmd = run_taxonomy_wol(
                                force, tsv_pd, out_qza, out_tsv,
                                cur_datasets_features)
                        else:
                            if len([x for x in tsv_pd.index
                                    if str(x).isdigit()]) == \
                                    tsv_pd.shape[0]:
                                continue
                            taxonomies[dat] = ['feat', out_qza, out_tsv]
                            cmd = run_taxonomy_others(
                                force, tsv_pd, out_qza, out_tsv)
                    if cmd:
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n\n' % cmd)
                        main_written += 1
                        written += 1
            if written:
                to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(
                        out_sh, out_pbs,
                        '%s.tx.sklrn.%s%s' % (prjct_nm, dat, filt_raref),
                        qiime_env, run_params["time"],
                        run_params["n_nodes"], run_params["n_procs"],
                        run_params["mem_num"], run_params["mem_dim"],
                        chmod, written, 'single', o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if main_written:
        print_message('# Classify features using classify-sklearn',
                      'sh', run_pbs, jobs)
formula, meta_vars, drop = models[model]
datdir = '%s/%s/%s/%s/%s' % (dat_pair_path, filt, case, params, model)
odir = get_analysis_folder(i_datasets_folder, 'songbird/%s' % datdir)
new_qza = '%s/tab.qza' % odir
new_meta = '%s/metadata.tsv' % odir
train_column, train_samples = get_metadata_train_test(
    meta, meta_pd, list(meta_vars), new_meta, train, drop)
if not train_column:
    # flag the model as invalid with an empty marker file
    new_meta_invalid = '%s/metadata_invalid' % odir
    with open(new_meta_invalid, 'w') as invalid:
        pass
    continue
baselines = {}
metadatas = {}
model_baselines = {'1': '"1"'}
if dat in models_baselines and model in models_baselines[
def run_barplot(i_datasets_folder: str, datasets: dict, taxonomies: dict,
                force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                noloc: bool, slurm: bool, run_params: dict,
                filt_raref: str, jobs: bool, chunkit: int) -> None:
    """Visualize taxonomy with an interactive bar plot.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    taxonomies : dict
        Mapping dataset name -> [classification_method, tax_qza]
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Short nick name for your project
    qiime_env : str
        Name of a qiime2 conda environment
    chmod : str
        Whether to change permission of output files (default: 744)
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int
    """
    job_folder = get_job_folder(i_datasets_folder, 'barplot')
    job_folder2 = get_job_folder(i_datasets_folder, 'barplot/chunks')
    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_barplot_%s%s.sh' % (job_folder, prjct_nm,
                                            filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            out_sh = '%s/run_barplot_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    if dat not in taxonomies:
                        continue
                    method, tax_qza, tax_tsv = taxonomies[dat]
                    if not method:
                        method = 'taxofromfile'
                    qza = '%s.qza' % splitext(tsv)[0]
                    odir = get_analysis_folder(i_datasets_folder,
                                               'barplot/%s' % dat)
                    out_qzv = '%s/bar_%s_%s.qzv' % (odir, dat, method)
                    if force or not isfile(out_qzv):
                        write_barplots(out_qzv, qza, meta, tax_qza,
                                       cur_sh)
                        written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.brplt.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"],
                         run_params["n_nodes"], run_params["n_procs"],
                         run_params["mem_num"], run_params["mem_dim"],
                         chmod, written, 'single', o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'barplot',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc,
                      slurm, jobs, chunkit, None)
    if written:
        print_message('# Make sample compositions barplots', 'sh',
                      run_pbs, jobs)
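# write_barplots (defined elsewhere in this package) is expected to emit
# a 'qiime taxa barplot' call into the chunk script. A minimal sketch of
# what such a command line looks like (the paths are illustrative):
cmd = ' '.join([
    'qiime taxa barplot',
    '--i-table /path/tab_datA.qza',
    '--i-taxonomy /path/tax_datA.qza',
    '--m-metadata-file /path/meta_datA.tsv',
    '--o-visualization /path/bar_datA_sklearn.qzv'])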