def mmbird(self, paired_datasets, differentials):
        if not paired_datasets.mmvec_pd.shape[0]:
            print('No mmvec output detected...')
            return None
        self.prep_mmvec(paired_datasets.mmvec_pd)
        if differentials.songbird_pd.shape[0]:
            songbird_pd = self.prep_songbird(differentials.songbird_pd)
            self.merge_mmvec_songbird(songbird_pd)

        self.get_taxo_pds()
        self.get_omics_songbirds_taxa()
        self.get_mmvec_res()
        self.show_mmvec_issues()

        omics_pairs = [
            tuple(x) for x in self.mmvec_songbird_pd[
                ['omic_subset_filt1', 'omic_subset_filt2']].values.tolist()
        ]
        pc_sb_correlations = self.get_pair_cmds(omics_pairs)

        if len(pc_sb_correlations):
            out_folder = get_analysis_folder(self.config.i_datasets_folder,
                                             'mmbird')
            out_correlations = '%s/pc_vs_songbird_correlations.tsv' % out_folder
            pc_sb_correlations_pd = pd.concat(pc_sb_correlations)
            if pc_sb_correlations_pd.shape[0]:
                pc_sb_correlations_pd.to_csv(out_correlations,
                                             index=False,
                                             sep='\t')
                print('\t\t==> Written:', out_correlations)
            else:
                print('\t\t==> No good songbird model to '
                      'make correlations with mmvec PCs...')
        self.register_command('mmbird')
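
All of the snippets on this page call a `get_analysis_folder(i_datasets_folder, analysis)` helper that is not reproduced here. A minimal sketch of what such a helper plausibly does, assuming it only joins the base folder with the analysis sub-path and creates the directory if missing (the real package may nest its outputs differently):

import os


def get_analysis_folder(i_datasets_folder: str, analysis: str) -> str:
    # Hypothetical sketch: build '<i_datasets_folder>/<analysis>'
    # (e.g. 'taxonomy/my_dataset') and make sure the directory exists.
    odir = os.path.join(i_datasets_folder, analysis)
    os.makedirs(odir, exist_ok=True)
    return odir
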
Example n. 2
def get_precomputed_taxonomies(i_datasets_folder: str, datasets: dict,
                               datasets_filt_map: dict, taxonomies: dict,
                               method: str) -> None:
    """Update taxonomies dict with file found existing

    Parameters
    ----------
    i_datasets_folder
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    datasets_filt_map : dict
        Mapping filtered dataset name -> raw dataset name
    taxonomies : dict
        Mapping dataset name -> [classification_method, tax_qza, tax_tsv]
    method : str
        Taxonomic classification method (e.g. 'sklearn')
    """
    for dat in datasets:
        dat_raw = get_raw_of_filtered_dataset(dat, datasets_filt_map)
        analysis_folder = get_analysis_folder(i_datasets_folder,
                                              'taxonomy/%s' % dat_raw)

        tax_qza = '%s/tax_%s_%s.qza' % (analysis_folder, dat_raw, method)
        tax_tsv = '%s.tsv' % splitext(tax_qza)[0]
        if isfile(tax_tsv):
            taxonomies[dat] = ['', tax_qza, tax_tsv]

        tax_qza = '%s/tax_%s.qza' % (analysis_folder, dat_raw)
        tax_tsv = '%s.tsv' % splitext(tax_qza)[0]
        if isfile(tax_tsv):
            taxonomies[dat] = ['', tax_qza, tax_tsv]
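
A minimal usage sketch for the function above (paths and dataset names are hypothetical, and the helpers it relies on are assumed to be importable from the package); note that `taxonomies` is filled in place:

taxonomies = {}
get_precomputed_taxonomies(
    i_datasets_folder='/projects/study',                    # hypothetical base folder
    datasets={'dat1': ['tab_dat1.tsv', 'meta_dat1.tsv']},   # hypothetical dataset
    datasets_filt_map={},        # no filtered -> raw dataset mapping here
    taxonomies=taxonomies,
    method='sklearn')
# If 'taxonomy/dat1/tax_dat1_sklearn.tsv' (or 'tax_dat1.tsv') exists on disk,
# taxonomies now maps 'dat1' -> ['', <tax qza path>, <tax tsv path>].
print(taxonomies)
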
 def get_datasets_paths(self):
     paths = []
     if self.songbirds.shape[0]:
         for r, row in self.songbirds.iterrows():
             dataset = row['dataset']
             filter = row['filter']
             subset = row['subset']
             for analysis in ['mmvec', 'songbird']:
                 odir = get_analysis_folder(
                     self.config.i_datasets_folder,
                     '%s/datasets/%s/%s' % (analysis, dataset, subset))
                 rad = '%s_%s' % (dataset, filter)
                 tsv = '%s/tab_%s.tsv' % (odir, rad)
                 qza = '%s.qza' % splitext(tsv)[0]
                 meta = '%s/meta_%s.tsv' % (odir, rad)
                 if isfile(tsv) and isfile(qza) and isfile(meta):
                     paths.append([tsv, qza, meta])
                     break
                 elif analysis == 'songbird':
                     paths.append([tsv, qza, meta])
     if paths:
         self.songbirds = pd.concat([
             self.songbirds,
             pd.DataFrame(paths, columns=['tsv', 'qza', 'meta'])
         ],
                                    axis=1)
Example n. 4
def create_songbird_feature_metadata(i_datasets_folder: str, taxonomies: dict,
                                     q2_pd: pd.DataFrame):

    q2_pd = q2_pd.loc[(q2_pd.pair == 'no_pair') & (q2_pd.Pseudo_Q_squared > 0)]
    for dat in taxonomies.keys():
        dat_q2_pd = q2_pd.loc[q2_pd.dat.str.contains(dat)]
        dat_sbs = []
        for (pair, dat, dataset_filter, subset, model, songbird_filter,
             parameters, baseline, differentials,
             Pseudo_Q_squared) in dat_q2_pd.values:
            sb_pd = pd.read_table(differentials, index_col=0).iloc[1:]
            sb_pd.columns = [
                '%s__%s__%s__%s__%s__%s__%s (Q2=%s): %s' %
                (dat, dataset_filter, subset, model, songbird_filter,
                 parameters, baseline, Pseudo_Q_squared, x)
                for x in sb_pd.columns
            ]
            dat_sbs.append(sb_pd)
        if len(dat_sbs):
            dat_sbs_pd = pd.concat(dat_sbs, axis=1, sort=False)
            odir = get_analysis_folder(i_datasets_folder, 'songbird/%s' % dat)
            fpo_tsv = '%s/sb_%s.tsv' % (odir, dat)
            fpo_qza = '%s/sb_%s.qza' % (odir, dat)
            dat_sbs_pd.reset_index().rename(
                columns={
                    dat_sbs_pd.reset_index().columns.tolist()[0]: 'Feature ID'
                }).to_csv(fpo_tsv, index=True, sep='\t')
            run_import(fpo_tsv, fpo_qza, 'FeatureData[Differential]')
 def summarize_songbirds(self):
     q2s = []
     songbird = get_analysis_folder(self.config.i_datasets_folder,
                                    'songbird')
     for root, dirs, files in os.walk(songbird):
         for fil in files:
             if fil == 'tensorboard.html':
                 path = root + '/' + fil
                 diff = '%s/differentials.tsv' % dirname(root)
                 root_split = root.split('%s/' % songbird)[-1].split('/')
                 d, pr, fr, sb, sr, ps, ml, be = root_split
                 with open(path) as f:
                     for line in f:
                         if 'Pseudo Q-squared' in line:
                             ls = line.split(
                                 'Pseudo Q-squared:</a></strong> ')
                             q2s.append([
                                 pr, d, fr, sb, ml, sr, ps, be, diff,
                                 float(ls[-1].split('<')[0])
                             ])
     if q2s:
         self.q2s_pd = pd.DataFrame(q2s,
                                    columns=[
                                        'pair', 'dataset', 'filter',
                                        'subset', 'model',
                                        'songbird_filter', 'parameters',
                                        'baseline', 'differentials',
                                        'Pseudo_Q_squared'
                                    ])
         q2s_fp = '%s/songbird_q2.tsv' % songbird
         self.q2s_pd.to_csv(q2s_fp, index=False, sep='\t')
         print('\t\t==> Written:', q2s_fp)
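
`summarize_songbirds` above recovers the Pseudo Q-squared value by splitting a line of songbird's `tensorboard.html`; a self-contained, regex-based sketch of the same extraction (the HTML fragment below is hypothetical):

import re

line = '<strong><a href="...">Pseudo Q-squared:</a></strong> 0.1234</p>'  # hypothetical
match = re.search(r'Pseudo Q-squared:</a></strong>\s*([-+0-9.eE]+)', line)
if match:
    q2 = float(match.group(1))   # 0.1234
    print(q2)
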
Example n. 6
def run_taxonomy_amplicon(dat: str, i_datasets_folder: str, force: bool,
                          tsv_pd: pd.DataFrame, out_qza: str, out_tsv: str,
                          i_classifier: str) -> str:
    """
    :param dat: Current dataset.
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param force: Force the re-writing of scripts for all commands.
    :param tsv_pd: Current features table for the current dataset.
    :param out_qza: Taxonomy classification output to generate.
    :param out_tsv: Taxonomy classification output exported.
    :param i_classifier: Path to the taxonomic classifier.
    :return: Qiime2 command lines to run (empty string if nothing is to be written).
    """
    cmd = ''
    if isfile(out_tsv) and not isfile(out_qza):
        cmd += run_import(out_tsv, out_qza, 'FeatureData[Taxonomy]')
    else:
        ref_classifier_qza = get_taxonomy_classifier(i_classifier)
        odir_seqs = get_analysis_folder(i_datasets_folder, 'seqs/%s' % dat)
        out_fp_seqs_rad = '%s/seq_%s' % (odir_seqs, dat)
        out_fp_seqs_fasta = '%s.fasta' % out_fp_seqs_rad
        out_fp_seqs_qza = '%s.qza' % out_fp_seqs_rad
        if force or not isfile(out_fp_seqs_qza):
            cmd += write_seqs_fasta(out_fp_seqs_fasta, out_fp_seqs_qza, tsv_pd)
        if force or not isfile(out_qza):
            cmd += write_taxonomy_sklearn(out_qza, out_fp_seqs_qza,
                                          ref_classifier_qza)
            cmd += run_export(out_qza, out_tsv, '')
    return cmd
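
`run_import` above belongs to the package and is not shown; for orientation, importing a taxonomy TSV into a `FeatureData[Taxonomy]` artifact boils down to the standard `qiime tools import` call, roughly as sketched below (a guess at the command such a helper emits; the real wrapper may add an `--input-format` flag or extra steps):

def import_taxonomy_cmd(out_tsv: str, out_qza: str) -> str:
    # Sketch of the shell command a run_import()-like helper would emit
    # for a 'FeatureData[Taxonomy]' import.
    cmd = 'qiime tools import \\\n'
    cmd += '  --input-path %s \\\n' % out_tsv
    cmd += '  --output-path %s \\\n' % out_qza
    cmd += "  --type 'FeatureData[Taxonomy]'\n"
    return cmd


print(import_taxonomy_cmd('tax_dat1.tsv', 'tax_dat1.qza'))  # hypothetical paths
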
Example n. 7
 def get_common_paths(self):
     cmds = {}
     paths = []
     pfs = ['pair', 'filter', 'subset']
     for (pair, filter, subset), mmvec in self.mmvecs.groupby(pfs):
         data_dir = get_analysis_folder(
             self.config.i_datasets_folder,
             'mmvec/common/data/%s/%s' % (pair, subset))
         meta_dir = get_analysis_folder(
             self.config.i_datasets_folder,
             'mmvec/common/metadata/%s/%s' % (pair, subset))
         mmvec_d = mmvec.iloc[0, :].to_dict()
         dat1, dat2 = mmvec_d['dataset1'], mmvec_d['dataset2']
         prev1, prev2 = mmvec_d['prevalence1'], mmvec_d['prevalence2']
         abun1, abun2 = mmvec_d['abundance1'], mmvec_d['abundance2']
         qza1, meta1 = self.get_dataset_path(dat1, filter, subset)
         qza2, meta2 = self.get_dataset_path(dat2, filter, subset)
         if not isfile(meta1) or not isfile(meta2):
             continue
         meta1_pd, meta2_pd = read_meta_pd(meta1), read_meta_pd(meta2)
         sams = set(meta1_pd.sample_name) & set(meta2_pd.sample_name)
         if len(sams) < 10:
             print('Not enough samples in pair %s: %s (%s) vs %s (%s)' %
                   (pair, mmvec_d['dataset1'], meta1_pd.shape[0],
                    mmvec_d['dataset2'], meta2_pd.shape[0]))
             continue
         meta_fp, new_tsv1, new_qza1, new_tsv2, new_qza2 = self.get_new_fps(
             meta_dir, data_dir, qza1, qza2, dat1, prev1, abun1, dat2,
             prev2, abun2, pair, len(sams), cmds)
         meta_subset = get_meta_subset(meta1_pd, meta2_pd, sams)
         meta_subset.to_csv(meta_fp, index=False, sep='\t')
         paths.append([
             pair, filter, subset, sams, meta_fp, new_tsv1, new_tsv2,
             new_qza1, new_qza2
         ])
         print('\t\t\t* [TODO]', pair, filter, subset, ':', dat1, 'vs',
               dat2, '(%s samples)' % meta_subset.shape[0])
     if paths:
         common_paths_pd = pd.DataFrame(
             paths,
             columns=(pfs + [
                 'common_sams', 'meta_fp', 'new_tsv1', 'new_tsv2',
                 'new_qza1', 'new_qza2'
             ]))
         self.mmvecs = self.mmvecs.merge(common_paths_pd,
                                         on=['pair', 'filter', 'subset'])
     self.register_command('mmvec_paired_imports', cmds)
 def set_seqs_paths(self, config):
     for dataset, data in self.datasets.items():
         if data.phylo and data.phylo[0] == 'amplicon':
             odir = get_analysis_folder(config.i_datasets_folder,
                                        'seqs/%s' % dataset)
             seqs_fas = '%s/seq_%s.fasta' % (odir, dataset)
             seqs_qza = '%s.qza' % splitext(seqs_fas)[0]
             data.seqs = (seqs_qza, seqs_fas)
 def get_main_dirs(pair_dir, filt, subset, params_dir, model,
                   config) -> tuple:
     datdir = '%s/%s/%s/%s/%s' % (pair_dir, filt, subset, params_dir, model)
     odir = get_analysis_folder(config.i_datasets_folder,
                                'songbird/%s' % datdir)
     new_qza = '%s/tab.qza' % odir
     new_meta = '%s/metadata.tsv' % odir
     return datdir, odir, new_qza, new_meta
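
A usage sketch for `get_main_dirs` above, showing the songbird output layout it produces (all values hypothetical; `config` is assumed to be an AnalysesConfig-like object with `i_datasets_folder` set):

datdir, odir, new_qza, new_meta = get_main_dirs(
    'dat1/no_pair', '0.1-0.001', 'ALL', 'batches-20__epochs-2000', 'model1', config)
# datdir  -> 'dat1/no_pair/0.1-0.001/ALL/batches-20__epochs-2000/model1'
# odir    -> '<analysis folder>/songbird/<datdir>'
# new_qza -> '<odir>/tab.qza' and new_meta -> '<odir>/metadata.tsv'
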
def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, split: bool, run_params: dict,
                       filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):

    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)

    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                  filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in \
                    metrics_groups_metas_qzas_dms_trees.items():
                for group, metas_qzas_mat_qzas_trees in \
                        groups_metas_qzas_dms_trees.items():
                    for (meta, qza, mat_qza,
                         tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(case_vals,
                                                case_var).replace(' ', '_')
                                cur_sh = '%s/run_decay_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_raref, metric, group,
                                    case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh, mat_qza,
                                    case, modes, force, run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
 def get_precomputed_trees(self, config):
     for dataset_, data in self.datasets.items():
         dataset = self._get_filt_raw(dataset_)
         analysis_folder = get_analysis_folder(config.i_datasets_folder,
                                               'phylo/%s' % dataset)
         tree_qza = '%s/tree_%s.qza' % (analysis_folder, dataset)
         tree_nwk = '%s.nwk' % splitext(tree_qza)[0]
         if isfile(tree_nwk) and isfile(tree_qza):
             data.tree = ('', tree_qza, tree_nwk)
             data.phylo = ('precpu', 0)
Example n. 12
    def mmvec(self) -> None:
        """Main script for the creation of mmvec jobs.
        It iterates over the rows of the table created
        upfront and over each combination of parameters
        and collect the output info for potential reuse
        in figure generation and post-analysis.

        Parameters
        ----------
        config : Class instance of AnalysesConfig
            Contains all the routine analyses config info.
        """
        cmds = {}
        mess = set()
        mmvec = []
        params_pd = self.get_params_combinations()
        for r, row in self.mmvecs.iterrows():
            self.process_params_combinations(row, params_pd, mess)
            pair, filter, subset = row['pair'], row['filter'], row['subset']
            d1, p1, a1 = row['dataset1'], row['prevalence1'], row['abundance1']
            d2, p2, a2 = row['dataset2'], row['prevalence2'], row['abundance2']
            for p, params in params_pd.iterrows():
                res_dir = self.get_res_dir(params)
                odir = get_analysis_folder(
                    self.config.i_datasets_folder,
                    'mmvec/paired/%s/%s/%s_%s-%s__%s_%s-%s/%s' %
                    (pair, subset, d1, p1, a1, d2, p2, a2, res_dir))
                mod_dir, mod_rnk, mod_rdn, mod_stt = self.get_out(
                    odir, 'model')
                nul_dir, nul_rnk, nul_rdn, nul_stt = self.get_out(odir, 'null')
                summary = '%s/paired-summary.qzv' % odir
                mmvec.append([
                    pair, filter, subset, d1, d2, p1, a1, p2, a2,
                    len(row['common_sams']), row['meta_fp'], row['new_tsv1'],
                    row['new_tsv2'], row['new_qza1'], row['new_qza2'],
                    'mmvec_out__%s' % res_dir, odir
                ])
                if self.config.force or not isfile(summary):
                    cmd = write_mmvec_cmd(
                        row['meta_fp'], row['new_qza1'], row['new_qza2'],
                        res_dir, mod_dir, nul_dir, mod_rnk, mod_rdn, mod_stt,
                        nul_rnk, nul_rdn, nul_stt, summary, params['batches'],
                        params['learns'], params['epochs'],
                        params['input_prior'], params['output_prior'],
                        params['thresh_feats'], params['latent_dims'],
                        params['train_column'], params['n_examples'],
                        params['summary_interval'], self.config.gpu,
                        self.config.standalone, self.config.qiime_env)
                    cmds.setdefault(row['pair'], []).append(cmd)
        if mmvec:
            self.get_mmvec_pd(mmvec)
        self.register_command('mmvec', cmds)
Example n. 13
def nestedness_nodfs(i_datasets_folder: str, nodfs_fps: dict, collapsed: dict,
                     filt_raref: str, prjct_nm: str, qiime_env: str,
                     chmod: str, noloc: bool, slurm: bool, split: bool,
                     run_params: dict, jobs: bool, chunkit: int) -> None:

    RESOURCES = pkg_resources.resource_filename("routine_qiime2_analyses",
                                                "resources")
    nestedness_nodfs_fp = '%s/nestedness_nodfs.py' % RESOURCES

    job_folder2 = get_job_folder(i_datasets_folder,
                                 'nestedness_figures/chunks')

    all_sh_pbs = {}
    for dat, nodfs in nodfs_fps.items():

        out_sh = '%s/run_nestedness_nodfs_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                         dat, filt_raref)
        out_py = out_sh.replace('.sh', '.py')
        cur_sh = '%s/run_nestedness_nodfs_%s%s_tmp.sh' % (job_folder2, dat,
                                                          filt_raref)
        cur_sh = cur_sh.replace(' ', '-')
        with open(cur_sh, 'w') as o:
            o.write('python3 %s\n' % out_py)
        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)

        # value to edit in template
        odir = get_analysis_folder(i_datasets_folder,
                                   'nestedness/%s%s' % (dat, filt_raref))
        with open(out_py, 'w') as o, open(nestedness_nodfs_fp) as f:
            for line in f:
                line_edit = line
                if '<DAT>' in line:
                    line_edit = line_edit.replace('<DAT>', dat)
                if '<ODIR>' in line:
                    line_edit = line_edit.replace('<ODIR>', odir)
                if '<NODFS>' in line:
                    line_edit = line_edit.replace("'<NODFS>'", str(nodfs))
                if '<COLLAPSED>' in line:
                    line_edit = line_edit.replace("'<COLLAPSED>'",
                                                  str(collapsed))
                o.write(line_edit)

    job_folder = get_job_folder(i_datasets_folder, 'nestedness_figures')
    main_sh = write_main_sh(
        job_folder, 'run_nestedness_nodfs%s' % filt_raref, all_sh_pbs,
        '%s.nstd.ndf%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        print("# NESTEDNESS NODFS")
        print_message('', 'sh', main_sh, jobs)
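
The loop above fills a Python template (`nestedness_nodfs.py` from the package resources) by plain string substitution; as a hypothetical illustration of why the quotes around `'<NODFS>'` and `'<COLLAPSED>'` are part of the replaced token (so that a Python literal is injected):

template_line = "nodfs = '<NODFS>'\n"                       # hypothetical template content
nodfs = ['/path/a_nodfs.tsv', '/path/b_nodfs.tsv']          # hypothetical NODF file paths
edited = template_line.replace("'<NODFS>'", str(nodfs))
print(edited)  # nodfs = ['/path/a_nodfs.tsv', '/path/b_nodfs.tsv']
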
 def set_tree_paths(self, config):
     for dataset, data in self.datasets.items():
         if dataset in Datasets.filt_raw:
             continue
         if data.phylo:
             odir = get_analysis_folder(config.i_datasets_folder,
                                        'phylo/%s' % dataset)
             tree_nwk = '%s/tree_%s.nwk' % (odir, dataset)
             tree_qza = '%s.qza' % splitext(tree_nwk)[0]
             if data.phylo[0] == 'amplicon':
                 intree_qza = '%s_inTree.qza' % splitext(tree_nwk)[0]
                 data.tree = (intree_qza, tree_qza, tree_nwk)
             else:
                 data.tree = ('', tree_qza, tree_nwk)
    def get_precomputed_taxonomy(self, config, method='sklearn'):
        for dataset_, data in self.datasets.items():
            dataset = self._get_filt_raw(dataset_)
            analysis_folder = get_analysis_folder(config.i_datasets_folder,
                                                  'taxonomy/%s' % dataset)
            tax_qza = '%s/tax_%s_%s.qza' % (analysis_folder, dataset, method)
            tax_tsv = '%s.tsv' % splitext(tax_qza)[0]
            if isfile(tax_tsv) and isfile(tax_qza):
                data.tax = ['', tax_qza, tax_tsv]

            tax_qza = '%s/tax_%s.qza' % (analysis_folder, dataset)
            tax_tsv = '%s.tsv' % splitext(tax_qza)[0]
            if isfile(tax_tsv) and isfile(tax_qza):
                data.tax = ['', tax_qza, tax_tsv]
def get_dm_meta(dat, dm, meta, raref, metric, i_datasets_folder, skip):
    dm_rgx = '%s%s*/*%s_DM.qza' % (dirname(dm), raref, metric)
    dm_rgx_glob = glob.glob(dm_rgx)
    if len(dm_rgx_glob) >= 1:
        dm = sorted(dm_rgx_glob)[0]
    else:
        skip += 1
    meta_dir = get_analysis_folder(i_datasets_folder, 'rarefy/%s' % dat)
    meta_rgx = '%s/meta_%s%s*tsv' % (meta_dir, dat, raref)
    meta_rgx_glob = glob.glob(meta_rgx)
    if len(meta_rgx_glob) >= 1:
        meta = sorted(meta_rgx_glob)[0]
    else:
        skip += 1
    return dm, meta
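
A usage sketch for `get_dm_meta` above (all paths hypothetical). Note that `skip` is an int passed by value and is not returned, so as written the `skip += 1` increments are not visible to the caller:

skip = 0
dm, meta = get_dm_meta(
    dat='dat1',
    dm='/projects/study/qiime/beta/dat1/dat1_braycurtis_DM.qza',  # hypothetical
    meta='/projects/study/dat1/meta_dat1.tsv',                    # hypothetical
    raref='_raref10000',
    metric='braycurtis',
    i_datasets_folder='/projects/study',
    skip=skip)
# dm and meta are replaced by the first matching rarefied files, if any were found.
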
 def set_taxonomy_paths(self, config, method):
     for dataset_, data in self.datasets.items():
         dataset = self._get_filt_raw(dataset_)
         odir = get_analysis_folder(config.i_datasets_folder,
                                    'taxonomy/%s' % dataset)
         if data.phylo and data.phylo[0] == 'amplicon':
             tax_tsv = '%s/tax_%s_%s.tsv' % (odir, dataset, method)
             meth = method
         else:
             tax_tsv = '%s/tax_%s.tsv' % (odir, dataset)
             if data.phylo and data.phylo[0] == 'wol':
                 meth = 'wol'
             else:
                 meth = 'feat'
         tax_qza = '%s.qza' % splitext(tax_tsv)[0]
         data.tax = [meth, tax_qza, tax_tsv]
 def set_rarefaction_paths(self, config):
     for dataset, data in self.datasets.items():
         if dataset in Datasets.filt_raw:
             data.raref_depths = self.datasets[
                 Datasets.filt_raw[dataset]].raref_depths
         if not data.raref_depths:
             continue
         odir = get_analysis_folder(config.i_datasets_folder,
                                    'rarefy/%s' % dataset)
         for depth_ in data.raref_depths[1]:
             depth = '_raref%s' % get_digit_depth(depth_,
                                                  data.data[0].sum())
             data.tsv.append('%s/tab_%s%s.tsv' % (odir, dataset, depth))
             data.qza.append('%s/tab_%s%s.qza' % (odir, dataset, depth))
             data.meta.append('%s/meta_%s%s.tsv' % (odir, dataset, depth))
             data.rarefs.append(depth)
def get_precomputed_trees(
        i_datasets_folder: str, datasets: dict, datasets_filt_map: dict,
        datasets_phylo: dict, trees: dict) -> None:
    """
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param trees: to be update with tree to use for a dataset phylogenetic analyses.
    """
    for dat in datasets:
        if dat in datasets_filt_map:
            dat_tree = datasets_filt_map[dat]
        else:
            dat_tree = dat
        analysis_folder = get_analysis_folder(i_datasets_folder, 'phylo/%s' % dat_tree)
        tree_qza = '%s/tree_%s.qza' % (analysis_folder, dat_tree)
        if isfile(tree_qza):
            trees[dat] = ('', tree_qza)
            datasets_phylo[dat] = ('precpu', 0)
Example n. 20
def get_tax_fp(i_datasets_folder: str, omic: str, input_to_filtered: dict) -> str:

    tax_dir = get_analysis_folder(i_datasets_folder, 'taxonomy')

    omic_taxs = [x for x, y in input_to_filtered.items() if y == omic]
    if len(omic_taxs):
        omic_tax_ = omic_taxs[0]
        if '__raref' in omic_tax_:
            omic_tax = '__raref'.join(omic_tax_.split('__raref')[:-1])
        else:
            omic_tax = omic_tax_
    else:
        print('\nNo taxonomy file for "%s"' % omic)
        return ''

    omic_tax_fps = glob.glob('%s/%s/tax_%s*.tsv' % (tax_dir, omic_tax, omic_tax))
    if len(omic_tax_fps):
        omic_tax_fp = omic_tax_fps[0]
    else:
        omic_tax_fp = ''
    return omic_tax_fp
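
A usage sketch for `get_tax_fp` above (names hypothetical): the function looks up the input dataset whose filtered name matches `omic`, strips a trailing `__raref` suffix, and globs the corresponding taxonomy TSV:

input_to_filtered = {'dat1__raref10000': 'dat1_flt'}   # hypothetical mapping
tax_fp = get_tax_fp('/projects/study', 'dat1_flt', input_to_filtered)
# looks under '<taxonomy dir>/dat1/' for 'tax_dat1*.tsv' and returns the first
# hit, or '' if no taxonomy file is found.
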
Example n. 21
 def get_datasets_paths(self):
     datasets_paths = self.mmvecs.copy()
     # print("datasets_paths")
     # print(datasets_paths.iloc[:, :5])
     datasets_paths = datasets_paths.drop(columns=['pair', 'omic'])
     datasets_paths = datasets_paths.loc[~datasets_paths.astype(str).
                                         duplicated()]
     # print("datasets_paths")
     # print(datasets_paths.iloc[:, :5])
     # print("datasets_paths[['dataset', 'filter', 'subset']].values")
     # print(datasets_paths[['dataset', 'filter', 'subset']].values)
     paths = []
     for r, row in datasets_paths.iterrows():
         dataset = row['dataset']
         filter = row['filter']
         subset = row['subset']
         odir = get_analysis_folder(
             self.config.i_datasets_folder,
             'mmvec/datasets/%s/%s' % (dataset, subset))
         rad = '%s_%s' % (dataset, filter)
         tsv = '%s/tab_%s.tsv' % (odir, rad)
         qza = '%s.qza' % splitext(tsv)[0]
         meta = '%s/meta_%s.tsv' % (odir, rad)
         paths.append([dataset, filter, subset, tsv, qza, meta])
     # print("paths")
     # print(paths)
     # datasets_paths = pd.concat([
     #     datasets_paths,
     #     pd.DataFrame(paths, columns=['tsv', 'qza', 'meta'])
     # ], axis=1)
     datasets_paths = datasets_paths.merge(
         pd.DataFrame(
             paths,
             columns=['dataset', 'filter', 'subset', 'tsv', 'qza', 'meta']),
         on=['dataset', 'filter', 'subset'],
         how='left')
     # print("datasets_paths")
     # print(datasets_paths.iloc[:, :5])
     return datasets_paths
 def create_songbird_feature_metadata(self):
     if self.q2s_pd.shape[0]:
         q2_pd = self.q2s_pd.loc[(self.q2s_pd.pair == 'no_pair')
                                 & (self.q2s_pd.Pseudo_Q_squared > 0)]
         for dat, dataset_pd in q2_pd.groupby('dataset'):
             dataset_sbs = []
             for r, row in dataset_pd.iterrows():
                 pr = 'pair=%s' % row['pair']
                 fr = 'filter=%s' % row['filter']
                 sb = 'subset=%s' % row['subset']
                 ml = 'model=%s' % row['model']
                 st = 'sb_filt=%s' % row['songbird_filter']
                 ps = 'params=%s' % row['parameters']
                 be = 'baseline=%s' % row['baseline']
                 q2 = '[Q2=%s]' % row['Pseudo_Q_squared']
                 diffs = row['differentials']
                 sb_pd = pd.read_csv(diffs, index_col=0, sep='\t')
                 sb_pd.columns = [
                     '%s %s: %s' %
                     ('__'.join([dat, pr, fr, sb, ml, st, ps, be]), q2, x)
                     for x in sb_pd.columns
                 ]
                 dataset_sbs.append(sb_pd)
             if len(dataset_sbs):
                 dataset_sbs_pd = pd.concat(dataset_sbs, axis=1, sort=False)
                 odir = get_analysis_folder(self.config.i_datasets_folder,
                                            'songbird/%s' % dat)
                 fpo_tsv = '%s/differentials_%s.tsv' % (odir, dat)
                 fpo_qza = '%s/differentials_%s.qza' % (odir, dat)
                 dataset_sbs_pd = dataset_sbs_pd.reset_index()
                 dataset_sbs_pd = dataset_sbs_pd.rename(
                     columns={
                         dataset_sbs_pd.columns.tolist()[0]: 'Feature ID'
                     })
                 dataset_sbs_pd.to_csv(fpo_tsv, index=True, sep='\t')
                 run_import(fpo_tsv, fpo_qza, 'FeatureData[Differential]')
def check_common_datasets(i_datasets_folder: str, mmvec_pairs: dict,
                          mmvec_filtering: dict, filt_datasets_pass: dict,
                          input_to_filtered: dict,
                          mmvec_subsets: dict) -> (dict, list):
    """
    :param i_datasets_folder:
    :param mmvec_pairs:
    :param force: Force the re-writing of scripts for all commands.
    :return:
    """
    common_datasets_pass = {}
    for pair, pair_datasets in mmvec_pairs.items():
        pair_filtering = mmvec_filtering[pair]
        common_datasets_pass[pair] = []
        data_dir_ = get_analysis_folder(i_datasets_folder,
                                        'mmvec/common/data/%s' % pair)
        meta_dir_ = get_analysis_folder(i_datasets_folder,
                                        'mmvec/common/metadata/%s' % pair)
        (omic1_, bool1), (omic2_, bool2) = pair_datasets
        if omic1_ not in input_to_filtered or omic2_ not in input_to_filtered:
            continue
        omic1 = input_to_filtered[omic1_]
        omic2 = input_to_filtered[omic2_]
        if (omic1, bool1) not in filt_datasets_pass or (
                omic2, bool2) not in filt_datasets_pass:
            continue
        for case_var, case_vals_list in mmvec_subsets.items():
            for case_vals in case_vals_list:
                case = get_case(case_vals, case_var)
                data_dir = data_dir_ + '/' + case
                meta_dir = meta_dir_ + '/' + case
                for preval_abund in sorted(pair_filtering):
                    preval_filt1, abund_filter1 = pair_filtering[preval_abund][
                        (omic1_, bool1)]
                    preval_filt2, abund_filter2 = pair_filtering[preval_abund][
                        (omic2_, bool2)]
                    if not filt_datasets_pass[(omic1, bool1)][(case,
                                                               preval_abund)]:
                        continue
                    if not filt_datasets_pass[(omic2, bool2)][(case,
                                                               preval_abund)]:
                        continue
                    filt1 = '_'.join([preval_filt1, abund_filter1])
                    filt2 = '_'.join([preval_filt2, abund_filter2])
                    tsv1, qza1, meta1, meta_pd1, sams1 = filt_datasets_pass[(
                        omic1, bool1)][(case, preval_abund)]
                    tsv2, qza2, meta2, meta_pd2, sams2 = filt_datasets_pass[(
                        omic2, bool2)][(case, preval_abund)]
                    common_sams = sorted(set(sams1) & set(sams2))
                    if len(common_sams) < 10:
                        print(
                            'Not enough samples: %s (%s) vs %s (%s) -> skipping'
                            % (omic1, filt1, omic2, filt2))
                        continue
                    meta_fp = '%s/meta_%s_%s_%s__%s_%s_%s__%s_%ss.tsv' % (
                        meta_dir, omic1, preval_filt1, abund_filter1, omic2,
                        preval_filt2, abund_filter2, pair, len(common_sams))
                    new_tsv1 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic1, preval_filt1, abund_filter1, pair,
                        len(common_sams))
                    new_qza1 = '%s.qza' % splitext(new_tsv1)[0]
                    new_tsv2 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic2, preval_filt2, abund_filter2, pair,
                        len(common_sams))
                    new_qza2 = '%s.qza' % splitext(new_tsv2)[0]
                    if isfile(meta_fp) and isfile(new_qza1) and isfile(
                            new_qza2):
                        common_datasets_pass[pair].append(meta_fp)
Example n. 24
def run_qemistree(i_datasets_folder: str, datasets: dict, prjct_nm: str,
                  i_qemistree: str, taxonomies: dict, force: bool,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> None:
    """
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets_read: dataset -> [tsv table, meta table]
    :param prjct_nm: Short nick name for your project.
    :param i_qemistree: path to qemistree folder (feature-data and tree).
    :param taxonomies: dataset -> [method, assignment qza]
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (defalt: 775).
    """

    job_folder = get_job_folder(i_datasets_folder, 'qemistree')
    job_folder2 = get_job_folder(i_datasets_folder, 'qemistree/chunks')

    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_qemistree_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds in datasets.items():
            feature_data = '%s/feature-data_%s.qza' % (i_qemistree, dat)
            qemistree = '%s/qemistree_%s.qza' % (i_qemistree, dat)
            if not isfile(feature_data) or not isfile(qemistree):
                continue
            out_sh = '%s/run_qemistree_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            odir = get_analysis_folder(i_datasets_folder, 'qemistree/%s' % dat)
            classyfire_qza = '%s/%s-classyfire.qza' % (odir, dat)
            classyfire_tsv = '%s.tsv' % splitext(classyfire_qza)[0]
            with open(out_sh, 'w') as cur_sh:
                if force or not isfile(classyfire_tsv):
                    write_qemistree(feature_data, classyfire_qza,
                                    classyfire_tsv, qemistree,
                                    cur_sh)
                    written += 1

            if isfile(classyfire_tsv):
                odir = get_analysis_folder(i_datasets_folder, 'taxonomy/%s' % dat)
                out_rad = '%s/tax_%s' % (odir, dat)
                tax_qza = '%s.qza' % out_rad
                tax_tsv = '%s.tsv' % out_rad
                classyfire_pd = pd.read_csv(classyfire_tsv, header=0, sep='\t')
                with open(tax_tsv, 'w') as o:
                    cols = ['id', 'kingdom', 'superclass', 'class', 'subclass', 'direct_parent']
                    o.write('Feature ID\tTaxon\n')
                    for row in classyfire_pd[cols].values:
                        o.write('%s\t%s\n' % (row[0], '; '.join(row[1:])))
                run_export(tax_tsv, tax_qza, 'FeatureData[Taxonomy]')
                taxonomies[dat] = ['direct_parent', tax_qza]
                written += 1
            else:
                print('[Warning] Maybe run qemistree first and then re-run this '
                      'pipeline to have the classyfire taxonomy included in the barplots!')

            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs, '%s.qmstr.%s%s' % (prjct_nm, dat, filt_raref), qiime_env,
                     run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                     run_params["mem_num"], run_params["mem_dim"],
                     chmod, written, 'single', o, noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'qemistree',
                      prjct_nm, run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                      run_params["mem_num"], run_params["mem_dim"],
                      qiime_env, chmod, noloc, slurm, jobs, chunkit, None)

    if written:
        print_message('# Make qemistree classyfire classifications', 'sh', run_pbs, jobs)
Example n. 25
def run_sourcetracking(i_datasets_folder: str, datasets: dict,
                       p_sourcetracking_config: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, run_params: dict,
                       filt_raref: str, split: bool, jobs: bool,
                       chunkit: int) -> None:

    job_folder2 = get_job_folder(i_datasets_folder, 'sourcetracking/chunks')
    sourcetracking_dicts = get_sourcetracking_config(p_sourcetracking_config)
    sourcetracking_sourcesink = sourcetracking_dicts[0]
    sourcetracking_filtering = sourcetracking_dicts[1]
    sourcetracking_params = sourcetracking_dicts[2]
    main_cases_dict = sourcetracking_dicts[3]

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        if dat in sourcetracking_filtering:
            filters = sourcetracking_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'sourcetracking')
            cur_raref = datasets_rarefs[dat][idx]
            out_import_sh = '%s/run_import_sourcetracking_%s_%s%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            imports = set()
            odir = get_analysis_folder(i_datasets_folder,
                                       'sourcetracking/%s' % dat)
            for method in sourcetracking_params['method']:
                out_sh = '%s/run_sourcetracking_%s_%s%s%s_%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref, method)
                for case_var, case_vals_list in cases_dict.items():
                    for filt, (fp, fa) in filters.items():
                        cur_sh = '%s/run_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_sh = cur_sh.replace(' ', '-')
                        cur_import_sh = '%s/run_import_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_import_sh = cur_import_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                        all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                     []).append(cur_import_sh)
                        run_single_sourcetracking(
                            odir, tsv, meta_pd, case_var,
                            sourcetracking_params, method, imports,
                            sourcetracking_sourcesink, case_vals_list, cur_sh,
                            cur_import_sh, force, filt, cur_raref, fp, fa,
                            run_params["n_nodes"], run_params["n_procs"])

    job_folder = get_job_folder(i_datasets_folder, 'sourcetracking')
    main_sh = write_main_sh(
        job_folder,
        '3_run_import_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mpt.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# import sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# import sourcetracking')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(
        job_folder, '3_run_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.srctrk%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit,
        '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# sourcetracking')
        print_message('', 'sh', main_sh, jobs)
    def songbird(self) -> None:
        """Main script for the creation of songbird jobs.
        It iterates over the rows of the table created
        upfront and over each combination of parameters
        and collect the output info for potential reuse
        in figure generation and post-analysis.

        Parameters
        ----------
        config : Class instance of AnalysesConfig
            Contains all the routine analyses config info.
        project
            Darasets.
        """
        cmds = {}
        mess = set()
        songbird = []
        dat_cmds, dat_bcmds = {}, {}
        params_pd = self.get_params_combinations()
        for r, row in self.songbirds.iterrows():
            qza, pair, meta_fp = row['qza'], row['pair'], row['meta']
            dat, filt, subset = row['dataset'], row['filter'], row['subset']
            if dat not in self.songbird_models:
                continue
            dat_pair, pair_dir = self.get_dat_pair_dir(dat, pair)
            meta_pd = read_meta_pd(meta_fp)
            models = self.check_metadata_models(meta_fp, meta_pd,
                                                self.songbird_models[dat])
            row_params_pd = params_pd.copy()
            self.process_params_combinations(dat, meta_pd, row_params_pd, mess)
            for p, params in row_params_pd.iterrows():
                params_dir = self.get_params_dir(params)
                baselines, model_baselines = {}, {'1': '1'}
                for modx, model in enumerate(models.keys()):
                    formula, meta_vars, drop = models[model]
                    datdir, odir, new_qza, new_meta = self.get_main_dirs(
                        pair_dir, filt, subset, params_dir, model, self.config)
                    self.write_new_meta(meta_pd, new_meta, meta_vars, drop,
                                        params)
                    if dat in self.models_baselines and model in \
                            self.models_baselines[dat]:
                        model_baselines = self.models_baselines[dat][model]
                    for mdx, model_baseline in enumerate(model_baselines):
                        bformula = model_baselines[model_baseline]
                        bodir = get_analysis_folder(
                            self.config.i_datasets_folder,
                            'songbird/%s/b-%s' % (datdir, model_baseline))
                        out_paths = self.get_out_paths(odir, bodir,
                                                       model_baseline,
                                                       baselines)
                        # convergence = self.check_stats_convergence(out_paths)
                        cmd, bcmd = songbird_cmd(qza, new_qza, new_meta,
                                                 params, formula, bformula,
                                                 out_paths)
                        songbird.append([
                            dat, filt,
                            '%s_%s' % (params_dir.replace('/', '__'), model),
                            subset, out_paths['diff'], model_baseline,
                            out_paths['html'], pair
                        ])
                        if cmd:
                            dat_cmds.setdefault(dat, []).append(cmd)
                        if bcmd:
                            dat_bcmds.setdefault(dat, []).append(bcmd)

        for dat in dat_bcmds:
            # first come the scripts generating (reused) baselines models
            if dat_bcmds[dat]:
                cmds.setdefault(dat, []).extend(dat_bcmds[dat])
        for dat in dat_cmds:
            # and then the scripts generating the actual models
            if dat_cmds[dat]:
                cmds.setdefault(dat, []).extend(dat_cmds[dat])
        if songbird:
            self.get_songbird_pd(songbird)

        self.show_models_issues()
        self.register_command('songbird', cmds)
        self.summarize_songbirds()
        self.create_songbird_feature_metadata()
Example n. 27
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str, jobs: bool,
                   chunkit: int) -> (dict, list, dict):

    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')

    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print(
            'Must provide the path to the Nestedness soft (containing bin/Autocorrelation.jar)'
        )
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and isfile(
            nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print(
                'Must provide the path to the Nestedness soft (containing bin/Autocorrelation.jar)'
            )
            return {}

    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)

    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                       dat, filt_raref)

        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)

        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in \
                    metrics_groups_metas_qzas_dms_trees.items():
                for group, metas_qzas_mat_qzas_trees in \
                        groups_metas_qzas_dms_trees.items():

                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            # print("case", case)
                            all_sh_pbs.setdefault((dat, out_sh),
                                                  []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd, nodfs,
                                nulls, modes, cur_sh, qza, case, case_var,
                                case_vals, binary, params, force)
                            nodfs_fps.setdefault(stats_tax_dat,
                                                 []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                break
            nestedness_res[dat].append(nestedness_raref)
Example n. 28
def run_taxonomy(method: str, i_datasets_folder: str, datasets: dict,
                 datasets_read: dict, datasets_phylo: dict,
                 datasets_features: dict, datasets_filt_map: dict,
                 i_classifier: str, taxonomies: dict, force: bool,
                 prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                 slurm: bool, run_params: dict, filt_raref: str, jobs: bool,
                 chunkit: int) -> None:
    """

    Parameters
    ----------
    method
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders.
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path].
    datasets_read : dict
        Mapping dataset name -> [data table, metadata table]
    datasets_phylo : dict
        To be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    datasets_features : dict
        Mapping dataset name -> list of feature names in
                                the dataset tsv / biom file.
    datasets_filt_map : dict
        Mapping filtered dataset name -> raw dataset name
    i_classifier : str
        Path to the taxonomic classifier.
    taxonomies : dict
        Mapping dataset name -> [method, tax_qza, tax_tsv]
    force : bool
        Force the re-writing of scripts for all commands.
    prjct_nm : str
        Short nick name for your project.
    qiime_env : str
        Name of your qiime2 conda environment (e.g. qiime2-2019.10).
    chmod : str
        Whether to change permission of output files (default: 744).
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------

    """
    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')
    amplicon_datasets = [
        dat for dat, (tree, correction) in datasets_phylo.items()
        if tree == 'amplicon'
    ]
    wol_datasets = [
        dat for dat, (tree, correction) in datasets_phylo.items()
        if tree == 'wol'
    ]

    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets_read.items():
            out_sh = '%s/run_taxonomy_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                     dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            if dat in datasets_filt_map:
                taxonomies[dat] = taxonomies[datasets_filt_map[dat]]
                continue
            written = 0
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    if idx:
                        continue
                    tsv, meta = datasets[dat][idx]
                    if not isinstance(tsv_meta_pds[0], pd.DataFrame) and \
                            tsv_meta_pds[0] == 'raref':
                        if not isfile(tsv):
                            print('Must have run rarefaction to use it '
                                  'further...\nExiting')
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = tsv_meta_pds

                    odir = get_analysis_folder(i_datasets_folder,
                                               'taxonomy/%s' % dat)
                    out_rad = '%s/tax_%s' % (odir, dat)

                    if dat in amplicon_datasets:
                        out_qza = '%s_%s.qza' % (out_rad, method)
                        out_tsv = '%s.tsv' % splitext(out_qza)[0]
                        taxonomies[dat] = [method, out_qza, out_tsv]
                        if not i_classifier:
                            print('No classifier passed for 16S '
                                  'data\nExiting...')
                            continue
                        cmd = run_taxonomy_amplicon(dat, i_datasets_folder,
                                                    force, tsv_pd, out_qza,
                                                    out_tsv, i_classifier)
                    else:
                        out_qza = '%s.qza' % out_rad
                        out_tsv = '%s.tsv' % out_rad
                        if dat in wol_datasets:
                            cur_datasets_features = datasets_features[dat]
                            taxonomies[dat] = ['wol', out_qza, out_tsv]
                            cmd = run_taxonomy_wol(force, tsv_pd, out_qza,
                                                   out_tsv,
                                                   cur_datasets_features)
                        else:
                            if len(
                                [x for x in tsv_pd.index
                                 if str(x).isdigit()]) == tsv_pd.shape[0]:
                                continue
                            taxonomies[dat] = ['feat', out_qza, out_tsv]
                            cmd = run_taxonomy_others(force, tsv_pd, out_qza,
                                                      out_tsv)
                    if cmd:
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n\n' % cmd)
                        main_written += 1
                        written += 1
            if written:
                to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(out_sh, out_pbs,
                             '%s.tx.sklrn.%s%s' % (prjct_nm, dat, filt_raref),
                             qiime_env, run_params["time"],
                             run_params["n_nodes"], run_params["n_procs"],
                             run_params["mem_num"], run_params["mem_dim"],
                             chmod, written, 'single', o, noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Classify features using classify-sklearn', 'sh',
                      run_pbs, jobs)
Example n. 29
                    formula, meta_vars, drop = models[model]
                    # print("meta_pd.shape")
                    # print(meta_pd.shape)
                    # print("meta_pd.columns")
                    # print(meta_pd.columns)
                    # print("meta_vars")
                    # print(meta_vars)
                    # for meta_v in meta_vars:
                    #     print("meta_v")
                    #     print(meta_v)
                    #     print("meta_pd[meta_v].value_counts()")
                    #     print(meta_pd[meta_v].value_counts())

                    datdir = '%s/%s/%s/%s/%s' % (dat_pair_path, filt, case,
                                                 params, model)
                    odir = get_analysis_folder(i_datasets_folder,
                                               'songbird/%s' % datdir)
                    new_qza = '%s/tab.qza' % odir
                    new_meta = '%s/metadata.tsv' % odir

                    train_column, train_samples = get_metadata_train_test(
                        meta, meta_pd, list(meta_vars), new_meta, train, drop)
                    if not train_column:
                        new_meta_invalid = '%s/metadata_invalid' % odir
                        with open(new_meta_invalid, 'w') as invalid:
                            pass
                        continue

                    baselines = {}
                    metadatas = {}
                    model_baselines = {'1': '"1"'}
                    if dat in models_baselines and model in models_baselines[
Example n. 30
def run_barplot(i_datasets_folder: str, datasets: dict, taxonomies: dict,
                force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                noloc: bool, slurm: bool, run_params: dict, filt_raref: str,
                jobs: bool, chunkit: int) -> None:
    """Visualize taxonomy with an interactive bar plot.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    taxonomies : dict
        Mapping dataset name -> [classification_method, tax_qza, tax_tsv]
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Short nick name for your project
    qiime_env : str
        Name of a qiime2 conda environment
    chmod : str
        Whether to change permission of output files (default: 744)
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------

    """
    job_folder = get_job_folder(i_datasets_folder, 'barplot')
    job_folder2 = get_job_folder(i_datasets_folder, 'barplot/chunks')

    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_barplot_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            out_sh = '%s/run_barplot_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                    filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    if dat not in taxonomies:
                        continue
                    method, tax_qza, tax_tsv = taxonomies[dat]
                    if not method:
                        method = 'taxofromfile'
                    qza = '%s.qza' % splitext(tsv)[0]
                    odir = get_analysis_folder(i_datasets_folder,
                                               'barplot/%s' % dat)
                    out_qzv = '%s/bar_%s_%s.qzv' % (odir, dat, method)
                    if force or not isfile(out_qzv):
                        write_barplots(out_qzv, qza, meta, tax_qza, cur_sh)
                        written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.brplt.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'barplot', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if written:
        print_message('# Make sample compositions barplots', 'sh', run_pbs,
                      jobs)
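
`write_barplots` above is not reproduced on this page; the visualization it writes corresponds to the standard QIIME 2 barplot command, roughly as sketched below (a guess at the command such a helper appends to the shell script; the real wrapper may add more options):

def barplot_cmd(out_qzv: str, qza: str, meta: str, tax_qza: str) -> str:
    # Sketch of the 'qiime taxa barplot' call a write_barplots()-like helper emits.
    cmd = 'qiime taxa barplot \\\n'
    cmd += '  --i-table %s \\\n' % qza
    cmd += '  --i-taxonomy %s \\\n' % tax_qza
    cmd += '  --m-metadata-file %s \\\n' % meta
    cmd += '  --o-visualization %s\n' % out_qzv
    return cmd
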