Example #1
    def test_read_yaml_file(self):
        expected = {'a': {'b': ['x', 'y']}}
        observed = read_yaml_file(self.dummy_yaml_fp)
        self.assertEqual(expected, observed)

        expected = {}
        observed = read_yaml_file(self.no_dummy_yaml_fp)
        self.assertEqual(expected, observed)
        observed = read_yaml_file(None)
        self.assertEqual(expected, observed)
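The test above pins down the contract of read_yaml_file: it returns the parsed YAML mapping for an existing file and an empty dict for None or a missing path. A minimal sketch consistent with that contract, assuming PyYAML's yaml.safe_load (not necessarily the package's actual implementation):

import yaml
from os.path import isfile


def read_yaml_file(yaml_fp: str) -> dict:
    # Sketch: parse the file if it exists, otherwise fall back to {}
    yaml_dct = {}
    if yaml_fp and isfile(yaml_fp):
        with open(yaml_fp) as handle:
            yaml_dct = yaml.safe_load(handle) or {}
    return yaml_dct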
Example #2
 def __init__(self, config, project) -> None:
     self.config = config
     self.project = project
     self.cmds = {}
     self.mmvec_songbird_pd = pd.DataFrame()
     self.taxo_pds = {}
     self.metas = {}
     self.mmvec_res = {}
     self.mmvec_issues = set()
     self.xmmvecs = read_yaml_file(config.xmmvec)
     self.highlights = read_yaml_file(config.mmvec_highlights)
Example #3
    def setUp(self) -> None:
        # self.run_params holds the defaults, e.g. for the import step:
        self.run_params_fp = '%s/run_params.yml' % RESOURCES
        self.run_params = read_yaml_file(self.run_params_fp)
        # import:
        #   time: "4"
        #   n_nodes: "1"
        #   n_procs: "1"
        #   mem_num: "10"
        #   mem_dim: "gb"
        #   env: "qiime2-2020.2"
        self.conda_envs = {'qiime2-2020.2', 'a_conda_env'}

        self.update_time_param_fp = '%s/update_time_param_fp.yml' % TEST
        with open(self.update_time_param_fp, 'w') as o:
            o.write('import:\n  time: "10"\n')
        self.update_env_param_fp = '%s/update_env_params_fp.yml' % TEST
        with open(self.update_env_param_fp, 'w') as o:
            o.write('import:\n  time: "10"\n  env: "a_conda_env"\n')
        self.update_not_a_env_fp = '%s/update_not_a_env_fp.yml' % TEST
        with open(self.update_not_a_env_fp, 'w') as o:
            o.write('import:\n  env: "not_a_conda_env"\n')
        self.update_not_mem_dim_fp = '%s/update_not_mem_dim_fp.yml' % TEST
        with open(self.update_not_mem_dim_fp, 'w') as o:
            o.write('import:\n  mem_dim: "tb"\n')
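These fixture files are presumably consumed by a routine that overlays the user-supplied values onto the defaults shown above while rejecting unknown conda environments and memory units. A hypothetical sketch of such a merge (the name update_run_params and the exact validation rules are assumptions, not the package's API):

def update_run_params(run_params: dict, update_fp: str,
                      conda_envs: set) -> dict:
    # Hypothetical helper: overlay user values onto the defaults,
    # skipping environments or memory units that are not allowed.
    updates = read_yaml_file(update_fp)
    for step, params in updates.items():
        for param, value in params.items():
            if param == 'env' and value not in conda_envs:
                continue  # e.g. "not_a_conda_env" is dropped
            if param == 'mem_dim' and value not in {'kb', 'mb', 'gb'}:
                continue  # e.g. "tb" is dropped
            run_params.setdefault(step, {})[param] = value
    return run_params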
Example #4
 def get_train_test_dict(self):
     self.train_test_dict = read_yaml_file(self.train_test)
     if 'train' not in self.train_test_dict:
         self.train_test_dict['train'] = 0.7
     elif float(self.train_test_dict['train']) < 0:
         self.train_test_dict['train'] = 0.7
     elif float(self.train_test_dict['train']) > 1:
         self.train_test_dict['train'] = 0.7
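For illustration, the method expects a small YAML mapping with a 'train' key and falls back to a 0.7 train fraction when the key is absent or out of range (file name and value below are invented):

with open('train_test.yml', 'w') as o:
    o.write('train: "1.4"\n')

train_test_dict = read_yaml_file('train_test.yml')
# float("1.4") > 1, so get_train_test_dict() would reset it to the 0.7 default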
Example #5
 def __init__(self, config, project) -> None:
     self.config = config
     self.project = project
     self.cmds = {}
     self.alpha_metrics = get_metrics('alpha_metrics', config.alphas)
     self.beta_metrics = get_metrics('beta_metrics', config.betas)
     self.alpha_subsets = read_yaml_file(self.config.alpha_subsets)
     self.alphas = {}
     self.betas = {}
Example #6
def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, split: bool, run_params: dict,
                       filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):

    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)

    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                  filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in (
                    metrics_groups_metas_qzas_dms_trees.items()):
                for group, metas_qzas_mat_qzas_trees in (
                        groups_metas_qzas_dms_trees.items()):
                    for (meta, qza, mat_qza,
                         tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(case_vals,
                                                case_var).replace(' ', '_')
                                cur_sh = '%s/run_decay_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_raref, metric, group,
                                    case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh, mat_qza,
                                    case, modes, force, run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
Example #7
 def collapse(self):
     collapse_taxo = read_yaml_file(self.config.collapse_taxo)
     collapse_taxo = dict(
         (Datasets.raw_filt[dat], x) for dat, x in collapse_taxo.items()
         if dat in Datasets.raw_filt and dat in self.project.datasets)
     project_coll = {}
     for dat, levels in collapse_taxo.items():
         data = self.project.datasets[dat]
         split_levels, empties = get_split_levels(levels, data.tax_split[0])
         data.collapsed = split_levels
         for tax, level in split_levels.items():
             dat_tax = '%s_tx-%s' % (dat, tax)
             data_tax = Data(dat_tax, self.config.i_datasets_folder)
             for idx, tsv in enumerate(data.tsv):
                 tax_tsv = '%s_tx-%s.tsv' % (splitext(tsv)[0], tax)
                 tax_qza = '%s.qza' % splitext(tax_tsv)[0]
                 tax_meta = '%s_tx-%s.tsv' % (splitext(
                     data.meta[idx])[0], tax)
                 coll_dat = splitext(tax_tsv)[0].split('/tab_')[-1]
                 if isfile(tax_tsv) and isfile(tax_meta):
                     coll_pd = pd.read_csv(tax_tsv,
                                           index_col=0,
                                           header=0,
                                           sep='\t')
                     if coll_pd.shape[0] < 5:
                         continue
                     cmd = fix_collapsed_data(empties[tax], coll_pd,
                                              tax_tsv, tax_qza, tax_meta)
                     if cmd:
                         self.cmds.setdefault(dat, []).append(cmd)
                     Datasets.coll_raw[coll_dat] = dat
                     Datasets.raw_coll.setdefault(dat, []).append(coll_dat)
                     if idx:
                         data_tax.tsv.append(tax_tsv)
                         data_tax.qza.append(tax_qza)
                         data_tax.meta.append(tax_meta)
                     data_tax.data.append(coll_pd)
                     data_tax.metadata.append(coll_pd)
                     data_tax.rarefs.append(data.rarefs[idx])
                 else:
                     cmd = write_collapse_taxo(data.qza[idx], data.tax[1],
                                               tax_qza, tax_tsv,
                                               data.meta[idx], tax_meta,
                                               level, empties[tax])
                     if cmd:
                         self.cmds.setdefault(dat, []).append(cmd)
             data_tax.phylo = ('', 0)
             project_coll[dat_tax] = data_tax
     self.project.datasets.update(project_coll)
     self.register_command('collapse')
Example #8
def get_collapse_taxo(p_collapse_taxo: str, datasets_filt: dict) -> dict:
    """
    Parameters
    ----------
    p_collapse_taxo : str
    datasets_filt : dict

    Returns
    -------
    collapse_taxo : dict
    """
    collapse_taxo = read_yaml_file(p_collapse_taxo)
    collapse_taxo.update(
        dict((datasets_filt[dat], x) for dat, x in collapse_taxo.items()
             if dat in datasets_filt))
    return collapse_taxo
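For illustration, the update above simply duplicates each configured entry under the corresponding filtered dataset name (toy values):

collapse_taxo = {'gut': [5, 6]}
datasets_filt = {'gut': 'gut_flt'}
collapse_taxo.update(
    dict((datasets_filt[dat], x) for dat, x in collapse_taxo.items()
         if dat in datasets_filt))
# collapse_taxo is now {'gut': [5, 6], 'gut_flt': [5, 6]}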
Example #9
 def filter_rare_samples(self):
     thresholds = read_yaml_file(self.config.filt_threshs)
     project_filt = {}
     for dat, data in self.project.datasets.items():
         if dat not in thresholds:
             continue
         names, thresh_sam, thresh_feat = get_thresholds(thresholds[dat])
         if no_filtering(dat, thresh_sam, thresh_feat):
             continue
         dat_filt = get_dat_filt(dat, names, thresh_sam, thresh_feat)
         Datasets.filt_raw[dat_filt] = dat
         Datasets.raw_filt[dat] = dat_filt
         # register the filtered dataset as an additional dataset
         data_filt = Data(dat_filt, self.config.i_datasets_folder)
         qza_exists = isfile(data_filt.qza[0])
         meta_exists = isfile(data_filt.meta[0])
         if not self.config.force and qza_exists and meta_exists:
             data_filt.read_data_pd()
             data_filt.read_meta_pd()
         else:
             data_filt_pd = filtering_thresholds(names, thresh_sam,
                                                 thresh_feat, data.data[0])
             if harsh_filtering(dat_filt, data_filt_pd):
                 continue
             # write filtered data
             data_filt_pd.to_csv(data_filt.tsv[0], index=True, sep='\t')
             data_filt.data.append(data_filt_pd)
             # import qza
             cmd = run_import(data_filt.tsv[0], data_filt.qza[0],
                              'FeatureTable[Frequency]')
             self.cmds.setdefault(dat_filt, []).append(cmd)
             # write filtered metadata
             meta_filt_pd = data.metadata[0].loc[
                 data.metadata[0].sample_name.isin(
                     data_filt_pd.columns.tolist())].copy()
             meta_filt_pd.to_csv(data_filt.meta[0], index=False, sep='\t')
             data_filt.metadata.append(meta_filt_pd)
         data_filt.phylo = data.phylo
         data_filt.features = get_gids(data.features, data_filt.data[0])
         project_filt[dat_filt] = data_filt
     self.project.datasets.update(project_filt)
     self.register_command('filter')
Example #10
def check_rarefy_need(i_datasets_folder: str, datasets_read: dict,
                      p_raref_depths: str) -> (dict, dict):
    """Check the distribution of reads per sample and its skewness to
    warn user for the need for rarefaction of the feature tables.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata sub-folders
    datasets_read : dict
    p_raref_depths : str
        Path to a rarefaction config yaml file.

    Returns
    -------

    """
    depths_yml = read_yaml_file(p_raref_depths)
    datasets_raref_depths = {}
    datasets_raref_evals = {}
    for dat, tsv_meta_pds in datasets_read.items():
        for (tsv_pd, meta_pd) in tsv_meta_pds:
            tsv_sam_sum = tsv_pd.sum()
            # not in the class code yet
            datasets_raref_evals[dat] = get_datasets_raref_evals(tsv_sam_sum)
            # if depths_yml:
            #     if dat in depths_yml:
            #     continue
            skip, depths = get_dat_depths(dat, i_datasets_folder, depths_yml,
                                          tsv_sam_sum)
            if skip:
                continue
            datasets_raref_depths[dat] = depths
            if depths[0]:
                # not in the class code yet
                datasets_raref_evals[dat].update([
                    int(x) if str(x).isdigit() else np.floor(min(tsv_sam_sum))
                    for x in depths[1]
                ])
    return datasets_raref_depths, datasets_raref_evals
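The rarefaction config read here presumably maps each dataset name to the depths to use; the schema below is an assumption inferred from how depths[1] is consumed above (dataset name and values are invented):

with open('raref_depths.yml', 'w') as o:
    o.write('datasetA:\n- "1000"\n- "2000"\n')

depths_yml = read_yaml_file('raref_depths.yml')
# depths_yml == {'datasetA': ['1000', '2000']}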
Example #11
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str, jobs: bool,
                   chunkit: int) -> (dict, list, dict):

    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')

    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print(
            'Must provide the path to the Nestedness soft (containing bin/Autocorrelation.jar)'
        )
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and isfile(
            nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print(
                'Must provide the path to the Nestedness soft (containing bin/Autocorrelation.jar)'
            )
            return {}

    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)

    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                       dat, filt_raref)

        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)

        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in (
                    metrics_groups_metas_qzas_dms_trees.items()):
                for group, metas_qzas_mat_qzas_trees in (
                        groups_metas_qzas_dms_trees.items()):

                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            # print("case", case)
                            all_sh_pbs.setdefault((dat, out_sh),
                                                  []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd, nodfs,
                                nulls, modes, cur_sh, qza, case, case_var,
                                case_vals, binary, params, force)
                            nodfs_fps.setdefault(stats_tax_dat,
                                                 []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                break
            nestedness_res[dat].append(nestedness_raref)
Example #12
def filter_rare_samples(i_datasets_folder: str, datasets: dict,
                        datasets_read: dict, datasets_features: dict,
                        datasets_rarefs: dict, datasets_filt: dict,
                        datasets_filt_map: dict, datasets_phylo: dict,
                        prjct_nm: str, qiime_env: str, p_filt_threshs: str,
                        chmod: str, noloc: bool, run_params: dict,
                        filt_raref: str, jobs: bool, slurm: bool,
                        chunkit: int) -> None:
    """
    Filter out rare features, keep samples with enough reads/features, and import to Qiime2.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_features: dataset -> list of feature names in the dataset tsv / biom file.
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param prjct_nm: Short nickname for your project.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param p_filt_threshs: Path to the yaml file with the per-dataset filtering thresholds.
    :param chmod: permissions to set on the output files (default: 775).
    """
    threshs_dats = read_yaml_file(p_filt_threshs)

    written = 0
    datasets_update = {}
    datasets_read_update = {}
    datasets_features_update = {}
    datasets_phylo_update = {}
    job_folder = get_job_folder(i_datasets_folder, 'import_filtered')
    out_sh = '%s/1_run_import_filtered_%s%s.sh' % (job_folder, prjct_nm,
                                                   filt_raref)
    if slurm:
        out_pbs = '%s.slm' % splitext(out_sh)[0]
    else:
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
    to_chunk = []
    with open(out_sh, 'w') as sh:
        for dat, tab_meta_pds_ in datasets_read.items():
            if dat not in threshs_dats:
                continue
            names, thresh_sam, thresh_feat = get_thresholds(threshs_dats[dat])
            if no_filtering(dat, thresh_sam, thresh_feat):
                continue
            dat_filt = get_dat_filt(dat, names, thresh_sam, thresh_feat)

            datasets_filt[dat] = dat_filt
            datasets_filt_map[dat_filt] = dat
            datasets_rarefs[dat_filt] = ['']

            tsv_filt, qza_filt, meta_filt = get_fps(i_datasets_folder,
                                                    dat_filt)

            if isfile(qza_filt) and isfile(meta_filt):
                datasets_update[dat_filt] = [[tsv_filt, meta_filt]]
                tab_filt_pd = pd.read_csv(tsv_filt,
                                          index_col=0,
                                          header=0,
                                          sep='\t')
                # read only the header line to get the sample id column name
                with open(meta_filt) as f:
                    for line in f:
                        break
                meta_filt_pd = pd.read_csv(meta_filt,
                                           header=0,
                                           sep='\t',
                                           dtype={line.split('\t')[0]: str},
                                           low_memory=False)
                # datasets_read_update[dat_filt] = [tab_filt_pd, meta_filt_pd]
                datasets_read_update[dat_filt] = [[tab_filt_pd, meta_filt_pd]]
                datasets_phylo_update[dat_filt] = datasets_phylo[dat]
                datasets_features_update[dat_filt] = dict(
                    gid_feat for gid_feat in datasets_features[dat].items()
                    if gid_feat[1] in tab_filt_pd.index)
                continue

            for (tab_pd, meta_pd) in tab_meta_pds_:
                tab_filt_pd = filtering_thresholds(names, thresh_sam,
                                                   thresh_feat, tab_pd)
                if harsh_filtering(dat_filt, tab_filt_pd):
                    continue
                meta_filt_pd = meta_pd.loc[meta_pd.sample_name.isin(
                    tab_filt_pd.columns.tolist())].copy()
                tab_filt_pd.reset_index().to_csv(tsv_filt,
                                                 index=False,
                                                 sep='\t')
                meta_filt_pd.to_csv(meta_filt, index=False, sep='\t')

                datasets_update[dat_filt] = [[tsv_filt, meta_filt]]
                datasets_read_update[dat_filt] = [[tab_filt_pd, meta_filt_pd]]
                datasets_phylo_update[dat_filt] = datasets_phylo[dat]
                datasets_features_update[dat_filt] = dict(
                    gid_feat for gid_feat in datasets_features[dat].items()
                    if gid_feat[1] in tab_filt_pd.index)
                cmd = run_import(tsv_filt, qza_filt, "FeatureTable[Frequency]")
                sh.write('echo "%s"\n' % cmd)
                sh.write('%s\n' % cmd)
                written += 1
    if written:
        run_xpbs(
            out_sh, out_pbs, '%s.fltr%s' % (prjct_nm, filt_raref), qiime_env,
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"], chmod, written,
            '# Filter samples for a min number of %s reads' % p_filt_threshs,
            None, noloc, slurm, jobs)

    # after this update, the raw datasets remain included
    datasets.update(datasets_update)
    datasets_read.update(datasets_read_update)
    datasets_features.update(datasets_features_update)
    datasets_phylo.update(datasets_phylo_update)
def run_alpha(i_datasets_folder: str, datasets: dict, datasets_read: dict,
              datasets_phylo: dict, datasets_rarefs: dict,
              p_alpha_subsets: str, trees: dict, force: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, slurm: bool, As: tuple,
              dropout: bool, run_params: dict, filt_raref: str,
              eval_depths: dict, jobs: bool, chunkit: int) -> dict:
    """
    Computes the alpha diversity vectors for each dataset.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv, meta]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param p_alpha_subsets: Path to the yaml file defining feature subsets for alpha diversity.
    :param trees: to be updated with the tree to use for each dataset's phylogenetic analyses.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Short nickname for your project.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: permissions to set on the output files (default: 775).
    :return: {'dataset1': [ 'meta', {'div_index1': '.qza', 'div_index2': '.qza', ... }],
              'dataset2': [ 'meta', {'div_index1': '.qza', 'div_index2': '.qza', ... }], '...'}
    """
    evaluation = ''
    if len(eval_depths):
        evaluation = '_eval'
    alpha_metrics = get_metrics('alpha_metrics', As)
    alpha_subsets = read_yaml_file(p_alpha_subsets)
    job_folder = get_job_folder(i_datasets_folder, 'alpha%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha%s/chunks' % evaluation)
    diversities = {}
    run_pbs = '%s/1_run_alpha_%s%s%s.sh' % (job_folder, prjct_nm, evaluation,
                                            filt_raref)
    main_written = 0
    to_chunk = []
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            diversities[dat] = []
            out_sh = '%s/run_alpha_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    if not isinstance(
                            datasets_read[dat][idx][0], pd.DataFrame
                    ) and datasets_read[dat][idx][0] == 'raref':
                        if not isfile(tsv):
                            print(
                                'Must have run rarefaction to use it further...\nExiting'
                            )
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = datasets_read[dat][idx]
                    cur_raref = datasets_rarefs[dat][idx]
                    qza = '%s.qza' % splitext(tsv)[0]
                    divs = {}
                    for metric in alpha_metrics:
                        odir = get_analysis_folder(
                            i_datasets_folder, 'alpha/%s%s' % (dat, cur_raref))
                        out_fp = '%s/%s_%s.qza' % (
                            odir, basename(splitext(qza)[0]), metric)
                        out_tsv = '%s.tsv' % splitext(out_fp)[0]
                        if force or not isfile(out_fp):
                            ret_continue = write_diversity_alpha(
                                out_fp, datasets_phylo, trees, dat, qza,
                                metric, cur_sh, qiime_env)
                            if ret_continue:
                                continue
                            cmd = run_export(out_fp, out_tsv, '')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            written += 1
                            main_written += 1
                        divs.setdefault('', []).append((out_fp, metric))

                    if alpha_subsets and dat in alpha_subsets:
                        for subset, subset_regex in alpha_subsets[dat].items():
                            odir = get_analysis_folder(
                                i_datasets_folder,
                                'alpha/%s%s/%s' % (dat, cur_raref, subset))
                            if dropout:
                                qza_subset_ = '%s/%s_%s.qza' % (
                                    odir, basename(splitext(qza)[0]), subset)
                            else:
                                qza_subset_ = '%s/%s_%s_noDropout.qza' % (
                                    odir, basename(splitext(qza)[0]), subset)
                            feats_subset = '%s.meta' % splitext(qza_subset_)[0]
                            feats = get_subset(tsv_pd, subset_regex)
                            if not len(feats):
                                continue
                            subset_pd = pd.DataFrame({
                                'Feature ID':
                                feats,
                                'Subset': [subset] * len(feats)
                            })
                            subset_pd.to_csv(feats_subset,
                                             index=False,
                                             sep='\t')
                            write_filter_features(tsv_pd, feats, qza,
                                                  qza_subset_, feats_subset,
                                                  cur_sh, dropout)
                            for metric in alpha_metrics:

                                if metric in [
                                        'faith_pd'
                                ] and datasets_phylo[dat][1] and dat in trees:
                                    tree_in_qza = trees[dat][0]
                                    tree_in_tsv = '%s.tsv' % splitext(
                                        tree_in_qza)[0]
                                    if dropout:
                                        qza_subset = '%s/%s_%s.qza' % (
                                            odir,
                                            basename(splitext(tree_in_qza)[0]),
                                            subset)
                                    else:
                                        qza_subset = '%s/%s_%s_noDropout.qza' % (
                                            odir,
                                            basename(splitext(tree_in_qza)[0]),
                                            subset)
                                    write_filter_features(
                                        pd.read_csv(tree_in_tsv,
                                                    header=0,
                                                    index_col=0,
                                                    sep='\t'), feats,
                                        tree_in_qza, qza_subset, feats_subset,
                                        cur_sh, dropout)
                                else:
                                    qza_subset = qza_subset_

                                out_fp = '%s/%s__%s.qza' % (
                                    odir, basename(
                                        splitext(qza_subset)[0]), metric)
                                out_tsv = '%s.tsv' % splitext(out_fp)[0]

                                if force or not isfile(out_fp):
                                    ret_continue = write_diversity_alpha(
                                        out_fp, {dat: [1, 0]}, trees, dat,
                                        qza_subset, metric, cur_sh, qiime_env)
                                    if ret_continue:
                                        continue
                                    cmd = run_export(out_fp, out_tsv, '')
                                    cur_sh.write('echo "%s"\n' % cmd)
                                    cur_sh.write('%s\n\n' % cmd)
                                    written += 1
                                    main_written += 1
                                divs.setdefault(subset, []).append(
                                    (out_fp, metric))
                    diversities[dat].append(divs)
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(
                    out_sh, out_pbs, '%s.mg.lph%s.%s%s' %
                    (prjct_nm, evaluation, dat, filt_raref), qiime_env,
                    run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o, noloc,
                    slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'alpha', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Calculate alpha diversity indices', 'sh', run_pbs,
                      jobs)
    return diversities
 def set_rarefaction(self, config):
     depths_yml = read_yaml_file(config.raref_depths)
     self.set_rarefaction_depths(config, depths_yml)
     self.set_rarefaction_paths(config)