Example no. 1
def edit_taxonomies(i_datasets_folder: str, taxonomies: dict, force: bool,
                    prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                    slurm: bool, run_params: dict, filt_raref: str, jobs: bool,
                    chunkit: int):

    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')

    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_edit_%s%s.sh' % (job_folder, prjct_nm,
                                                  filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, (_, qza, tsv) in taxonomies.items():
            if not isfile(tsv):
                continue
            written = 0
            out_pd = pd.read_csv(tsv, dtype=str, sep='\t')
            taxo = out_pd['Taxon'].tolist()
            taxo_edit = get_taxa_edit(taxo)
            if taxo != taxo_edit:
                out_pd['Taxon'] = taxo_edit
                out_pd.to_csv(tsv, index=False, sep='\t')
                cmd = run_import(tsv, qza, 'FeatureData[Taxonomy]')

                out_sh = '%s/run_taxonomy_edit_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref)
                if slurm:
                    out_pbs = '%s.slm' % splitext(out_sh)[0]
                else:
                    out_pbs = '%s.pbs' % splitext(out_sh)[0]
                with open(out_sh, 'w') as cur_sh:
                    cur_sh.write('echo "%s"\n' % cmd)
                    cur_sh.write('%s\n\n' % cmd)
                    main_written += 1
                    written += 1
                if written:
                    to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(out_sh, out_pbs,
                             '%s.tx.dt.%s%s' % (prjct_nm, dat, filt_raref),
                             qiime_env, run_params["time"],
                             run_params["n_nodes"], run_params["n_procs"],
                             run_params["mem_num"], run_params["mem_dim"],
                             chmod, written, 'single', o, noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy_edit',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Edit features taxonomy to not contain "," characters',
                      'sh', run_pbs, jobs)
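# Not shown in this excerpt: get_taxa_edit() comes from the same package.
# Below is only a minimal, hypothetical sketch of the behaviour assumed by
# edit_taxonomies(), inferred from the message printed above ("Edit features
# taxonomy to not contain ',' characters"); the real implementation may differ.
def get_taxa_edit_sketch(taxo: list) -> list:
    # drop ',' from every taxon string so downstream comma-separated
    # parsing does not break
    return [str(tax).replace(',', '') for tax in taxo]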
Example no. 2
def get_jobs_folders(self, analyses_commands):
    for analysis in analyses_commands:
        self.jobs_folders[analysis] = [
            '%s/run_%s_%s%s.sh' % (
                get_job_folder(self.config.i_datasets_folder, analysis),
                analysis, self.prjct_nm, self.filt_raref),
            '%s/run_%s_%s%s' % (
                get_job_folder(self.config.i_datasets_folder,
                               '%s/chunks' % analysis),
                analysis, self.prjct_nm, self.filt_raref),
        ]
def summarize_permanova(i_datasets_folder: str, permanovas: dict,
                        prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                        slurm: bool, split: bool, run_params: dict,
                        filt_raref: str, jobs: bool, chunkit: int) -> dict:

    RESOURCES = pkg_resources.resource_filename("routine_qiime2_analyses",
                                                "resources")
    summarize_fp = '%s/summarize_permanovas.py' % RESOURCES

    all_sh_pbs = {}
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'permanova_summarize/chunks')
    for dat, metrics in permanovas.items():
        metrics = [
            x for x in [
                'aitchison', 'jaccard', 'braycurtis', 'unweighted_unifrac',
                'weighted_unifrac'
            ] if x in metrics
        ]
        permanovas[dat] = []
        out_sh = '%s/run_permanova_summarize_%s%s.sh' % (job_folder2, dat,
                                                         filt_raref)
        out_py = '%s/run_permanova_summarize_%s%s.py' % (job_folder2, dat,
                                                         filt_raref)
        with open(out_py, 'w') as o, open(summarize_fp) as f:
            for line in f:
                line_edit = line
                if 'DATASET' in line:
                    line_edit = line_edit.replace('DATASET', dat)
                if 'ROUTINE_FOLDER' in line:
                    line_edit = line_edit.replace('ROUTINE_FOLDER',
                                                  i_datasets_folder)
                if 'METRICS' in line:
                    line_edit = line_edit.replace('METRICS', str(metrics))
                o.write(line_edit)
        cur_sh = '%s/run_permanova_summarize_%s%s_tmp.sh' % (job_folder2, dat,
                                                             filt_raref)
        with open(cur_sh, 'w') as o:
            o.write('python3 %s\n' % out_py)
        all_sh_pbs[(dat, out_sh)] = [cur_sh]

    job_folder = get_job_folder(i_datasets_folder, 'permanova_summarize')
    main_sh = write_main_sh(
        job_folder, '3_run_permanova_summarize%s' % filt_raref, all_sh_pbs,
        '%s.prm%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        print("# SUMMARIZE PERMANOVAS")
        print_message('', 'sh', main_sh, jobs)

    return permanovas
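# Illustration only (the template file is not shown here): the loop above does
# a plain textual substitution of the DATASET, ROUTINE_FOLDER and METRICS
# placeholders, so a hypothetical template line such as
#     dat = 'DATASET'
# would be written to the per-dataset script as
#     dat = 'my_dataset'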
Example no. 4
def nestedness_nodfs(i_datasets_folder: str, nodfs_fps: dict, collapsed: dict,
                     filt_raref: str, prjct_nm: str, qiime_env: str,
                     chmod: str, noloc: bool, slurm: bool, split: bool,
                     run_params: dict, jobs: bool, chunkit: int) -> None:

    RESOURCES = pkg_resources.resource_filename("routine_qiime2_analyses",
                                                "resources")
    nestedness_nodfs_fp = '%s/nestedness_nodfs.py' % RESOURCES

    job_folder2 = get_job_folder(i_datasets_folder,
                                 'nestedness_figures/chunks')

    all_sh_pbs = {}
    for dat, nodfs in nodfs_fps.items():

        out_sh = '%s/run_nestedness_nodfs_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                         dat, filt_raref)
        out_py = out_sh.replace('.sh', '.py')
        cur_sh = '%s/run_nestedness_nodfs_%s%s_tmp.sh' % (job_folder2, dat,
                                                          filt_raref)
        cur_sh = cur_sh.replace(' ', '-')
        with open(cur_sh, 'w') as o:
            o.write('python3 %s\n' % out_py)
        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)

        # value to edit in template
        odir = get_analysis_folder(i_datasets_folder,
                                   'nestedness/%s%s' % (dat, filt_raref))
        with open(out_py, 'w') as o, open(nestedness_nodfs_fp) as f:
            for line in f:
                line_edit = line
                if '<DAT>' in line:
                    line_edit = line_edit.replace('<DAT>', dat)
                if '<ODIR>' in line:
                    line_edit = line_edit.replace('<ODIR>', odir)
                if '<NODFS>' in line:
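                    # the quoted placeholder "'<NODFS>'" is replaced so that
                    # str(nodfs) lands in the script as a Python literal,
                    # not as a string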
                    line_edit = line_edit.replace("'<NODFS>'", str(nodfs))
                if '<COLLAPSED>' in line:
                    line_edit = line_edit.replace("'<COLLAPSED>'",
                                                  str(collapsed))
                o.write(line_edit)

    job_folder = get_job_folder(i_datasets_folder, 'nestedness_figures')
    main_sh = write_main_sh(
        job_folder, 'run_nestedness_nodfs%s' % filt_raref, all_sh_pbs,
        '%s.nstd.ndf%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        print("# NESTEDNESS NODFS")
        print_message('', 'sh', main_sh, jobs)
def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, split: bool, run_params: dict,
                       filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):

    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)

    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                  filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in \
                    metrics_groups_metas_qzas_dms_trees.items():
                for group, metas_qzas_mat_qzas_trees in \
                        groups_metas_qzas_dms_trees.items():
                    for (meta, qza, mat_qza,
                         tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(case_vals,
                                                case_var).replace(' ', '_')
                                cur_sh = '%s/run_decay_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_raref, metric, group,
                                    case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh, mat_qza,
                                    case, modes, force, run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
Example no. 6
def run_taxonomy(method: str, i_datasets_folder: str, datasets: dict,
                 datasets_read: dict, datasets_phylo: dict,
                 datasets_features: dict, datasets_filt_map: dict,
                 i_classifier: str, taxonomies: dict, force: bool,
                 prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                 slurm: bool, run_params: dict, filt_raref: str, jobs: bool,
                 chunkit: int) -> None:
    """

    Parameters
    ----------
    method
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders.
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path].
    datasets_read : dict
        Mapping dataset name -> [data table, metadata table]
    datasets_phylo : dict
        To be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    datasets_features : dict
        Mapping dataset name -> list of features names in
                                the dataset tsv / biom file.
    datasets_filt_map : dict
        Mapping filtered dataset name -> original dataset name.
    i_classifier : str
        Path to the taxonomic classifier.
    taxonomies : dict
        Mapping dataset name -> [method, assignment qza, assignment tsv].
    force : bool
        Force the re-writing of scripts for all commands.
    prjct_nm : str
        Short nick name for your project.
    qiime_env : str
        Name of your qiime2 conda environment (e.g. qiime2-2019.10).
    chmod : str
        Whether to change permission of output files (default: 744).
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------

    """
    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')
    amplicon_datasets = [
        dat for dat, (tree, correction) in datasets_phylo.items()
        if tree == 'amplicon'
    ]
    wol_datasets = [
        dat for dat, (tree, correction) in datasets_phylo.items()
        if tree == 'wol'
    ]

    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets_read.items():
            out_sh = '%s/run_taxonomy_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                     dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
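            # filtered datasets reuse the taxonomy computed for their
            # source (unfiltered) dataset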
            if dat in datasets_filt_map:
                taxonomies[dat] = taxonomies[datasets_filt_map[dat]]
                continue
            written = 0
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
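                    # only the first (non-rarefied) table is classified here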
                    if idx:
                        continue
                    tsv, meta = datasets[dat][idx]
                    if not isinstance(tsv_meta_pds[0], pd.DataFrame) and \
                            tsv_meta_pds[0] == 'raref':
                        if not isfile(tsv):
                            print('Must have run rarefaction to use it '
                                  'further...\nExiting')
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = tsv_meta_pds

                    odir = get_analysis_folder(i_datasets_folder,
                                               'taxonomy/%s' % dat)
                    out_rad = '%s/tax_%s' % (odir, dat)

                    if dat in amplicon_datasets:
                        out_qza = '%s_%s.qza' % (out_rad, method)
                        out_tsv = '%s.tsv' % splitext(out_qza)[0]
                        taxonomies[dat] = [method, out_qza, out_tsv]
                        if not i_classifier:
                            print('No classifier passed for 16S '
                                  'data\nSkipping...')
                            continue
                        cmd = run_taxonomy_amplicon(dat, i_datasets_folder,
                                                    force, tsv_pd, out_qza,
                                                    out_tsv, i_classifier)
                    else:
                        out_qza = '%s.qza' % out_rad
                        out_tsv = '%s.tsv' % out_rad
                        if dat in wol_datasets:
                            cur_datasets_features = datasets_features[dat]
                            taxonomies[dat] = ['wol', out_qza, out_tsv]
                            cmd = run_taxonomy_wol(force, tsv_pd, out_qza,
                                                   out_tsv,
                                                   cur_datasets_features)
                        else:
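                            # skip tables whose feature IDs are all numeric
                            # (nothing to derive a taxonomy from)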
                            if len(
                                [x for x in tsv_pd.index
                                 if str(x).isdigit()]) == tsv_pd.shape[0]:
                                continue
                            taxonomies[dat] = ['feat', out_qza, out_tsv]
                            cmd = run_taxonomy_others(force, tsv_pd, out_qza,
                                                      out_tsv)
                    if cmd:
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n\n' % cmd)
                        main_written += 1
                        written += 1
            if written:
                to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(out_sh, out_pbs,
                             '%s.tx.sklrn.%s%s' % (prjct_nm, dat, filt_raref),
                             qiime_env, run_params["time"],
                             run_params["n_nodes"], run_params["n_procs"],
                             run_params["mem_num"], run_params["mem_dim"],
                             chmod, written, 'single', o, noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Classify features using classify-sklearn', 'sh',
                      run_pbs, jobs)
Example no. 7
def run_collapse(i_datasets_folder: str, datasets: dict, datasets_filt: dict,
                 datasets_read: dict, datasets_features: dict,
                 datasets_phylo: dict, split_taxa_pds: dict, taxonomies: dict,
                 p_collapse_taxo: str, datasets_rarefs: dict,
                 datasets_collapsed: dict, datasets_collapsed_map: dict,
                 force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                 noloc: bool, slurm: bool, run_params: dict, filt_raref: str,
                 jobs: bool) -> dict:

    collapse_taxo = get_collapse_taxo(p_collapse_taxo, datasets_filt)
    main_written = 0
    collapsed = {}
    datasets_update = {}
    datasets_read_update = {}
    datasets_features_update = {}
    datasets_phylo_update = {}
    stop_for_collapse = False
    job_folder = get_job_folder(i_datasets_folder, 'collapsed_taxo')
    job_folder2 = get_job_folder(i_datasets_folder, 'collapsed_taxo/chunks')
    run_pbs = '%s/3_run_collapsed_taxo_%s%s.sh' % (job_folder, prjct_nm,
                                                   filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tab_meta_fps in datasets.items():
            if dat not in collapse_taxo:
                continue
            # get the taxonomic levels
            collapse_levels = collapse_taxo[dat]
            split_taxa_pd, split_taxa_fp = split_taxa_pds[dat]
            split_levels, remove_empties = get_split_levels(
                collapse_levels, split_taxa_pd)
            collapsed[dat] = split_levels

            # files that will be collapsed using qiime2
            tax_qza, tax_fp = taxonomies[dat][1:]

            written = 0
            out_sh = '%s/run_collapsed_taxo_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tab_meta_fp in enumerate(tab_meta_fps):
                    tab_fp, meta_fp = tab_meta_fp
                    tab_qza = '%s.qza' % splitext(tab_fp)[0]
                    for tax, level in split_levels.items():
                        coll_paths = collapse_paths(dat, tax, tab_fp, meta_fp)
                        (dat_tax, dat_coll, coll_tsv,
                         coll_qza, coll_meta) = coll_paths
                        if isfile(coll_tsv) and isfile(coll_meta):
                            coll_pd = pd.read_csv(coll_tsv,
                                                  index_col=0,
                                                  header=0,
                                                  sep='\t')
                            coll_meta_pd = pd.read_csv(
                                coll_meta,
                                header=0,
                                sep='\t',
                                dtype={'sample_name': str})
                            if coll_pd.shape[0] < 5:
                                continue
                            cmd = fix_collapsed_data(remove_empties[tax],
                                                     coll_pd, coll_tsv,
                                                     coll_qza, coll_meta)
                            if cmd:
                                cur_sh.write('echo "%s"\n' % cmd)
                                cur_sh.write('%s\n\n' % cmd)
                                main_written += 1
                                written += 1
                            datasets_read_update.setdefault(
                                dat_tax, []).append([coll_pd, coll_meta_pd])
                            datasets_collapsed.setdefault(dat,
                                                          []).append(dat_coll)
                            datasets_collapsed_map[dat_coll] = dat
                            datasets_update.setdefault(dat_tax, []).append(
                                [coll_tsv, coll_meta])
                            datasets_rarefs.setdefault(dat_tax, []).append(
                                datasets_rarefs[dat][idx])
                            datasets_phylo_update[dat_tax] = ('', 0)
                        else:
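                            # collapsed output not computed yet: write the
                            # qiime2 collapse command and flag the pipeline
                            # to stop so this step can be run first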
                            written += 1
                            main_written += 1
                            stop_for_collapse = True
                            cmd = write_collapse_taxo(tab_qza, tax_qza,
                                                      coll_qza, coll_tsv,
                                                      meta_fp, coll_meta,
                                                      level,
                                                      remove_empties[tax])
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)

            if written:
                run_xpbs(out_sh, out_pbs,
                         '%s.cllps.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if main_written:
        print_message(
            '# Collapse features for taxo levels defined in %s' %
            p_collapse_taxo, 'sh', run_pbs, jobs)

    if stop_for_collapse:
        print('Stopping here as this collapse must be run first for other '
              'analyses to work')
        sys.exit(0)

    datasets.update(datasets_update)
    datasets_read.update(datasets_read_update)
    datasets_phylo.update(datasets_phylo_update)

    return collapsed
def shear_tree(
        i_datasets_folder: str, datasets: dict, datasets_read: dict,
        datasets_phylo: dict, datasets_features: dict, prjct_nm: str,
        i_wol_tree: str, trees: dict, datasets_rarefs: dict, force: bool,
        qiime_env: str, chmod: str, noloc: bool, slurm: bool, run_params: dict,
        filt_raref: str, jobs: bool) -> None:
    """
    Get the sub-tree from the Web of Life tree that corresponds to the gOTUs-labeled features.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param datasets_features: dataset -> list of features names in the dataset tsv / biom file.
    :param prjct_nm: Short nick name for your project.
    :param i_wol_tree: default on barnacle /projects/wol/profiling/dbs/wol/phylogeny/wol_tree.nwk.
    :param trees: to be updated with the tree to use for a dataset's phylogenetic analyses.
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    """
    # check whether there's dataset(s) that may use
    # the Web of Life tree (i.e. features contain gID)
    wol_datasets = [dat for dat, (tree, correction)
                    in datasets_phylo.items() if tree == 'wol']
    if len(wol_datasets):
        job_folder = get_job_folder(i_datasets_folder, 'phylo')
        job_folder2 = get_job_folder(i_datasets_folder, 'phylo/chunks')
        i_wol_tree = get_wol_tree(i_wol_tree)
        wol = TreeNode.read(i_wol_tree)

        main_written = 0
        main_sh = '%s/0_run_import_trees_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, tsv_metas_fps_ in datasets.items():
                written = 0
                if dat not in wol_datasets:
                    continue
                out_sh = '%s/run_import_tree_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, filt_raref)
                if slurm:
                    out_pbs = out_sh.replace('.sh', '.slm')
                else:
                    out_pbs = out_sh.replace('.sh', '.pbs')
                with open(out_sh, 'w') as o:
                    for idx, tsv_metas_fps in enumerate(tsv_metas_fps_):
                        tsv, meta = tsv_metas_fps
                        if not isinstance(datasets_read[dat][idx][0], pd.DataFrame) and datasets_read[dat][idx][0] == 'raref':
                            if not isfile(tsv):
                                print('Must have run rarefaction to use it further...\nExiting')
                                sys.exit(0)
                            tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                            datasets_read[dat][idx] = [tsv_pd, meta_pd]
                        else:
                            tsv_pd, meta_pd = datasets_read[dat][idx]
                        cur_raref = datasets_rarefs[dat][idx]
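                        # keep only the genome-ID -> feature-name pairs whose
                        # feature is present in the current table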
                        cur_datasets_features = dict(
                            gid for gid in datasets_features[dat].items() if gid[1] in tsv_pd.index)

                        analysis_folder = get_analysis_folder(i_datasets_folder, 'phylo/%s' % dat)
                        wol_features_fpo = '%s/tree_%s%s.nwk' % (analysis_folder, dat, cur_raref)
                        wol_features_qza = wol_features_fpo.replace('.nwk', '.qza')

                        # if idx:
                        #     trees[dat].append(('', wol_features_qza))
                        # else:
                        #     trees[dat] = [('', wol_features_qza)]
                        if not idx:
                            trees[dat] = ('', wol_features_qza)

                        if force or not isfile(wol_features_qza):
                            wol_features = wol.shear(list(cur_datasets_features.keys()))
                            # rename the tip per the features names associated with each gID
                            for tip in wol_features.tips():
                                tip.name = cur_datasets_features[tip.name]
                            wol_features.write(wol_features_fpo)
                            cmd = run_import(wol_features_fpo, wol_features_qza, "Phylogeny[Rooted]")
                            o.write("echo '%s'\n" % cmd)
                            o.write('%s\n\n' % cmd)
                        written += 1
                        main_written += 1
                run_xpbs(out_sh, out_pbs, '%s.shr.%s%s' % (prjct_nm, dat, filt_raref), qiime_env,
                         run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                         run_params["mem_num"], run_params["mem_dim"],
                         chmod, written, 'single', main_o, noloc, slurm, jobs)
        if main_written:
            print_message("# Shear Web of Life tree to features' genome IDs (%s)" % ', '.join(wol_datasets), 'sh', main_sh, jobs)
Example no. 9
def run_doc(i_datasets_folder: str, datasets: dict, p_doc_config: str,
            datasets_rarefs: dict, force: bool, prjct_nm: str, qiime_env: str,
            chmod: str, noloc: bool, slurm: bool, run_params: dict,
            filt_raref: str, phates: dict, doc_phate: bool, split: bool,
            jobs: bool, chunkit: int) -> None:

    job_folder2 = get_job_folder(i_datasets_folder, 'doc/chunks')
    doc_filtering, doc_params, main_cases_dict = get_doc_config(p_doc_config)

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    dat_cases_tabs = {}
    need_to_run_phate = []
    need_to_run_less_phate = []
    for dat, tsv_meta_pds_ in datasets.items():
        dat_cases_tabs[dat] = {}
        if dat in doc_filtering:
            filters = doc_filtering[dat]
        else:
            filters = {'0-0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            dat_phates = []
            if dat in phates:
                dat_phates = phates[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'DOC')
            cur_raref = datasets_rarefs[dat][idx]
            dat_cases_tabs[dat][cur_raref] = {}
            if not split:
                out_sh = '%s/run_doc_%s%s%s.sh' % (job_folder2, dat,
                                                   filt_raref, cur_raref)
                out_import_sh = '%s/run_import_doc_%s%s%s.sh' % (
                    job_folder2, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
            for filt, (fp, fa) in filters.items():
                if split:
                    out_sh = '%s/run_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                    out_import_sh = '%s/run_import_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                 []).append(cur_import_sh)
                    cases = run_single_doc(
                        i_datasets_folder, odir, tsv, meta_pd, case_var,
                        doc_params, case_vals_list, cur_sh, cur_import_sh,
                        force, filt, cur_raref, fp, fa, run_params["n_nodes"],
                        run_params["n_procs"], dat_phates, doc_phate,
                        need_to_run_phate, need_to_run_less_phate)
                    dat_cases_tabs[dat][cur_raref].setdefault(case_var,
                                                              []).extend(cases)

    for need_to_run in need_to_run_phate:
        print(' -', need_to_run)

    job_folder = get_job_folder(i_datasets_folder, 'doc')
    main_sh = write_main_sh(job_folder, '3_run_import_doc%s' % filt_raref,
                            all_import_sh_pbs,
                            '%s.doc.mpt%s' % (prjct_nm, filt_raref), "4", "1",
                            "1", "500", "mb", qiime_env, chmod, noloc, slurm,
                            jobs, chunkit)
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# Import for DOC (groups config in %s)' % p_doc_config)
        else:
            print('# Import DOC')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(job_folder, '3_run_doc%s' % filt_raref, all_sh_pbs,
                            '%s.doc%s' % (prjct_nm, filt_raref),
                            run_params["time"], run_params["n_nodes"],
                            run_params["n_procs"], run_params["mem_num"],
                            run_params["mem_dim"], qiime_env, chmod, noloc,
                            slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# DOC (groups config in %s)' % p_doc_config)
        else:
            print('# DOC')
        print_message('', 'sh', main_sh, jobs)

    do_r = 1
    if do_r:
        job_folder = get_job_folder(i_datasets_folder, 'doc/R')
        job_folder2 = get_job_folder(i_datasets_folder, 'doc/R/chunks')
        main_written = 0
        main_sh = '%s/run_R_doc%s.sh' % (job_folder, filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, raref_case_var_cases in dat_cases_tabs.items():

                shs = []
                written = 0
                odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
                log_error = '%s/log.error' % odir
                for raref, case_var_cases in raref_case_var_cases.items():
                    for case_var, cases in case_var_cases.items():
                        for cdx, case in enumerate(cases):
                            plot = '%s_%s_%s_%s' % (dat, raref, case_var, cdx)
                            case_r = '%s/R' % case
                            pdf = '%s/plot.pdf' % case_r
                            do = '%s/DO.tsv' % case_r
                            if not isfile(pdf):
                                cur_r = '%s/run_R_doc_%s_%s_%s_vanilla.R' % (
                                    job_folder2, dat, case_var, cdx)
                                cur_sh = 'echo "*** %s" >> %s\n' % (plot,
                                                                    log_error)
                                cur_sh += 'R -f %s --vanilla 2>> %s\n' % (
                                    cur_r, log_error)
                                cur_sh += 'echo "end" >> %s\n' % log_error
                                shs.append(cur_sh)
                                with open(cur_r, 'w') as o:
                                    o.write("library(DOC)\n")
                                    o.write("library(ggplot2)\n")
                                    if not isfile(do):
                                        o.write(
                                            "otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, nrows=2)\n"
                                            % case)
                                        o.write(
                                            "index_name <- colnames(otu)[1]\n")
                                        o.write(
                                            "otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, row.names=index_name)\n"
                                            % case)
                                        o.write("if (dim(otu)[1] > 100) {\n")
                                        o.write("    res <- DOC(otu)\n")
                                        o.write(
                                            "    res.null <- DOC.null(otu)\n")
                                        o.write(
                                            "    write.table(x=res$DO, file='%s/DO.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$LME, file='%s/LME.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    colnames(res$NEG) <- c('Neg_Slope', 'Data')\n"
                                        )
                                        o.write(
                                            "    write.table(x=res$NEG, file='%s/NEG.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$FNS, file='%s/FNS.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$BOOT, file='%s/BOOT.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$CI, file='%s/CI.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$DO, file='%s/null_DO.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$LME, file='%s/null_LME.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    colnames(res.null$NEG) <- c('Neg_Slope', 'Data')\n"
                                        )
                                        o.write(
                                            "    write.table(x=res.null$NEG, file='%s/null_NEG.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$FNS, file='%s/null_FNS.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$BOOT, file='%s/null_BOOT.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$CI, file='%s/null_CI.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write("}\n")
                                    o.write(
                                        "res = list(BOOT=read.table('%s/BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/CI.tsv', h=T, sep='\\t'), DO=read.table('%s/DO.tsv', h=T, sep='\\t'), LME=read.table('%s/LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/NEG.tsv', h=T, sep='\\t'))\n"
                                        % (case_r, case_r, case_r, case_r,
                                           case_r, case_r))
                                    o.write(
                                        "res.null = list(BOOT=read.table('%s/null_BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/null_CI.tsv', h=T, sep='\\t'), DO=read.table('%s/null_DO.tsv', h=T, sep='\\t'), LME=read.table('%s/null_LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/null_FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/null_NEG.tsv', h=T, sep='\\t'))\n"
                                        % (case_r, case_r, case_r, case_r,
                                           case_r, case_r))
                                    o.write(
                                        "colnames(res$NEG) <- c('Neg.Slope', 'Data')\n"
                                    )
                                    o.write(
                                        "colnames(res.null$NEG) <- c('Neg.Slope', 'Data')\n"
                                    )
                                    o.write(
                                        "res$DO <- res$DO[which(res$DO$Overlap <= 1),]\n"
                                    )
                                    o.write(
                                        "res.null$DO <- res.null$DO[which(res.null$DO$Overlap <= 1),]\n"
                                    )
                                    o.write("pdf('%s')\n" % pdf)
                                    o.write(
                                        "merged <- DOC.merge(list(s_%s = res, s_%s=res.null))\n"
                                        % (plot, plot))
                                    o.write("plot(merged)\n")
                                    o.write("dev.off()\n")
                                    main_written += 1
                                    written += 1
                if written:
                    if chunkit and len(shs) >= chunkit:
                        chunks = [
                            list(x)
                            for x in np.array_split(np.array(shs), chunkit)
                        ]
                    elif split and len(shs) >= 3:
                        chunks = [
                            list(x) for x in np.array_split(np.array(shs), 3)
                        ]
                    else:
                        chunks = [shs]
                    for cdx, chunk in enumerate(chunks):
                        out_sh = '%s/run_R_doc_%s%s_%s.sh' % (job_folder2, dat,
                                                              filt_raref, cdx)
                        if slurm:
                            out_pbs = '%s.slm' % splitext(out_sh)[0]
                        else:
                            out_pbs = '%s.pbs' % splitext(out_sh)[0]
                        with open(out_sh, 'w') as o:
                            for c in chunk:
                                o.write('echo "%s"\n\n' % c)
                                o.write('%s\n\n' % c)
                        run_xpbs(
                            out_sh, out_pbs, '%s.doc.R.%s%s_%s' %
                            (prjct_nm, dat, filt_raref, cdx), 'xdoc',
                            run_params["time"], run_params["n_nodes"],
                            run_params["n_procs"], run_params["mem_num"],
                            run_params["mem_dim"], chmod, written, 'single',
                            main_o, noloc, slurm, jobs)
        if main_written:
            print_message('# DOC (R)', 'sh', main_sh, jobs)
Example no. 10
def filter_rare_samples(i_datasets_folder: str, datasets: dict,
                        datasets_read: dict, datasets_features: dict,
                        datasets_rarefs: dict, datasets_filt: dict,
                        datasets_filt_map: dict, datasets_phylo: dict,
                        prjct_nm: str, qiime_env: str, p_filt_threshs: str,
                        chmod: str, noloc: bool, run_params: dict,
                        filt_raref: str, jobs: bool, slurm: bool,
                        chunkit: int) -> None:
    """
    Filter the rare features, keep samples with enough reads/features and import to Qiime2.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_features: dataset -> list of features names in the dataset tsv / biom file.
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param prjct_nm: Short nick name for your project.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param p_filt_threshs: path to the YAML file with the per-dataset sample/feature filtering thresholds.
    :param chmod: whether to change permission of output files (default: 775).
    """
    threshs_dats = read_yaml_file(p_filt_threshs)

    written = 0
    datasets_update = {}
    datasets_read_update = {}
    datasets_features_update = {}
    datasets_phylo_update = {}
    job_folder = get_job_folder(i_datasets_folder, 'import_filtered')
    out_sh = '%s/1_run_import_filtered_%s%s.sh' % (job_folder, prjct_nm,
                                                   filt_raref)
    if slurm:
        out_pbs = '%s.slm' % splitext(out_sh)[0]
    else:
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
    to_chunk = []
    with open(out_sh, 'w') as sh:
        for dat, tab_meta_pds_ in datasets_read.items():
            if dat not in threshs_dats:
                continue
            names, thresh_sam, thresh_feat = get_thresholds(threshs_dats[dat])
            if no_filtering(dat, thresh_sam, thresh_feat):
                continue
            dat_filt = get_dat_filt(dat, names, thresh_sam, thresh_feat)

            datasets_filt[dat] = dat_filt
            datasets_filt_map[dat_filt] = dat
            datasets_rarefs[dat_filt] = ['']

            tsv_filt, qza_filt, meta_filt = get_fps(i_datasets_folder,
                                                    dat_filt)

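            # filtered table/metadata already on disk: register them
            # without re-computing the filtering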
            if isfile(qza_filt) and isfile(meta_filt):
                datasets_update[dat_filt] = [[tsv_filt, meta_filt]]
                tab_filt_pd = pd.read_csv(tsv_filt,
                                          index_col=0,
                                          header=0,
                                          sep='\t')
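                # read the first metadata line only, to get the name of the
                # sample identifier column (forced to str dtype below)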
                with open(meta_filt) as f:
                    for line in f:
                        break
                meta_filt_pd = pd.read_csv(meta_filt,
                                           header=0,
                                           sep='\t',
                                           dtype={line.split('\t')[0]: str},
                                           low_memory=False)
                # datasets_read_update[dat_filt] = [tab_filt_pd, meta_filt_pd]
                datasets_read_update[dat_filt] = [[tab_filt_pd, meta_filt_pd]]
                datasets_phylo_update[dat_filt] = datasets_phylo[dat]
                datasets_features_update[dat_filt] = dict(
                    gid_feat for gid_feat in datasets_features[dat].items()
                    if gid_feat[1] in tab_filt_pd.index)
                continue

            for (tab_pd, meta_pd) in tab_meta_pds_:
                tab_filt_pd = filtering_thresholds(names, thresh_sam,
                                                   thresh_feat, tab_pd)
                if harsh_filtering(dat_filt, tab_filt_pd):
                    continue
                meta_filt_pd = meta_pd.loc[meta_pd.sample_name.isin(
                    tab_filt_pd.columns.tolist())].copy()
                tab_filt_pd.reset_index().to_csv(tsv_filt,
                                                 index=False,
                                                 sep='\t')
                meta_filt_pd.to_csv(meta_filt, index=False, sep='\t')

                datasets_update[dat_filt] = [[tsv_filt, meta_filt]]
                datasets_read_update[dat_filt] = [[tab_filt_pd, meta_filt_pd]]
                datasets_phylo_update[dat_filt] = datasets_phylo[dat]
                datasets_features_update[dat_filt] = dict(
                    gid_feat for gid_feat in datasets_features[dat].items()
                    if gid_feat[1] in tab_filt_pd.index)
                cmd = run_import(tsv_filt, qza_filt, "FeatureTable[Frequency]")
                sh.write('echo "%s"\n' % cmd)
                sh.write('%s\n' % cmd)
                written += 1
    if written:
        run_xpbs(
            out_sh, out_pbs, '%s.fltr%s' % (prjct_nm, filt_raref), qiime_env,
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"], chmod, written,
            '# Filter samples for a min number of %s reads' % p_filt_threshs,
            None, noloc, slurm, jobs)

    # after this update, the raw dataset remain included
    datasets.update(datasets_update)
    datasets_read.update(datasets_read_update)
    datasets_features.update(datasets_features_update)
    datasets_phylo.update(datasets_phylo_update)
def run_phate(p_phate_config: str, i_datasets_folder: str, datasets: dict,
              datasets_rarefs: dict, force: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, slurm: bool,
              split: bool, run_params: dict, filt_raref: str, jobs: bool,
              chunkit: int) -> dict:

    job_folder2 = get_job_folder(i_datasets_folder, 'phate/chunks')
    phate_dicts = get_phate_dicts(p_phate_config)
    phate_filtering, phate_labels, phate_params, main_cases_dict = phate_dicts

    phates = {}
    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        phates[dat] = []
        if dat in phate_filtering:
            filters = phate_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'phate')
            cur_raref = datasets_rarefs[dat][idx]
            if not split:
                out_sh = '%s/run_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
                out_import_sh = '%s/run_import_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'phate/%s' % dat)
            raref_phates = {}
            for filt, (fp, fa) in filters.items():
                raref_phates[filt] = {}
                if split:
                    out_sh = '%s/run_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                    out_import_sh = '%s/run_import_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                 []).append(cur_import_sh)
                    phate = run_single_phate(dat, odir, tsv, meta_pd, case_var,
                                             phate_labels, phate_params,
                                             run_params, case_vals_list,
                                             cur_sh, cur_import_sh, force,
                                             filt, cur_raref, fp, fa)
                    raref_phates[filt][case_var] = phate
            phates[dat].append(raref_phates)

    job_folder = get_job_folder(i_datasets_folder, 'phate')
    main_sh = write_main_sh(
        job_folder, '3_run_import_phate_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mrt.pht%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# Import for PHATE (groups config in %s)' % p_phate_config)
        else:
            print('# Import for PHATE')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(
        job_folder, '3_run_phate_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.pht%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], 'xphate', chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# PHATE (groups config in %s)' % p_phate_config)
        else:
            print('# PHATE')
        print_message('', 'sh', main_sh, jobs)
    return phates
Example no. 12
def run_songbird(p_diff_models: str, i_datasets_folder: str, datasets: dict,
                 datasets_read: dict, datasets_filt: dict,
                 train_test_dict: dict, input_to_filtered: dict,
                 mmvec_outputs: list, force: bool, prjct_nm: str,
                 qiime_env: str, chmod: str, noloc: bool, split: bool,
                 run_params: dict, filt_raref: str, jobs: bool,
                 chunkit: int) -> list:
    """
    Run songbird: Vanilla regression methods for microbiome differential abundance analysis.
    https://github.com/biocore/songbird
    Main per-dataset looper for the songbird datasets.

    :param p_diff_models: Formulas for multinomial regression-based differential abundance ranking.
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'songbird')
    job_folder2 = get_job_folder(i_datasets_folder, 'songbird/chunks')
    songbird_dicts = get_songbird_dicts(p_diff_models)
    songbird_models = songbird_dicts[0]
    songbird_filtering = songbird_dicts[1]
    unique_filtering = get_unique_filterings(songbird_filtering)

    params = songbird_dicts[2]
    models_baselines = songbird_dicts[3]
    songbird_datasets = songbird_dicts[4]
    songbird_subsets = songbird_dicts[5]

    trains = params['train']
    batches = params['batches']
    learns = params['learns']
    epochs = params['epochs']
    thresh_feats = params['thresh_feats']
    thresh_samples = params['thresh_samples']
    diff_priors = params['diff_priors']
    summary_intervals = params['summary_interval']

    filt_datasets_done, common_datasets_done = check_filtered_and_common_dataset(
        i_datasets_folder, datasets, datasets_filt, songbird_datasets, {},
        songbird_filtering, unique_filtering, 'songbird', input_to_filtered,
        songbird_subsets)

    already_computed = {}
    filt_datasets, common_datasets = make_filtered_and_common_dataset(
        i_datasets_folder, datasets, datasets_filt, datasets_read,
        songbird_datasets, train_test_dict, {}, songbird_filtering,
        unique_filtering, job_folder, force, prjct_nm, qiime_env, chmod, noloc,
        'songbird', filt_raref, filt_datasets_done, common_datasets_done,
        input_to_filtered, already_computed, songbird_subsets, jobs)

    songbird_models.update(
        dict((input_to_filtered[x], y) for x, y in songbird_models.items()
             if x in input_to_filtered))

    songbirds = {}
    for dat, filts_files in filt_datasets.items():
        for (case, filts), files in filts_files.items():
            songbirds.setdefault(dat[0], []).append(
                [case, filts, files[0], files[2], ''])
Esempio n. 13
0
def nestedness_graphs(i_datasets_folder: str, nestedness_res: dict,
                      datasets: dict, split_taxa_pds: dict,
                      datasets_rarefs: dict, colors: dict,
                      datasets_collapsed_map: dict, collapsed: dict,
                      filt_raref: str, prjct_nm: str, qiime_env: str,
                      chmod: str, noloc: bool, slurm: bool, split: bool,
                      run_params: dict, jobs: bool, chunkit: int):

    RESOURCES = pkg_resources.resource_filename("routine_qiime2_analyses",
                                                "resources")
    nestedness_graphs_fp = '%s/nestedness_graphs.py' % RESOURCES

    job_folder2 = get_job_folder(i_datasets_folder,
                                 'nestedness_figures/chunks')

    all_sh_pbs = {}
    for dat, nestedness_rarefs in nestedness_res.items():
        if not split:
            out_sh = '%s/run_nestedness_graphs_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)

        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)

        for idx, nestedness_raref in enumerate(nestedness_rarefs):
            cur_raref = datasets_rarefs[dat][idx]

            if split:
                out_sh = '%s/run_nestedness_graphs_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            out_py = out_sh.replace('.sh', '.py')

            cur_sh = '%s/run_nestedness_graphs_%s%s%s_tmp.sh' % (
                job_folder2, dat, cur_raref, filt_raref)
            cur_sh = cur_sh.replace(' ', '-')
            with open(cur_sh, 'w') as o:
                o.write('python3 %s\n' % out_py)
            all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)

            # value to edit in template
            tab_fp, meta_fp = datasets[dat][idx]
            if stats_tax_dat in split_taxa_pds:
                split_taxa_fp = split_taxa_pds[stats_tax_dat][1]
            else:
                split_taxa_fp = ''

            with open(out_py, 'w') as o, open(nestedness_graphs_fp) as f:
                for line in f:
                    line_edit = line
                    if '<DAT>' in line:
                        line_edit = line_edit.replace('<DAT>', dat)
                    if '<CUR_RAREF>' in line:
                        line_edit = line_edit.replace('<CUR_RAREF>', cur_raref)
                    if '<TAB_FP>' in line:
                        line_edit = line_edit.replace('<TAB_FP>', tab_fp)
                    if '<META_FP>' in line:
                        line_edit = line_edit.replace('<META_FP>', meta_fp)
                    if '<COLORS_SAMPLE>' in line:
                        line_edit = line_edit.replace("'<COLORS_SAMPLE>'",
                                                      str(colors['sample']))
                    if '<COLORS_FEATURE>' in line:
                        line_edit = line_edit.replace("'<COLORS_FEATURE>'",
                                                      str(colors['feature']))
                    if '<STATS_TAX_DAT>' in line:
                        line_edit = line_edit.replace('<STATS_TAX_DAT>',
                                                      stats_tax_dat)
                    if '<SPLIT_TAXA_FP>' in line:
                        line_edit = line_edit.replace('<SPLIT_TAXA_FP>',
                                                      split_taxa_fp)
                    if '<LEVEL>' in line:
                        line_edit = line_edit.replace('<LEVEL>', level)
                    if '<COLLAPSED>' in line:
                        line_edit = line_edit.replace("'<COLLAPSED>'",
                                                      str(collapsed))
                    if '<NESTEDNESS_RAREF>' in line:
                        line_edit = line_edit.replace("'<NESTEDNESS_RAREF>'",
                                                      str(nestedness_raref))
                    o.write(line_edit)

    job_folder = get_job_folder(i_datasets_folder, 'nestedness_figures')
    main_sh = write_main_sh(
        job_folder, 'run_nestedness_graphs%s' % filt_raref, all_sh_pbs,
        '%s.nstd.grph%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        print("# NESTEDNESS GRAPHS")
        print_message('', 'sh', main_sh, jobs)
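
The graphing step above fills a Python template shipped in the package resources by replacing '<PLACEHOLDER>' tokens line by line before writing the per-dataset script. A minimal, self-contained sketch of that pattern (the function name, file paths and placeholder keys below are illustrative only, not the package's actual resource):

# Minimal sketch of the placeholder-substitution pattern used above.
def fill_template(template_fp: str, out_py: str, replacements: dict) -> None:
    """Copy a template line by line, swapping '<KEY>' tokens for values."""
    with open(out_py, 'w') as o, open(template_fp) as f:
        for line in f:
            for key, value in replacements.items():
                token = '<%s>' % key
                if token in line:
                    line = line.replace(token, str(value))
            o.write(line)

# e.g. fill_template('nestedness_graphs.py', 'run_graphs.py',
#                    {'DAT': 'dataset1', 'TAB_FP': '/path/tab.tsv'})
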
                                if testing_group == 'ALL':
                                    continue
                                cur_sh = '%s/run_beta_group_significance_%s%s_%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric,
                                    subset, case, testing_group, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                run_single_perm(odir, subset, meta_pd, cur_sh,
                                                metric, case, testing_group,
                                                p_perm_tests_min, p_beta_type,
                                                qza, mat_qza, case_var,
                                                case_vals, npermutations,
                                                force)

    job_folder = get_job_folder(i_datasets_folder, 'permanova')
    main_sh = write_main_sh(
        job_folder,
        '3_run_beta_group_significance_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.prm%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# PERMANOVA (groups config in %s)" % p_perm_groups)
        else:
            print("# PERMANOVA")
        print_message('', 'sh', main_sh, jobs)

    return permanovas
Esempio n. 15
0
def run_rarefy(i_datasets_folder: str, datasets: dict, datasets_read: dict,
               datasets_phylo: dict, datasets_filt_map: dict,
               datasets_rarefs: dict, p_raref_depths: str, eval_rarefs: bool,
               force: bool, prjct_nm: str, qiime_env: str, chmod: str,
               noloc: bool, run_params: dict, filt_raref: str, filt_only: bool,
               jobs: bool, slurm: bool, chunkit: int) -> dict:
    """
    Run rarefy: Rarefy table.
    https://docs.qiime2.org/2019.10/plugins/available/feature-table/rarefy/

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    :return: evaluation rarefaction depths per dataset (dict).
    """

    evaluation = ''
    eval_depths = {}
    datasets_raref_depths, datasets_raref_evals = check_rarefy_need(
        i_datasets_folder, datasets_read, p_raref_depths)
    if eval_rarefs:
        evaluation = '_eval'

    set_filt_rarefy(datasets_raref_depths, datasets_filt_map)

    datasets_update = {}
    datasets_read_update = {}
    datasets_phylo_update = {}
    datasets_append = {}

    main_written = 0
    job_folder = get_job_folder(i_datasets_folder, 'rarefy%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'rarefy%s/chunks' % evaluation)
    to_chunk = []
    run_pbs = '%s/1_run_rarefy_%s%s%s.sh' % (job_folder, prjct_nm, evaluation,
                                             filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():

            written = 0
            if dat not in datasets_raref_depths:
                continue
            if filt_only and dat not in datasets_filt_map:
                continue

            odir = get_analysis_folder(i_datasets_folder,
                                       'rarefy%s/%s' % (evaluation, dat))
            out_sh = '%s/run_rarefy_%s%s_%s.sh' % (job_folder2, prjct_nm,
                                                   evaluation, dat)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:

                depths = datasets_raref_depths[dat][1]
                if eval_rarefs:
                    depths = datasets_raref_evals[dat]

                tsv_pd, meta_pd = datasets_read[dat][0]
                tsv_sums = tsv_pd.sum()
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    for depth_ in depths:
                        depth = get_digit_depth(depth_, tsv_sums)
                        dat_raref = '%s_raref%s%s' % (dat, evaluation,
                                                      str(depth))
                        meta_out = '%s/meta_%s.tsv' % (odir, dat_raref)
                        remaining_samples = tsv_sums[
                            tsv_sums >= depth].index.tolist()
                        meta_raref_pd = meta_pd.loc[
                            meta_pd.sample_name.isin(remaining_samples), :]
                        meta_raref_pd.to_csv(meta_out, index=False, sep='\t')

                        qza = tsv.replace('.tsv', '.qza')
                        qza_out = '%s/tab_%s.qza' % (odir, dat_raref)
                        tsv_out = '%s.tsv' % splitext(qza_out)[0]
                        if force or not os.path.isfile(tsv_out):
                            cmd = write_rarefy(qza, qza_out, depth)
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            cmd = run_export(qza_out, tsv_out,
                                             'FeatureTable[Frequency]')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            main_written += 1
                            written += 1

                        if eval_rarefs:
                            eval_depths.setdefault(dat, []).append(
                                '%s_%s' % (dat, str(depth)))
                            datasets_update['%s_%s' % (dat, str(depth))] = [[
                                tsv_out, meta_out
                            ]]
                            datasets_read_update['%s_%s' %
                                                 (dat, str(depth))] = (
                                                     'raref', str(depth))
                            datasets_phylo_update[
                                '%s_%s' %
                                (dat, str(depth))] = datasets_phylo[dat]
                        else:
                            datasets_append.setdefault(dat, []).append(
                                [tsv_out, meta_out])
                            if isfile(tsv_out) and isfile(meta_out):
                                tab_filt_pd = pd.read_csv(tsv_out,
                                                          index_col=0,
                                                          header=0,
                                                          sep='\t')
                                with open(meta_out) as f:
                                    for line in f:
                                        break
                                meta_filt_pd = pd.read_csv(
                                    meta_out,
                                    header=0,
                                    sep='\t',
                                    dtype={line.split('\t')[0]: str},
                                    low_memory=False)
                                datasets_read[dat].append(
                                    [tab_filt_pd, meta_filt_pd])
                            else:
                                datasets_read[dat].append(
                                    ('raref', str(depth)))
                            datasets_rarefs.setdefault(dat, []).append(
                                '_raref%s%s' % (evaluation, str(depth)))

            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(
                    out_sh, out_pbs,
                    '%s.bt%s.%s%s' % (prjct_nm, evaluation, dat, filt_raref),
                    qiime_env, run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o, noloc,
                    slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'rarefy%s' % evaluation,
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Get rarefied datasets', 'sh', run_pbs, jobs)

    if eval_rarefs:
        datasets.update(datasets_update)
        datasets_read.update(datasets_read_update)
        datasets_phylo.update(datasets_phylo_update)
    else:
        for dat, fps in datasets_append.items():
            datasets[dat].extend(fps)

    return eval_depths
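
For each requested depth, the loop above keeps only the samples whose total count reaches that depth and writes a matching metadata subset. A small pandas sketch of that selection (invented data; samples as columns and features as rows, which is what the per-column sums above imply):

import pandas as pd

# Illustrative feature table: rows = features, columns = samples.
tab = pd.DataFrame({'s1': [10, 5, 0], 's2': [1, 1, 0], 's3': [50, 20, 30]},
                   index=['featA', 'featB', 'featC'])
meta = pd.DataFrame({'sample_name': ['s1', 's2', 's3'], 'group': ['a', 'b', 'a']})

depth = 15
sample_sums = tab.sum()  # per-sample totals
remaining = sample_sums[sample_sums >= depth].index.tolist()
meta_raref = meta.loc[meta.sample_name.isin(remaining), :]  # metadata for kept samples
print(remaining)  # ['s1', 's3'] -- s2 does not reach the depth
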
                                    case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh, mat_qza,
                                    case, modes, force, run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)

    job_folder = get_job_folder(i_datasets_folder, 'decay')
    main_sh = write_main_sh(
        job_folder, '3_run_decay_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.prm%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_distance_decay:
            print("# decay (config in %s)" % p_distance_decay)
        else:
            print("# decay")
        print_message('', 'sh', main_sh, jobs)

    return decay_res

def run_mantel(i_datasets_folder: str, datasets_filt: dict, p_mantel: str,
               betas: dict, force: bool, prjct_nm: str, qiime_env: str,
               chmod: str, noloc: bool, slurm: bool, split: bool,
               run_params: dict, filt_raref: str, filt_only: bool,
               eval_depths: dict, jobs: bool, chunkit: int) -> None:
    """
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        mantel_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                mantel_pairs['%s_%s' % (n0, n1)] = [x, y]
        mantel_subsets = {'ALL': [[]]}
    else:
        mantel_pairs, mantel_subsets = get_procrustes_mantel_dicts(p_mantel)

    get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)

    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in mantel_pairs.items():

        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)

        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'mantel%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_mantel_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in metrics_groups_metas_qzas_dms_trees1.items(
        ):
            if split:
                out_sh = '%s/run_mantel_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = metrics_groups_metas_qzas_dms_trees2[
                metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_

                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]

                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print(
                        '[Mantels] One desired rarefaction depth not run (pair %s)'
                        % pair)
                    continue

                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue

                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(meta1, meta_pd,
                                                       dict(mantel_subsets),
                                                       'mantel')
                odir = get_analysis_folder(
                    i_datasets_folder, 'mantel%s/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder, 'mantel%s/chunks/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))

                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_mantel%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)

                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        mantel_out = '%s/mantel%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel('mantel', odir, dm1, dm2,
                                                     meta_pd, dm_out1, dm_out2,
                                                     mantel_out, cur_sh, cur,
                                                     case_var, case_vals,
                                                     force)

    job_folder = get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)
    main_sh = write_main_sh(
        job_folder, '4_run_mantel_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.mntl%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_mantel and p_mantel != 1:
            if p_mantel.startswith('/panfs'):
                p_mantel = p_mantel.replace(os.getcwd(), '')
            print('# Mantels (pairs and samples subsets config in %s)' %
                  p_mantel)
        else:
            print('# Mantels')
        print_message('', 'sh', main_sh, jobs)
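
When evaluation depths are supplied, the function builds the Mantel pairs from consecutive rarefaction depths of the same dataset, sorted numerically on the trailing depth suffix. A minimal sketch of that pairing logic with made-up names:

eval_depths = {'dataset1': ['dataset1_1000', 'dataset1_500', 'dataset1_2000']}

mantel_pairs = {}
for dat, depths in eval_depths.items():
    # sort on the numeric depth suffix, then pair each depth with the next one
    sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
    for idx, x in enumerate(sorted_depths[:-1]):
        y = sorted_depths[idx + 1]
        mantel_pairs['%s_%s' % (x.split('_')[-1], y.split('_')[-1])] = [x, y]

print(mantel_pairs)
# {'500_1000': ['dataset1_500', 'dataset1_1000'],
#  '1000_2000': ['dataset1_1000', 'dataset1_2000']}
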
def run_procrustes(i_datasets_folder: str, datasets_filt: dict,
                   p_procrustes: str, betas: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str,
                   filt_only: bool, eval_depths: dict, jobs: bool,
                   chunkit: int) -> None:
    """
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        procrustes_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                procrustes_pairs['%s_%s' % (n0, n1)] = [x, y]
        procrustes_subsets = {'ALL': [[]]}
    else:
        procrustes_pairs, procrustes_subsets = get_procrustes_mantel_dicts(
            p_procrustes)
    get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    dms_tab = []
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in procrustes_pairs.items():

        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)

        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'procrustes%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_procrustes_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in metrics_groups_metas_qzas_dms_trees1.items(
        ):
            if split:
                out_sh = '%s/run_procrustes_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = metrics_groups_metas_qzas_dms_trees2[
                metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_

                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]

                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print(
                        '[Procrustes] One desired rarefaction depth not run (pair %s)'
                        % pair)
                    continue

                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue

                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(procrustes_subsets), 'procrustes')
                odir = get_analysis_folder(
                    i_datasets_folder, 'procrustes%s/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder, 'procrustes%s/chunks/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_procrustes%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)

                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        dm_out1_tsv = '%s.tsv' % splitext(dm_out1)[0]
                        dm_out2_tsv = '%s.tsv' % splitext(dm_out2)[0]
                        biplot = '%s/procrustes%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel('procrustes', odir, dm1,
                                                     dm2, meta_pd, dm_out1,
                                                     dm_out2, biplot, cur_sh,
                                                     cur, case_var, case_vals,
                                                     force)
                        dms_tab.append([
                            pair, dat1_, dat2_, group1, group2, case_, metric,
                            dm_out1_tsv, dm_out2_tsv
                        ])

    job_folder = get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_procrustes_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.prcst%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_procrustes and p_procrustes != 1:
            if p_procrustes.startswith('/panfs'):
                p_procrustes = p_procrustes.replace(os.getcwd(), '')
            print('# Procrustes (pairs and samples subsets config in %s)' %
                  p_procrustes)
        else:
            print('# Procrustes')
        print_message('', 'sh', main_sh, jobs)

    dms_tab_pd = pd.DataFrame(dms_tab,
                              columns=[
                                  'pair',
                                  'dat1',
                                  'dat2',
                                  'group1',
                                  'group2',
                                  'case',
                                  'metric',
                                  'dm_out1',
                                  'dm_out2',
                              ])

    odir = get_analysis_folder(i_datasets_folder,
                               'procrustes%s/R' % evaluation)
    out_Rs = glob.glob('%s/pairs_proscrustes_results%s%s*.tsv' %
                       (odir, evaluation, filt_raref))
    if len(out_Rs):
        done_R = pd.concat([pd.read_table(x, sep=' ') for x in out_Rs])
        dms_tab_pd = dms_tab_pd.loc[~dms_tab_pd[['dm_out1', 'dm_out2']].sum(1).
                                    isin(done_R[['f1', 'f2']].sum(1))]

    if dms_tab_pd.shape[0]:
        fp_num = 0
        if len(out_Rs):
            last = sorted(
                out_Rs, key=lambda fp: int(fp.split('.tsv')[0].split('_')[-1]))
            fp_num = int(last[-1].split('.tsv')[0].split('_')[-1]) + 1

        dms_tab_fp = '%s/pairs%s%s_%s.tsv' % (odir, evaluation, filt_raref,
                                              fp_num)
        dms_tab_pd.to_csv(dms_tab_fp, index=False, sep='\t')
        out_R = '%s/pairs_proscrustes_results%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)
        job_folder = get_job_folder(i_datasets_folder, 'procrustes/R')
        R_script = '%s/4_run_procrustes_%s%s.R' % (job_folder, prjct_nm,
                                                   filt_raref)
        with open(R_script, 'w') as o:
            o.write("library(vegan)\n")
            o.write("dms_files <- read.table('%s', h=T)\n" % dms_tab_fp)
            o.write(
                "cols <- c('pair', 'd1', 'd2', 'g1', 'g2', 'case', 'metric', 'f1', 'f2', 'samples', 'M2', 'p-value')\n"
            )
            o.write(
                "res <- setNames(data.frame(matrix(ncol = 12, nrow = 0)), cols)\n"
            )
            o.write("for (i in seq(1, dim(dms_files)[1])) {\n")
            o.write("    row <- as.vector(unlist(dms_files[i,]))\n")
            o.write("    pair <- row[1]\n")
            o.write("    d1 <- row[2]\n")
            o.write("    d2 <- row[3]\n")
            o.write("    group1 <- row[4]\n")
            o.write("    group2 <- row[5]\n")
            o.write("    case <- row[6]\n")
            o.write("    metric <- row[7]\n")
            o.write("    f1 <- row[8]\n")
            o.write("    f2 <- row[9]\n")
            o.write("    if (sum(file.exists(f1, f2)) == 2) {\n")
            o.write(
                "        filin_tsv_pd1 <- read.csv(f1, header = TRUE, check.names=FALSE,\n"
            )
            o.write(
                "                                  row.names = 1, colClasses = 'character', sep = '\\t')\n"
            )
            o.write(
                "        filin_tsv_pd2 <- read.csv(f2, header = TRUE, check.names=FALSE,\n"
            )
            o.write(
                "                                  row.names = 1, colClasses = 'character', sep = '\\t')\n"
            )
            o.write("        filin_tsv_pd1 <- data.matrix(filin_tsv_pd1)\n")
            o.write("        filin_tsv_pd2 <- data.matrix(filin_tsv_pd2)\n")
            o.write(
                "        filin_tsv_pd1 <- filin_tsv_pd1[rownames(filin_tsv_pd2), rownames(filin_tsv_pd2)]\n"
            )
            o.write(
                "        # procrustes12 <- procrustes(filin_tsv_pd1, filin_tsv_pd2, kind=2, permutations=999)\n"
            )
            o.write(
                "        prtst <- protest(filin_tsv_pd1, filin_tsv_pd2, permutations = 999)\n"
            )
            o.write("        n <- dim(filin_tsv_pd1)[1]\n")
            o.write(
                "        res[i,] <- c(pair, d1, d2, group1, group2, case, metric, f1, f2, n, prtst$ss, prtst$signif)\n"
            )
            o.write("    }\n")
            o.write("}\n")
            o.write("write.table(x = res, file = '%s')\n" % out_R)

        out_sh = '%s/4_run_procrustes_%s%s_R%s.sh' % (job_folder, prjct_nm,
                                                      evaluation, filt_raref)
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
        with open(out_sh, 'w') as o:
            o.write('R -f %s --vanilla\n' % R_script)

        run_xpbs(
            out_sh, out_pbs,
            '%s.prcrt%s.R%s' % (prjct_nm, evaluation, filt_raref), 'renv',
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"], chmod, 1,
            '# Procrustes for stats in R (pairs and samples subsets config in %s)'
            % p_procrustes, None, False, jobs)
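
Before writing a new pairs table, the function drops the distance-matrix pairs whose results already appear in previous 'pairs_proscrustes_results*.tsv' outputs, by comparing the concatenation of the two file-path columns. A small pandas sketch of that de-duplication (column names follow the code above; the paths are invented):

import pandas as pd

# Pairs scheduled for Procrustes/protest (invented paths).
dms_tab_pd = pd.DataFrame({'dm_out1': ['a.tsv', 'b.tsv'],
                           'dm_out2': ['x.tsv', 'y.tsv']})
# Results already computed by a previous run ('f1'/'f2' as in the R output).
done_R = pd.DataFrame({'f1': ['a.tsv'], 'f2': ['x.tsv']})

# Concatenate the two path columns row-wise and keep only unseen pairs.
todo = dms_tab_pd.loc[~dms_tab_pd[['dm_out1', 'dm_out2']].sum(axis=1)
                      .isin(done_R[['f1', 'f2']].sum(axis=1))]
print(todo)  # only the ('b.tsv', 'y.tsv') pair remains
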
Esempio n. 19
0
def run_barplot(i_datasets_folder: str, datasets: dict, taxonomies: dict,
                force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                noloc: bool, slurm: bool, run_params: dict, filt_raref: str,
                jobs: bool, chunkit: int) -> None:
    """Visualize taxonomy with an interactive bar plot.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    taxonomies : dict
        Mapping dataset name -> [classification_method, tax_qza, tax_tsv]
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Short nick name for your project
    qiime_env : str
        Name of the qiime2 conda environment
    chmod : str
        Whether to change permission of output files (default: 744)
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------

    """
    job_folder = get_job_folder(i_datasets_folder, 'barplot')
    job_folder2 = get_job_folder(i_datasets_folder, 'barplot/chunks')

    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_barplot_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            out_sh = '%s/run_barplot_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                    filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    if dat not in taxonomies:
                        continue
                    method, tax_qza, tax_tsv = taxonomies[dat]
                    if not method:
                        method = 'taxofromfile'
                    qza = '%s.qza' % splitext(tsv)[0]
                    odir = get_analysis_folder(i_datasets_folder,
                                               'barplot/%s' % dat)
                    out_qzv = '%s/bar_%s_%s.qzv' % (odir, dat, method)
                    if force or not isfile(out_qzv):
                        write_barplots(out_qzv, qza, meta, tax_qza, cur_sh)
                        written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.brplt.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'barplot', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if written:
        print_message('# Make sample composition barplots', 'sh', run_pbs,
                      jobs)
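
write_barplots is expected to wrap the standard 'qiime taxa barplot' call for each table / taxonomy / metadata trio. A hedged sketch of the command it would emit (the helper name below is hypothetical; the CLI options shown are the documented qiime2 taxa plugin ones):

def write_barplot_cmd(out_qzv: str, qza: str, meta: str, tax_qza: str) -> str:
    """Assumed equivalent of write_barplots: return a qiime taxa barplot command."""
    cmd = 'qiime taxa barplot'
    cmd += ' --i-table %s' % qza
    cmd += ' --i-taxonomy %s' % tax_qza
    cmd += ' --m-metadata-file %s' % meta
    cmd += ' --o-visualization %s' % out_qzv
    return cmd

# e.g. write_barplot_cmd('bar_dataset1.qzv', 'tab.qza', 'meta.tsv', 'tax.qza')
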
Esempio n. 20
0
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str, jobs: bool,
                   chunkit: int) -> (dict, list, dict):

    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')

    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print(
            'Must provide the path to the Nestedness software (containing bin/Autocorrelation.jar)'
        )
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and isfile(
            nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print(
                'Must provide the path to the Nestedness software (containing bin/Autocorrelation.jar)'
            )
            return {}

    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)

    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                       dat, filt_raref)

        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)

        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in metrics_groups_metas_qzas_dms_trees.items(
            ):
                for group, metas_qzas_mat_qzas_trees in groups_metas_qzas_dms_trees.items(
                ):

                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            # print("case", case)
                            all_sh_pbs.setdefault((dat, out_sh),
                                                  []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd, nodfs,
                                nulls, modes, cur_sh, qza, case, case_var,
                                case_vals, binary, params, force)
                            nodfs_fps.setdefault(stats_tax_dat,
                                                 []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                break
            nestedness_res[dat].append(nestedness_raref)
Esempio n. 21
0
                        cur_sh = '%s/run_songbird_%s_%s_%s_%s_%s_%s.sh' % (
                            job_folder2, dat_pair, filt, case, modx, mdx, idx)
                        cur_sh = cur_sh.replace(' ', '-')

                        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                        diffs, tensor_html = run_single_songbird(
                            odir, odir_base, qza, new_qza, new_meta, cur_sh,
                            force, batch, learn, epoch, diff_prior,
                            thresh_feat, thresh_sample, formula, train_column,
                            summary_interval, metadatas, baselines,
                            model_baseline, baseline_formula)
                        songbird_outputs.append([
                            dat, filt,
                            '%s_%s' % (params.replace('/', '__'), model), case,
                            diffs, model_baseline, tensor_html, pair
                        ])

    job_folder = get_job_folder(i_datasets_folder, 'songbird')
    main_sh = write_main_sh(
        job_folder, '2_songbird_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.sngbrd%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, jobs, chunkit)
    if main_sh:
        if p_diff_models.startswith('/panfs'):
            p_diff_models = p_diff_models.replace(os.getcwd(), '')
        print_message("# Songbird (configs in %s)" % p_diff_models, 'sh',
                      main_sh, jobs)
    return songbird_outputs
Esempio n. 22
0
def run_qemistree(i_datasets_folder: str, datasets: dict, prjct_nm: str,
                  i_qemistree: str, taxonomies: dict, force: bool,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> None:
    """
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param prjct_nm: Short nick name for your project.
    :param i_qemistree: path to qemistree folder (feature-data and tree).
    :param taxonomies: dataset -> [method, assignment qza]
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    """

    job_folder = get_job_folder(i_datasets_folder, 'qemistree')
    job_folder2 = get_job_folder(i_datasets_folder, 'qemistree/chunks')

    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_qemistree_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds in datasets.items():
            feature_data = '%s/feature-data_%s.qza' % (i_qemistree, dat)
            qemistree = '%s/qemistree_%s.qza' % (i_qemistree, dat)
            if not isfile(feature_data) or not isfile(qemistree):
                continue
            out_sh = '%s/run_qemistree_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            odir = get_analysis_folder(i_datasets_folder, 'qemistree/%s' % dat)
            classyfire_qza = '%s/%s-classyfire.qza' % (odir, dat)
            classyfire_tsv = '%s.tsv' % splitext(classyfire_qza)[0]
            with open(out_sh, 'w') as cur_sh:
                if force or not isfile(classyfire_tsv):
                    write_qemistree(feature_data, classyfire_qza,
                                    classyfire_tsv, qemistree,
                                    cur_sh)
                    written += 1

            if isfile(classyfire_tsv):
                odir = get_analysis_folder(i_datasets_folder, 'taxonomy/%s' % dat)
                out_rad = '%s/tax_%s' % (odir, dat)
                tax_qza = '%s.qza' % out_rad
                tax_tsv = '%s.tsv' % out_rad
                classyfire_pd = pd.read_csv(classyfire_tsv, header=0, sep='\t')
                # use a distinct handle so the run_pbs handle 'o' is not shadowed
                with open(tax_tsv, 'w') as o_tax:
                    cols = ['id', 'kingdom', 'superclass', 'class', 'subclass', 'direct_parent']
                    o_tax.write('Feature ID\tTaxon\n')
                    for row in classyfire_pd[cols].values:
                        o_tax.write('%s\t%s\n' % (row[0], '; '.join(row[1:])))
                run_export(tax_tsv, tax_qza, 'FeatureData[Taxonomy]')
                taxonomies[dat] = ['direct_parent', tax_qza]
                written += 1
            else:
                print('[Warning] Maybe run qemistree first and then re-run the pipeline to '
                      'have the classyfire taxonomy included in the barplots!')

            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.qmstr.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'qemistree',
                      prjct_nm, run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                      run_params["mem_num"], run_params["mem_dim"],
                      qiime_env, chmod, noloc, slurm, jobs, chunkit, None)

    if written:
        print_message('# Make qemistree classyfire classifications', 'sh', run_pbs, jobs)
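
The classyfire levels exported by Qemistree are reshaped above into a two-column QIIME 2 taxonomy file ('Feature ID' / 'Taxon'), joining the chemical hierarchy with '; '. A minimal sketch of that conversion on invented data:

import pandas as pd

# Invented classyfire export: one row per feature, one column per level.
classyfire_pd = pd.DataFrame({
    'id': ['feat1', 'feat2'],
    'kingdom': ['Organic compounds', 'Organic compounds'],
    'superclass': ['Lipids', 'Benzenoids'],
    'class': ['Fatty Acyls', 'Benzene and substituted derivatives'],
    'subclass': ['Fatty acids', 'Phenols'],
    'direct_parent': ['Long-chain fatty acids', 'Alkylphenols'],
})

cols = ['id', 'kingdom', 'superclass', 'class', 'subclass', 'direct_parent']
with open('tax_example.tsv', 'w') as o_tax:
    o_tax.write('Feature ID\tTaxon\n')
    for row in classyfire_pd[cols].values:
        # feature id first, then the levels joined as a single lineage string
        o_tax.write('%s\t%s\n' % (row[0], '; '.join(row[1:])))
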
Esempio n. 23
0
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            # print("case", case)
                            all_sh_pbs.setdefault((dat, out_sh),
                                                  []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd, nodfs,
                                nulls, modes, cur_sh, qza, case, case_var,
                                case_vals, binary, params, force)
                            nodfs_fps.setdefault(stats_tax_dat,
                                                 []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                break
            nestedness_res[dat].append(nestedness_raref)

    job_folder = get_job_folder(i_datasets_folder, 'nestedness')
    main_sh = write_main_sh(
        job_folder, '3_run_nestedness_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.prm%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_nestedness_groups:
            print("# nestedness (config in %s)" % p_nestedness_groups)
        else:
            print("# nestedness")
        print_message('', 'sh', main_sh, jobs)

    return nestedness_res, colors, nodfs_fps

def run_sepp(i_datasets_folder: str, datasets: dict, datasets_read: dict,
             datasets_phylo: dict, datasets_rarefs: dict, prjct_nm: str,
             i_sepp_tree: str, trees: dict, force: bool, qiime_env: str,
             chmod: str, noloc: bool, slurm: bool, run_params: dict,
             filt_raref: str, jobs: bool) -> None:
    """
    Run SEPP on the datasets composed of 16S deblur sequences (e.g. from redbiom/Qiita).

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param prjct_nm: Short nick name for your project.
    :param i_sepp_tree: database to use for sepp phylogeny reads placement.
    :param trees: to be updated with the tree to use for a dataset's phylogenetic analyses.
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    """
    # check whether there's dataset(s) that may use the reference tree (i.e. features are DNA sequences)
    sepp_datasets = [dat for dat, (tree, correction) in datasets_phylo.items() if tree == 'amplicon']
    if len(sepp_datasets):
        ref_tree_qza = get_sepp_tree(i_sepp_tree)

        job_folder = get_job_folder(i_datasets_folder, 'phylo')
        job_folder2 = get_job_folder(i_datasets_folder, 'phylo/chunks')
        main_written = 0
        main_sh = '%s/1_run_sepp_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, tsv_metas_fps_ in datasets.items():
                written = 0
                if dat not in sepp_datasets:
                    continue
                out_sh = '%s/run_sepp_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, filt_raref)
                if slurm:
                    out_pbs = '%s.slm' % splitext(out_sh)[0]
                else:
                    out_pbs = '%s.pbs' % splitext(out_sh)[0]
                with open(out_sh, 'w') as cur_sh:
                    for idx, tsv_metas_fps in enumerate(tsv_metas_fps_):
                        tsv, meta = tsv_metas_fps
                        if not isinstance(datasets_read[dat][idx][0], pd.DataFrame) and datasets_read[dat][idx][0] == 'raref':
                            qza_raw_in = '%s/data/tab_%s_inTree.qza' % (i_datasets_folder, dat)
                            if isfile(qza_raw_in) and not force:
                                odir_sepp = get_analysis_folder(i_datasets_folder, 'phylo/%s' % dat)
                                out_fp_sepp_tree = '%s/tree_%s.qza' % (odir_sepp, dat)
                                # if idx:
                                #     trees[dat].append((qza_raw_in, out_fp_sepp_tree))
                                # else:
                                #     trees[dat] = [(qza_raw_in, out_fp_sepp_tree)]
                                if not idx:
                                    trees[dat] = (qza_raw_in, out_fp_sepp_tree)
                                print('Using the non-rarefied tree (no need to recompute)...')
                                continue
                            elif not isfile(tsv):
                                print('Must have run rarefaction to use it further...\nExiting')
                                sys.exit(0)
                            tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                            datasets_read[dat][idx] = [tsv_pd, meta_pd]
                        else:
                            tsv_pd, meta_pd = datasets_read[dat][idx]

                        qza = '%s.qza' % splitext(tsv)[0]
                        if not isfile(qza):
                            print('Need to first import %s to .qza to do reads placement '
                                  '(see "# Import tables to qiime2")\nExiting...' % tsv)
                            sys.exit(0)

                        cur_raref = datasets_rarefs[dat][idx]
                        qza_in = '%s_inTree%s.qza' % (splitext(tsv)[0], cur_raref)
                        qza_in_tsv = '%s.tsv' % splitext(qza_in)[0]
                        qza_out = '%s_notInTree%s.qza' % (splitext(tsv)[0], cur_raref)

                        odir_seqs = get_analysis_folder(i_datasets_folder, 'seqs/%s' % dat)
                        odir_sepp = get_analysis_folder(i_datasets_folder, 'phylo/%s' % dat)

                        out_fp_seqs_rad = '%s/seq_%s%s' % (odir_seqs, dat, cur_raref)
                        out_fp_seqs_fasta = '%s.fasta' % out_fp_seqs_rad
                        out_fp_seqs_qza = '%s.qza' % out_fp_seqs_rad

                        out_fp_sepp_tree = '%s/tree_%s%s.qza' % (odir_sepp, dat, cur_raref)
                        # if idx:
                        #     trees[dat].append((qza_in, out_fp_sepp_tree))
                        # else:
                        #     trees[dat] = [(qza_in, out_fp_sepp_tree)]
                        if not idx:
                            trees[dat] = (qza_in, out_fp_sepp_tree)

                        written = 0
                        if force or not isfile(out_fp_seqs_qza):
                            cmd = write_seqs_fasta(out_fp_seqs_fasta,
                                                   out_fp_seqs_qza, tsv_pd)
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            written += 1
                            main_written += 1
                        if force or not isfile(out_fp_sepp_tree) or not isfile(qza_in_tsv):
                            cmd = write_fragment_insertion(
                                out_fp_seqs_qza, ref_tree_qza,
                                out_fp_sepp_tree, qza,
                                qza_in, qza_in_tsv, qza_out)
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            written += 1
                            main_written += 1
                run_xpbs(out_sh, out_pbs, '%s.spp.%s%s' % (prjct_nm, dat, filt_raref), qiime_env,
                         run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                         run_params["mem_num"], run_params["mem_dim"],
                         chmod, written, 'single', main_o, noloc, slurm, jobs)
        if main_written:
            print_message("# Fragment insertion using SEPP (%s)" % ', '.join(sepp_datasets), 'sh', main_sh, jobs)
def run_permanova(i_datasets_folder: str, betas: dict,
                  main_testing_groups: tuple, p_perm_tests_min: int,
                  p_beta_type: tuple, datasets_rarefs: dict,
                  p_perm_groups: str, force: bool, prjct_nm: str,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  split: bool, run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> dict:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the PERMANOVA tests on beta diversity matrices.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: list of datasets.
    :param betas: beta diversity matrices.
    :param main_testing_groups: groups to test.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (defalt: 775).
    """
    permanovas = {}
    job_folder2 = get_job_folder(i_datasets_folder, 'permanova/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)

    npermutations = 999

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        permanovas[dat] = []
        if not split:
            out_sh = '%s/run_beta_group_significance_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'permanova/%s%s' % (dat, cur_depth))
            for metric, subset_files in metric_groups_metas_qzas_dms_trees.items(
            ):
                permanovas.setdefault(dat, []).append(metric)
                if split:
                    out_sh = '%s/run_beta_group_significance_%s_%s_%s%s.sh' % (
                        job_folder2, prjct_nm, dat, metric, filt_raref)
                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    (meta, qza, mat_qza, tree) = metas_qzas_mat_qzas_trees[0]
                    if not isfile(mat_qza):
                        if not first_print:
                            print(
                                'Beta diversity distance matrices must already be generated to automate PERMANOVA\n'
                                '\t(re-run this after steps "2_run_beta.sh" and "2x_run_beta_export.pbs" are done)'
                            )
                            first_print += 1
                        continue
                    if (dat, subset) not in metric_check:
                        meta_pd = read_meta_pd(meta)
                        meta_pd = meta_pd.set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(main_cases_dict), 'PERMANOVA')
                        testing_groups = check_metadata_testing_groups(
                            meta, meta_pd, main_testing_groups,
                            p_perm_tests_min, 'PERMANOVA')
                        metric_check.add((dat, subset))

                    for case_var, case_vals_list in cases_dict.items():
                        testing_groups_case_var = list(
                            set(testing_groups + [case_var]))
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            for testing_group in testing_groups_case_var:
                                if testing_group == 'ALL':
                                    continue
                                cur_sh = '%s/run_beta_group_significance_%s%s_%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric,
                                    subset, case, testing_group, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                run_single_perm(odir, subset, meta_pd, cur_sh,
                                                metric, case, testing_group,
                                                p_perm_tests_min, p_beta_type,
                                                qza, mat_qza, case_var,
                                                case_vals, npermutations,
                                                force)
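
Each run_single_perm() call above is expected to write one beta-group-significance command per (metric, subset, case, testing group). A minimal sketch of that command (an assumption about the generated shell lines, not the package's helper):

def sketch_beta_group_significance_cmd(mat_qza, meta_fp, testing_group,
                                       out_qzv, beta_type='permanova',
                                       permutations=999):
    # one PERMANOVA (or ANOSIM/PERMDISP) test per metadata column and subset
    return ('qiime diversity beta-group-significance '
            '--i-distance-matrix %s --m-metadata-file %s '
            '--m-metadata-column %s --p-method %s --p-permutations %s '
            '--p-pairwise --o-visualization %s'
            % (mat_qza, meta_fp, testing_group, beta_type, permutations,
               out_qzv))
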
Example n. 26
0
def run_deicode(i_datasets_folder: str, datasets: dict, datasets_rarefs: dict,
                p_perm_groups: str, force: bool, prjct_nm: str, qiime_env: str,
                chmod: str, noloc: bool, slurm: bool, run_params: dict,
                filt_raref: str, jobs: bool, chunkit: int) -> None:
    """
    Performs robust center log-ratio transform robust PCA and
    ranks the features by the loadings of the resulting SVD.
    https://library.qiime2.org/plugins/deicode/19/
    Main per-dataset looper for the ADONIS tests on beta diversity matrices.

    :param i_data_sets_folder: Path to the folder containing the data/metadata subfolders.
    :param data_sets: list of data_sets.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (defalt: 775).
    """
    job_folder2 = get_job_folder(i_datasets_folder, 'deicode/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    # jobs = []
    all_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        out_sh = '%s/run_deicode_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                filt_raref)
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            cur_raref = datasets_rarefs[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_alphas = meta.replace('.tsv', '_alphas.tsv')
            meta_alphas_full = meta.replace('.tsv', '_alphas_full.tsv')
            if isfile(meta_alphas_full):
                meta = meta_alphas_full
            elif isfile(meta_alphas):
                meta = meta_alphas
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'DEICODE')
            odir = get_analysis_folder(i_datasets_folder,
                                       'deicode/%s%s' % (dat, cur_raref))
            for case_var, case_vals_list in cases_dict.items():
                cur_sh = '%s/run_beta_deicode_%s_%s%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, case_var,
                    filt_raref)
                cur_sh = cur_sh.replace(' ', '-')
                all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                run_single_deicode(odir, tsv, meta_pd, case_var,
                                   case_vals_list, cur_sh, force)

    job_folder = get_job_folder(i_datasets_folder, 'deicode')
    main_sh = write_main_sh(
        job_folder, '3_run_beta_deicode_%s%s' % (filt_raref, prjct_nm),
        all_sh_pbs, '%s.dcd%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            if p_perm_groups.startswith('/panfs'):
                p_perm_groups = p_perm_groups.replace(os.getcwd(), '')
            print('# DEICODE (groups config in %s)' % p_perm_groups)
        else:
            print('# DEICODE')
        print_message('', 'sh', main_sh, jobs)
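
A minimal sketch of the two commands each run_single_deicode() call above is assumed to emit for a metadata subset: a robust Aitchison PCA (q2-deicode "rpca"), then a PERMANOVA on the resulting distance matrix. Flag names and cut-off values are assumptions:

def sketch_deicode_cmds(qza, meta_fp, case_var, odir):
    biplot = '%s/ordination.qza' % odir
    dm = '%s/distance_matrix.qza' % odir
    qzv = '%s/permanova.qzv' % odir
    rpca = ('qiime deicode rpca --i-table %s '
            '--p-min-feature-count 10 --p-min-sample-count 500 '
            '--o-biplot %s --o-distance-matrix %s' % (qza, biplot, dm))
    # group significance on the robust Aitchison distances
    perm = ('qiime diversity beta-group-significance --i-distance-matrix %s '
            '--m-metadata-file %s --m-metadata-column %s '
            '--p-method permanova --o-visualization %s'
            % (dm, meta_fp, case_var, qzv))
    return [rpca, perm]
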
Example n. 27
0
def run_sourcetracking(i_datasets_folder: str, datasets: dict,
                       p_sourcetracking_config: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, run_params: dict,
                       filt_raref: str, split: bool, jobs: bool,
                       chunkit: int) -> None:

    job_folder2 = get_job_folder(i_datasets_folder, 'sourcetracking/chunks')
    sourcetracking_dicts = get_sourcetracking_config(p_sourcetracking_config)
    sourcetracking_sourcesink = sourcetracking_dicts[0]
    sourcetracking_filtering = sourcetracking_dicts[1]
    sourcetracking_params = sourcetracking_dicts[2]
    main_cases_dict = sourcetracking_dicts[3]

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        if dat in sourcetracking_filtering:
            filters = sourcetracking_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'sourcetracking')
            cur_raref = datasets_rarefs[dat][idx]
            out_import_sh = '%s/run_import_sourcetracking_%s_%s%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            imports = set()
            odir = get_analysis_folder(i_datasets_folder,
                                       'sourcetracking/%s' % dat)
            for method in sourcetracking_params['method']:
                out_sh = '%s/run_sourcetracking_%s_%s%s%s_%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref, method)
                for case_var, case_vals_list in cases_dict.items():
                    for filt, (fp, fa) in filters.items():
                        cur_sh = '%s/run_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_sh = cur_sh.replace(' ', '-')
                        cur_import_sh = '%s/run_import_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_import_sh = cur_import_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                        all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                     []).append(cur_import_sh)
                        run_single_sourcetracking(
                            odir, tsv, meta_pd, case_var,
                            sourcetracking_params, method, imports,
                            sourcetracking_sourcesink, case_vals_list, cur_sh,
                            cur_import_sh, force, filt, cur_raref, fp, fa,
                            run_params["n_nodes"], run_params["n_procs"])

    job_folder = get_job_folder(i_datasets_folder, 'sourcetracking')
    main_sh = write_main_sh(
        job_folder,
        '3_run_import_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mpt.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# import sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# import sourcetracking')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(
        job_folder, '3_run_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.srctrk%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit,
        '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# sourcetracking')
        print_message('', 'sh', main_sh, jobs)
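
The nested loops above only accumulate script paths; write_main_sh() later stitches them together. An illustrative view (hypothetical paths) of the accumulator it consumes:

# all_sh_pbs maps one per-dataset/per-method master script to the
# per-(case, filter) chunk scripts written by run_single_sourcetracking():
all_sh_pbs_example = {
    ('datA', '.../chunks/run_sourcetracking_prj_datA_raref_m1.sh'): [
        '.../chunks/run_sourcetracking_prj_datA_sex_raref_m1_0_0.sh',
        '.../chunks/run_sourcetracking_prj_datA_age_raref_m1_0_0.sh',
    ],
}
# write_main_sh() wraps every chunk into one "3_run_sourcetracking_*" launcher,
# scheduling one job per (dataset, master script) key.
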
Example n. 28
0
def run_adonis(p_formulas: str, i_datasets_folder: str, betas: dict,
               datasets_rarefs: dict, p_perm_groups: str, force: bool,
               prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
               slurm: bool, split: bool, run_params: dict, filt_raref: str,
               jobs: bool, chunkit: int) -> None:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the ADONIS tests on beta diversity matrices.

    :param p_formulas: formulas to test.
    :param i_data_sets_folder: Path to the folder containing the data/metadata subfolders.
    :param data_sets: list of datasets.
    :param betas: beta diversity matrices.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (defalt: 775).
    """

    job_folder2 = get_job_folder(i_datasets_folder, 'adonis/chunks')

    main_cases_dict = get_main_cases_dict(p_perm_groups)
    formulas = get_formulas_dict(p_formulas)

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0

    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        if dat not in formulas:
            continue
        if not split:
            out_sh = '%s/run_adonis_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder, 'adonis/%s%s' % (dat, cur_depth))
            for metric, subset_files in metric_groups_metas_qzas_dms_trees.items():
                if split:
                    out_sh = '%s/run_adonis_%s_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, metric, filt_raref)

                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    for meta, qza, mat_qza, tree in metas_qzas_mat_qzas_trees:
                        if not isfile(mat_qza):
                            if not first_print:
                                print('Beta diversity distance matrices must already be generated to automate ADONIS\n'
                                      '\t(re-run this after steps "2_run_beta.sh" and "2x_run_beta_export.pbs" are done)')
                                first_print += 1
                            continue

                        if (dat, subset) not in metric_check:
                            meta_pd = read_meta_pd(meta).set_index('sample_name')
                            cases_dict = check_metadata_cases_dict(meta, meta_pd, dict(main_cases_dict), 'ADONIS')
                            formulas[dat] = check_metadata_formulas(meta, meta_pd, formulas[dat], 'ADONIS')
                            metric_check.add((dat, subset))

                        for fdx, form in enumerate(formulas[dat].keys()):
                            formula = formulas[dat][form]
                            for cdx, case_var in enumerate(cases_dict.keys()):
                                case_vals_list = cases_dict[case_var]
                                cur_sh = '%s/run_adonis_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric, fdx, cdx, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                                run_single_adonis(odir, subset, case_vals_list, metric, case_var,
                                                  form, formula, qza, mat_qza, meta_pd, cur_sh, force)

    job_folder = get_job_folder(i_datasets_folder, 'adonis')
    main_sh = write_main_sh(job_folder, '3_run_adonis_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
                            '%s.dns%s' % (prjct_nm, filt_raref),
                            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                            run_params["mem_num"], run_params["mem_dim"],
                            qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# Run Adonis (groups config in %s)" % p_perm_groups)
        else:
            print("# Run Adonis")
        print_message('', 'sh', main_sh, jobs)
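
A minimal sketch (assumed q2-diversity adonis flags) of the command each run_single_adonis() call above is expected to wrap for one (metric, formula, subset) combination:

def sketch_adonis_cmd(mat_qza, meta_fp, formula, out_qzv, permutations=999):
    # adonis fits a PERMANOVA with an R-style formula on the distance matrix
    return ('qiime diversity adonis --i-distance-matrix %s '
            '--m-metadata-file %s --p-formula "%s" '
            '--p-permutations %s --o-visualization %s'
            % (mat_qza, meta_fp, formula, permutations, out_qzv))
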
def run_mmvec(p_mmvec_pairs: str, i_datasets_folder: str, datasets: dict,
              datasets_filt: dict, datasets_read: dict, train_test_dict: dict,
              force: bool, gpu: bool, standalone: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, split: bool,
              filt_raref: str, run_params: dict, input_to_filtered: dict,
              jobs: bool, chunkit: int) -> list:
    """Run mmvec: Neural networks for microbe-metabolite interaction analysis.
    https://github.com/biocore/mmvec
    Main two-datasets looper for the mmvec co-occurrences.

    Parameters
    ----------
    p_mmvec_pairs
        :param p_mmvec_pairs: Pairs of datasets for which to compute co-occurrences probabilities.
        :param p_diff_models: Formulas for multinomial regression-based differential abundance ranking.
        :param datasets: list of data_sets.
        :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
        :param datasets_read: dataset -> [tsv table, meta table] (here it updates tsv table after features correction)
        :param force: Force the re-writing of scripts for all commands.
        :param gpu: Use GPUs instead of CPUs for MMVEC.
        :param standalone:
        :param prjct_nm: Nick name for your project.
        :param qiime_env: qiime2-xxxx.xx conda environment.
        :param chmod: whether to change permission of output files (default: 644).
    i_datasets_folder
    datasets
    datasets_filt
    datasets_read
    train_test_dict
    force
    gpu
    standalone
    prjct_nm
    qiime_env
    chmod
    noloc
    split
    filt_raref
    run_params
    input_to_filtered
    jobs
    chunkit

    Returns
    -------

    """
    mmvec_dicts = get_mmvec_dicts(p_mmvec_pairs)
    mmvec_pairs = mmvec_dicts[0]
    mmvec_filtering = mmvec_dicts[1]
    mmvec_params = mmvec_dicts[2]
    mmvec_subsets = mmvec_dicts[3]
    unique_datasets = list(
        set([dat for pair_dats in mmvec_pairs.values() for dat in pair_dats]))
    unique_filterings = get_unique_mmvec_filtering(mmvec_filtering)

    filt_datasets_done, common_datasets_done = check_filtered_and_common_dataset(
        i_datasets_folder, datasets, datasets_filt, unique_datasets,
        mmvec_pairs, mmvec_filtering, unique_filterings, 'mmvec',
        input_to_filtered, mmvec_subsets)

    already_computed = {}
    job_folder = get_job_folder(i_datasets_folder, 'mmvec')
    filt_datasets, common_datasets = make_filtered_and_common_dataset(
        i_datasets_folder, datasets, datasets_filt, datasets_read,
        unique_datasets, train_test_dict, mmvec_pairs, mmvec_filtering,
        unique_filterings, job_folder, force, prjct_nm, qiime_env, chmod,
        noloc, 'mmvec', filt_raref, filt_datasets_done, common_datasets_done,
        input_to_filtered, already_computed, mmvec_subsets, jobs)

    all_sh_pbs = {}
    mmvec_outputs = []

    for pair, pair_data in common_datasets.items():

        job_folder2 = get_job_folder(i_datasets_folder,
                                     'mmvec/chunks/%s' % pair)
        if not split:
            out_sh = '%s/chunks/run_mmvec_%s_%s%s.sh' % (job_folder, prjct_nm,
                                                         pair, filt_raref)

        for (meta_fp, omic1, omic2, filt1, filt2, tsv1, tsv2, qza1, qza2,
             ncommon, case) in pair_data:
            train_columns = mmvec_params['train_column']
            n_examples = mmvec_params['n_examples']
            batches = mmvec_params['batches']
            learns = mmvec_params['learns']
            epochs = mmvec_params['epochs']
            priors = mmvec_params['priors']
            thresh_feats = mmvec_params['thresh_feats']
            latent_dims = mmvec_params['latent_dims']
            if split:
                out_sh = '%s/chunks/run_mmvec_%s_%s_%s_%s_%s_%s_%s%s.sh' % (
                    job_folder, prjct_nm, pair, case, omic1, filt1, omic2,
                    filt2, filt_raref)
            if train_columns != ['None']:
                n_examples = ['']
            for idx, it in enumerate(
                    itertools.product(train_columns, n_examples, batches,
                                      learns, epochs, priors, thresh_feats,
                                      latent_dims)):
                train_column, n_example, batch, learn, epoch, prior, thresh_feat, latent_dim = [
                    str(x) for x in it
                ]
                res_dir = 'b-%s_l-%s_e-%s_p-%s_f-%s_d-%s_t-%s_n-%s_gpu-%s' % (
                    batch, learn, epoch, prior.replace('.', ''),
                    thresh_feat, latent_dim, train_column,
                    n_example, str(gpu)[0])
                # skip if the pair has too few common samples (relative to the training samples, when a train column is specified)
                if train_columns != ['None']:
                    meta_pd = pd.read_table(
                        meta_fp, usecols=['sample_name',
                                          train_column.lower()])
                    ntrain = meta_pd[
                        train_column.lower()].value_counts()['Train']
                    if int(ncommon) < (1.2 * ntrain):
                        print('\t\t--> skipped pair "%s" (too few samples '
                              '[%s samples for %s training samples]):' %
                              (pair, ncommon, ntrain))
                        print('\t\t  - %s %s' % (omic1, filt1))
                        print('\t\t  - %s %s' % (omic2, filt2))
                        continue
                else:
                    if int(ncommon) < (1.2 * int(n_example)):
                        print('\t\t--> skipped pair "%s" (too few samples '
                              '[%s samples for %s examples]):' %
                              (pair, ncommon, n_example))
                        print('\t\t  - %s %s' % (omic1, filt1))
                        print('\t\t  - %s %s' % (omic2, filt2))
                        continue

                odir = get_analysis_folder(
                    i_datasets_folder, 'mmvec/paired/%s/%s/%s_%s__%s_%s/%s' %
                    (pair, case, omic1, filt1, omic2, filt2, res_dir))
                mmvec_outputs.append([
                    pair, case, omic1, omic2, filt1, filt2, ncommon, meta_fp,
                    tsv1, tsv2, qza1, qza2,
                    'mmvec_out__%s' % res_dir, odir
                ])
                cur_sh = '%s/run_mmvec_%s_%s_%s_%s_%s%s.sh' % (
                    job_folder2, pair, case, filt1, filt2, res_dir, filt_raref)
                all_sh_pbs.setdefault((pair, out_sh), []).append(cur_sh)
                run_single_mmvec(odir, meta_fp, qza1, qza2, res_dir, cur_sh,
                                 batch, learn, epoch, prior, thresh_feat,
                                 latent_dim, train_column, n_example, gpu,
                                 force, standalone, qiime_env)

    main_sh = write_main_sh(
        job_folder, '3_mmvec_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.mmvc%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, jobs, chunkit)
    if main_sh:
        if p_mmvec_pairs.startswith('/panfs'):
            p_mmvec_pairs = p_mmvec_pairs.replace(os.getcwd(), '')
        print_message("# MMVEC (datasets pairs in %s)" % p_mmvec_pairs, 'sh',
                      main_sh, jobs)

    return mmvec_outputs
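
A self-contained illustration of the hyper-parameter grid expansion used above: each itertools.product() combination becomes one res_dir (and one run_single_mmvec call). The parameter values below are hypothetical stand-ins for mmvec_params:

import itertools

params = {  # hypothetical values standing in for mmvec_params
    'train_column': ['None'], 'n_examples': ['10'], 'batches': ['2'],
    'learns': ['1e-4'], 'epochs': ['5000'], 'priors': ['0.1', '1'],
    'thresh_feats': ['0'], 'latent_dims': ['3'],
}
for it in itertools.product(
        params['train_column'], params['n_examples'], params['batches'],
        params['learns'], params['epochs'], params['priors'],
        params['thresh_feats'], params['latent_dims']):
    (train_column, n_example, batch, learn, epoch,
     prior, thresh_feat, latent_dim) = it
    res_dir = 'b-%s_l-%s_e-%s_p-%s_f-%s_d-%s_t-%s_n-%s_gpu-%s' % (
        batch, learn, epoch, prior.replace('.', ''), thresh_feat,
        latent_dim, train_column, n_example, 'F')
    print(res_dir)  # e.g. b-2_l-1e-4_e-5000_p-01_f-0_d-3_t-None_n-10_gpu-F
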
Example n. 30
0
def import_datasets(i_datasets_folder: str, datasets: dict,
                    datasets_phylo: dict, force: bool, prjct_nm: str,
                    qiime_env: str, chmod: str, noloc: bool, run_params: dict,
                    filt_raref: str, jobs: bool, slurm: bool,
                    chunkit: int) -> None:
    """Initial imports of the .tsv datasets in to Qiime2 Artefacts

    Parameters
    ----------
    i_datasets_folder : str
        Names identifying the datasets in the input folder
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path]
    datasets_phylo : dict
        Mapping dataset name -> ('tree_to_use', 'corrected_or_not')
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Nick name for the project.
    qiime_env : str
        Name of a qiime2 conda environment where analysis
        tools to be run are installed
    chmod : str
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------

    """
    job_folder = get_job_folder(i_datasets_folder, 'import_tables')
    job_folder2 = get_job_folder(i_datasets_folder, 'import_tables/chunks')

    to_chunk = []
    main_written = 0
    run_pbs = '%s/0_run_import_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            out_sh = '%s/0_run_import_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                     dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:  # REMOVE IF FIXED NOT KEPT
                    tsv, meta = tsv_meta_pds
                    qza = '%s.qza' % splitext(tsv)[0]
                    if datasets_phylo[dat][1]:
                        cmd = run_import(tsv, qza, 'FeatureTable[Frequency]')
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n' % cmd)
                        written += 1
                    elif force or not isfile(qza):
                        cmd = run_import(tsv, qza, 'FeatureTable[Frequency]')
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n' % cmd)
                        written += 1
            if written:
                main_written += 1
                to_chunk.append(out_sh)
                if not chunkit:
                    job_name = '%s.mprt.%s%s' % (prjct_nm, dat, filt_raref)
                    run_xpbs(out_sh, out_pbs, job_name, qiime_env,
                             run_params["time"], run_params["n_nodes"],
                             run_params["n_procs"], run_params["mem_num"],
                             run_params["mem_dim"], chmod, written, 'single',
                             o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'imports', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Import tables to qiime2', 'sh', run_pbs, jobs)
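
A minimal sketch of what run_import() is assumed to wrap for a 'FeatureTable[Frequency]' target: converting the .tsv table to BIOM, then importing the BIOM file as a QIIME 2 artifact (the flag values are standard biom/qiime CLI options, listed here as an assumption about the helper):

from os.path import splitext

def sketch_import_cmds(tsv, qza):
    biom = '%s.biom' % splitext(tsv)[0]
    return [
        # tsv -> HDF5 BIOM
        'biom convert -i %s -o %s --table-type="OTU table" --to-hdf5'
        % (tsv, biom),
        # BIOM -> FeatureTable[Frequency] artifact
        'qiime tools import --input-path %s --output-path %s '
        '--type "FeatureTable[Frequency]" --input-format BIOMV210Format'
        % (biom, qza),
    ]
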