Example #1
def edit_taxonomies(i_datasets_folder: str, taxonomies: dict, force: bool,
                    prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                    slurm: bool, run_params: dict, filt_raref: str, jobs: bool,
                    chunkit: int):

    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')

    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_edit_%s%s.sh' % (job_folder, prjct_nm,
                                                  filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, (_, qza, tsv) in taxonomies.items():
            if not isfile(tsv):
                continue
            written = 0
            out_pd = pd.read_csv(tsv, dtype=str, sep='\t')
            taxo = out_pd['Taxon'].tolist()
            taxo_edit = get_taxa_edit(taxo)
            if taxo != taxo_edit:
                out_pd['Taxon'] = taxo_edit
                out_pd.to_csv(tsv, index=False, sep='\t')
                cmd = run_import(tsv, qza, 'FeatureData[Taxonomy]')

                out_sh = '%s/run_taxonomy_edit_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref)
                if slurm:
                    out_pbs = '%s.slm' % splitext(out_sh)[0]
                else:
                    out_pbs = '%s.pbs' % splitext(out_sh)[0]
                with open(out_sh, 'w') as cur_sh:
                    cur_sh.write('echo "%s"\n' % cmd)
                    cur_sh.write('%s\n\n' % cmd)
                    main_written += 1
                    written += 1
                if written:
                    to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(out_sh, out_pbs,
                             '%s.tx.dt.%s%s' % (prjct_nm, dat, filt_raref),
                             qiime_env, run_params["time"],
                             run_params["n_nodes"], run_params["n_procs"],
                             run_params["mem_num"], run_params["mem_dim"],
                             chmod, written, 'single', o, noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy_edit',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Edit features taxonomy to not contain "," characters',
                      'sh', run_pbs, jobs)
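
get_taxa_edit() is defined elsewhere in the package; judging by the closing
print_message, it rewrites Taxon strings so that they no longer contain ","
characters. A minimal, purely illustrative sketch of such a helper (the name
and behaviour here are assumptions, not the package's implementation):

def get_taxa_edit_sketch(taxa: list) -> list:
    # hypothetical stand-in for get_taxa_edit(): drop ',' from every Taxon string
    return [str(taxon).replace(',', '') for taxon in taxa]

print(get_taxa_edit_sketch(['k__Bacteria; g__Prevotella, copri']))
# ['k__Bacteria; g__Prevotella copri']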
Example #2
def run_qemistree(i_datasets_folder: str, datasets: dict, prjct_nm: str,
                  i_qemistree: str, taxonomies: dict, force: bool,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> None:
    """
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param prjct_nm: Short nick name for your project.
    :param i_qemistree: path to qemistree folder (feature-data and tree).
    :param taxonomies: dataset -> [method, assignment qza]
    :param force: Force the re-writing of scripts for all commands.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    """

    job_folder = get_job_folder(i_datasets_folder, 'qemistree')
    job_folder2 = get_job_folder(i_datasets_folder, 'qemistree/chunks')

    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_qemistree_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds in datasets.items():
            feature_data = '%s/feature-data_%s.qza' % (i_qemistree, dat)
            qemistree = '%s/qemistree_%s.qza' % (i_qemistree, dat)
            if not isfile(feature_data) or not isfile(qemistree):
                continue
            out_sh = '%s/run_qemistree_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            odir = get_analysis_folder(i_datasets_folder, 'qemistree/%s' % dat)
            classyfire_qza = '%s/%s-classyfire.qza' % (odir, dat)
            classyfire_tsv = '%s.tsv' % splitext(classyfire_qza)[0]
            with open(out_sh, 'w') as cur_sh:
                if force or not isfile(classyfire_tsv):
                    write_qemistree(feature_data, classyfire_qza,
                                    classyfire_tsv, qemistree,
                                    cur_sh)
                    written += 1

            if isfile(classyfire_tsv):
                odir = get_analysis_folder(i_datasets_folder, 'taxonomy/%s' % dat)
                out_rad = '%s/tax_%s' % (odir, dat)
                tax_qza = '%s.qza' % out_rad
                tax_tsv = '%s.tsv' % out_rad
                classyfire_pd = pd.read_csv(classyfire_tsv, header=0, sep='\t')
                # use a distinct handle name to avoid shadowing the run_pbs handle 'o'
                with open(tax_tsv, 'w') as o_tax:
                    cols = ['id', 'kingdom', 'superclass', 'class', 'subclass', 'direct_parent']
                    o_tax.write('Feature ID\tTaxon\n')
                    for row in classyfire_pd[cols].values:
                        o_tax.write('%s\t%s\n' % (row[0], '; '.join(row[1:])))
                run_export(tax_tsv, tax_qza, 'FeatureData[Taxonomy]')
                taxonomies[dat] = ['direct_parent', tax_qza]
                written += 1
            else:
                print('[Warning] Maybe run qemistree first and then re-run '
                      'the pipeline to have the classyfire taxonomy included '
                      'in the barplots!')

            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.qmstr.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'qemistree',
                      prjct_nm, run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                      run_params["mem_num"], run_params["mem_dim"],
                      qiime_env, chmod, noloc, slurm, jobs, chunkit, None)

    if written:
        print_message('# Make qemistree classyfire classifications', 'sh', run_pbs, jobs)
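
For reference, a tiny worked example of the taxonomy table written above:
each ClassyFire row is flattened into one semicolon-separated Taxon string
(the values below are made up for illustration).

import pandas as pd

classyfire_pd = pd.DataFrame([{
    'id': 'feat1', 'kingdom': 'Organic compounds', 'superclass': 'Lipids',
    'class': 'Fatty Acyls', 'subclass': 'Fatty acids', 'direct_parent': 'Oleic acid'}])
cols = ['id', 'kingdom', 'superclass', 'class', 'subclass', 'direct_parent']
print('Feature ID\tTaxon')
for row in classyfire_pd[cols].values:
    print('%s\t%s' % (row[0], '; '.join(row[1:])))
# feat1	Organic compounds; Lipids; Fatty Acyls; Fatty acids; Oleic acid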
Example #3
def run_barplot(i_datasets_folder: str, datasets: dict, taxonomies: dict,
                force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                noloc: bool, slurm: bool, run_params: dict, filt_raref: str,
                jobs: bool, chunkit: int) -> None:
    """Visualize taxonomy with an interactive bar plot.

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [tsv file path, metadata file path]
    taxonomies : dict
        Mapping dataset name -> [classification_method, tax_qza, tax_tsv]
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Short nick name for your project
    qiime_env : str
        Name of a qiime2 conda environment
    chmod : str
        Whether to change permission of output files (default: 744)
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------

    """
    job_folder = get_job_folder(i_datasets_folder, 'barplot')
    job_folder2 = get_job_folder(i_datasets_folder, 'barplot/chunks')

    written = 0
    to_chunk = []
    run_pbs = '%s/1_run_barplot_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            out_sh = '%s/run_barplot_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                    filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    if dat not in taxonomies:
                        continue
                    method, tax_qza, tax_tsv = taxonomies[dat]
                    if not method:
                        method = 'taxofromfile'
                    qza = '%s.qza' % splitext(tsv)[0]
                    odir = get_analysis_folder(i_datasets_folder,
                                               'barplot/%s' % dat)
                    out_qzv = '%s/bar_%s_%s.qzv' % (odir, dat, method)
                    if force or not isfile(out_qzv):
                        write_barplots(out_qzv, qza, meta, tax_qza, cur_sh)
                        written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.brplt.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'barplot', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if written:
        print_message('# Make sample compositions barplots', 'sh', run_pbs,
                      jobs)
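
write_barplots() is not shown in these examples; for a QIIME 2 wrapper it
presumably emits a standard "qiime taxa barplot" call along these lines (the
paths below are hypothetical placeholders).

qza = 'data/tab_dataset1.qza'                    # feature table artifact
tax_qza = 'taxonomy/dataset1/tax_dataset1.qza'   # taxonomy assignment artifact
meta = 'metadata/meta_dataset1.tsv'              # sample metadata
out_qzv = 'barplot/dataset1/bar_dataset1_sklearn.qzv'
cmd = ('qiime taxa barplot'
       ' --i-table %s'
       ' --i-taxonomy %s'
       ' --m-metadata-file %s'
       ' --o-visualization %s' % (qza, tax_qza, meta, out_qzv))
print(cmd)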
Example #4
def run_taxonomy(method: str, i_datasets_folder: str, datasets: dict,
                 datasets_read: dict, datasets_phylo: dict,
                 datasets_features: dict, datasets_filt_map: dict,
                 i_classifier: str, taxonomies: dict, force: bool,
                 prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                 slurm: bool, run_params: dict, filt_raref: str, jobs: bool,
                 chunkit: int) -> None:
    """

    Parameters
    ----------
    method : str
        Taxonomic classification method (appended to the output file names).
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders.
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path].
    datasets_read : dict
        Mapping dataset name -> [data table, metadata table]
    datasets_phylo : dict
        To be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    datasets_features : dict
        Mapping dataset name -> list of features names in
                                the dataset tsv / biom file.
    datasets_filt_map : dict
    i_classifier : str
        Path to the taxonomic classifier.
    taxonomies : dict
        Mapping dataset name -> [method, assignment qza, assignment tsv]
    force : bool
        Force the re-writing of scripts for all commands.
    prjct_nm : str
        Short nick name for your project.
    qiime_env : str
        Name of your qiime2 conda environment (e.g. qiime2-2019.10).
    chmod : str
        Whether to change permission of output files (default: 744).
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------

    """
    job_folder = get_job_folder(i_datasets_folder, 'taxonomy')
    job_folder2 = get_job_folder(i_datasets_folder, 'taxonomy/chunks')
    amplicon_datasets = [
        dat for dat, (tree, correction) in datasets_phylo.items()
        if tree == 'amplicon'
    ]
    wol_datasets = [
        dat for dat, (tree, correction) in datasets_phylo.items()
        if tree == 'wol'
    ]

    main_written = 0
    to_chunk = []
    run_pbs = '%s/1_run_taxonomy_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets_read.items():
            out_sh = '%s/run_taxonomy_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                     dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            if dat in datasets_filt_map:
                taxonomies[dat] = taxonomies[datasets_filt_map[dat]]
                continue
            written = 0
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    if idx:
                        continue
                    tsv, meta = datasets[dat][idx]
                    if not isinstance(tsv_meta_pds[0], pd.DataFrame) and \
                            tsv_meta_pds[0] == 'raref':
                        if not isfile(tsv):
                            print('Must have run rarefaction to use it '
                                  'further...\nExiting')
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = tsv_meta_pds

                    odir = get_analysis_folder(i_datasets_folder,
                                               'taxonomy/%s' % dat)
                    out_rad = '%s/tax_%s' % (odir, dat)

                    if dat in amplicon_datasets:
                        out_qza = '%s_%s.qza' % (out_rad, method)
                        out_tsv = '%s.tsv' % splitext(out_qza)[0]
                        taxonomies[dat] = [method, out_qza, out_tsv]
                        if not i_classifier:
                            print('No classifier passed for 16S '
                                  'data\nSkipping this dataset...')
                            continue
                        cmd = run_taxonomy_amplicon(dat, i_datasets_folder,
                                                    force, tsv_pd, out_qza,
                                                    out_tsv, i_classifier)
                    else:
                        out_qza = '%s.qza' % out_rad
                        out_tsv = '%s.tsv' % out_rad
                        if dat in wol_datasets:
                            cur_datasets_features = datasets_features[dat]
                            taxonomies[dat] = ['wol', out_qza, out_tsv]
                            cmd = run_taxonomy_wol(force, tsv_pd, out_qza,
                                                   out_tsv,
                                                   cur_datasets_features)
                        else:
                            if len(
                                [x for x in tsv_pd.index
                                 if str(x).isdigit()]) == tsv_pd.shape[0]:
                                continue
                            taxonomies[dat] = ['feat', out_qza, out_tsv]
                            cmd = run_taxonomy_others(force, tsv_pd, out_qza,
                                                      out_tsv)
                    if cmd:
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n\n' % cmd)
                        main_written += 1
                        written += 1
            if written:
                to_chunk.append(out_sh)
                if not chunkit:
                    run_xpbs(out_sh, out_pbs,
                             '%s.tx.sklrn.%s%s' % (prjct_nm, dat, filt_raref),
                             qiime_env, run_params["time"],
                             run_params["n_nodes"], run_params["n_procs"],
                             run_params["mem_num"], run_params["mem_dim"],
                             chmod, written, 'single', o, noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'taxonomy', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Classify features using classify-sklearn', 'sh',
                      run_pbs, jobs)
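
run_taxonomy_amplicon() is not shown here; the closing print_message indicates
that amplicon datasets go through QIIME 2's classify-sklearn. A hedged sketch
of that kind of command (all paths are hypothetical placeholders):

i_classifier = 'ref/classifier.qza'                 # pre-trained classifier
seqs_qza = 'taxonomy/dataset1/seqs_dataset1.qza'    # representative sequences
out_qza = 'taxonomy/dataset1/tax_dataset1_sklearn.qza'
cmd = ('qiime feature-classifier classify-sklearn'
       ' --i-classifier %s'
       ' --i-reads %s'
       ' --o-classification %s' % (i_classifier, seqs_qza, out_qza))
print(cmd)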
Example #5
def import_datasets(i_datasets_folder: str, datasets: dict,
                    datasets_phylo: dict, force: bool, prjct_nm: str,
                    qiime_env: str, chmod: str, noloc: bool, run_params: dict,
                    filt_raref: str, jobs: bool, slurm: bool,
                    chunkit: int) -> None:
    """Initial imports of the .tsv datasets in to Qiime2 Artefacts

    Parameters
    ----------
    i_datasets_folder : str
        Path to the folder containing the data/metadata subfolders
    datasets : dict
        Mapping dataset name -> [data file path, metadata file path]
    datasets_phylo : dict
        Mapping dataset name -> ('tree_to_use', 'corrected_or_not')
    force : bool
        Force the re-writing of scripts for all commands
    prjct_nm : str
        Nick name for the project.
    qiime_env : str
        Name of a qiime2 conda environment where analysis
        tools to be run are installed
    chmod : str
    noloc : bool
    run_params : dict
    filt_raref : str
    jobs : bool
    chunkit : int

    Returns
    -------

    """
    job_folder = get_job_folder(i_datasets_folder, 'import_tables')
    job_folder2 = get_job_folder(i_datasets_folder, 'import_tables/chunks')

    to_chunk = []
    main_written = 0
    run_pbs = '%s/0_run_import_%s%s.sh' % (job_folder, prjct_nm, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            out_sh = '%s/0_run_import_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                     dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for tsv_meta_pds in tsv_meta_pds_:  # REMOVE IF FIXED NOT KEPT
                    tsv, meta = tsv_meta_pds
                    qza = '%s.qza' % splitext(tsv)[0]
                    if datasets_phylo[dat][1]:
                        cmd = run_import(tsv, qza, 'FeatureTable[Frequency]')
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n' % cmd)
                        written += 1
                    elif force or not isfile(qza):
                        cmd = run_import(tsv, qza, 'FeatureTable[Frequency]')
                        cur_sh.write('echo "%s"\n' % cmd)
                        cur_sh.write('%s\n' % cmd)
                        written += 1
            if written:
                main_written += 1
                to_chunk.append(out_sh)
                if not chunkit:
                    job_name = '%s.mprt.%s%s' % (prjct_nm, dat, filt_raref)
                    run_xpbs(out_sh, out_pbs, job_name, qiime_env,
                             run_params["time"], run_params["n_nodes"],
                             run_params["n_procs"], run_params["mem_num"],
                             run_params["mem_dim"], chmod, written, 'single',
                             o, noloc, slurm, jobs)
    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'imports', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Import tables to qiime2', 'sh', run_pbs, jobs)
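
run_import() is defined elsewhere in the package; to import a .tsv feature
table as FeatureTable[Frequency] it plausibly goes through a BIOM conversion
first. A sketch of the equivalent commands, built as strings the way the
pipeline does (paths are hypothetical):

tsv = 'data/tab_dataset1.tsv'
biom = 'data/tab_dataset1.biom'
qza = 'data/tab_dataset1.qza'
cmds = [
    'biom convert -i %s -o %s --table-type="OTU table" --to-hdf5' % (tsv, biom),
    ('qiime tools import --input-path %s'
     ' --input-format BIOMV210Format'
     ' --type "FeatureTable[Frequency]"'
     ' --output-path %s' % (biom, qza)),
]
print('\n'.join(cmds))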
Example #6
def run_rarefy(i_datasets_folder: str, datasets: dict, datasets_read: dict,
               datasets_phylo: dict, datasets_filt_map: dict,
               datasets_rarefs: dict, p_raref_depths: str, eval_rarefs: bool,
               force: bool, prjct_nm: str, qiime_env: str, chmod: str,
               noloc: bool, run_params: dict, filt_raref: str, filt_only: bool,
               jobs: bool, slurm: bool, chunkit: int) -> dict:
    """
    Run rarefy: Rarefy table.
    https://docs.qiime2.org/2019.10/plugins/available/feature-table/rarefy/

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_filt_map: mapping of filtered dataset -> source dataset.
    :param datasets_rarefs: to be updated with the rarefaction labels per dataset.
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    :return: rarefaction depths per dataset (for the evaluation mode).
    """

    evaluation = ''
    eval_depths = {}
    datasets_raref_depths, datasets_raref_evals = check_rarefy_need(
        i_datasets_folder, datasets_read, p_raref_depths)
    if eval_rarefs:
        evaluation = '_eval'

    set_filt_rarefy(datasets_raref_depths, datasets_filt_map)

    datasets_update = {}
    datasets_read_update = {}
    datasets_phylo_update = {}
    datasets_append = {}

    main_written = 0
    job_folder = get_job_folder(i_datasets_folder, 'rarefy%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'rarefy%s/chunks' % evaluation)
    to_chunk = []
    run_pbs = '%s/1_run_rarefy_%s%s%s.sh' % (job_folder, prjct_nm, evaluation,
                                             filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():

            written = 0
            if dat not in datasets_raref_depths:
                continue
            if filt_only and dat not in datasets_filt_map:
                continue

            odir = get_analysis_folder(i_datasets_folder,
                                       'rarefy%s/%s' % (evaluation, dat))
            out_sh = '%s/run_rarefy_%s%s_%s.sh' % (job_folder2, prjct_nm,
                                                   evaluation, dat)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:

                depths = datasets_raref_depths[dat][1]
                if eval_rarefs:
                    depths = datasets_raref_evals[dat]

                tsv_pd, meta_pd = datasets_read[dat][0]
                tsv_sums = tsv_pd.sum()
                for tsv_meta_pds in tsv_meta_pds_:
                    tsv, meta = tsv_meta_pds
                    for depth_ in depths:
                        depth = get_digit_depth(depth_, tsv_sums)
                        dat_raref = '%s_raref%s%s' % (dat, evaluation,
                                                      str(depth))
                        meta_out = '%s/meta_%s.tsv' % (odir, dat_raref)
                        remaining_samples = tsv_sums[
                            tsv_sums >= depth].index.tolist()
                        meta_raref_pd = meta_pd.loc[
                            meta_pd.sample_name.isin(remaining_samples), :]
                        meta_raref_pd.to_csv(meta_out, index=False, sep='\t')

                        qza = tsv.replace('.tsv', '.qza')
                        qza_out = '%s/tab_%s.qza' % (odir, dat_raref)
                        tsv_out = '%s.tsv' % splitext(qza_out)[0]
                        if force or not os.path.isfile(tsv_out):
                            cmd = write_rarefy(qza, qza_out, depth)
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            cmd = run_export(qza_out, tsv_out,
                                             'FeatureTable[Frequency]')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            main_written += 1
                            written += 1

                        if eval_rarefs:
                            eval_depths.setdefault(dat, []).append(
                                '%s_%s' % (dat, str(depth)))
                            datasets_update['%s_%s' % (dat, str(depth))] = [[
                                tsv_out, meta_out
                            ]]
                            datasets_read_update['%s_%s' %
                                                 (dat, str(depth))] = (
                                                     'raref', str(depth))
                            datasets_phylo_update[
                                '%s_%s' %
                                (dat, str(depth))] = datasets_phylo[dat]
                        else:
                            datasets_append.setdefault(dat, []).append(
                                [tsv_out, meta_out])
                            if isfile(tsv_out) and isfile(meta_out):
                                tab_filt_pd = pd.read_csv(tsv_out,
                                                          index_col=0,
                                                          header=0,
                                                          sep='\t')
                                with open(meta_out) as f:
                                    for line in f:
                                        break
                                meta_filt_pd = pd.read_csv(
                                    meta_out,
                                    header=0,
                                    sep='\t',
                                    dtype={line.split('\t')[0]: str},
                                    low_memory=False)
                                datasets_read[dat].append(
                                    [tab_filt_pd, meta_filt_pd])
                            else:
                                datasets_read[dat].append(
                                    ('raref', str(depth)))
                            datasets_rarefs.setdefault(dat, []).append(
                                '_raref%s%s' % (evaluation, str(depth)))

            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(
                    out_sh, out_pbs,
                    '%s.bt%s.%s%s' % (prjct_nm, evaluation, dat, filt_raref),
                    qiime_env, run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o, noloc,
                    slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'rarefy%s' % evaluation,
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Get rarefied datasets', 'sh', run_pbs, jobs)

    if eval_rarefs:
        datasets.update(datasets_update)
        datasets_read.update(datasets_read_update)
        datasets_phylo.update(datasets_phylo_update)
    else:
        for dat, fps in datasets_append.items():
            datasets[dat].extend(fps)

    return eval_depths
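
# --- Illustrative aside (not part of the pipeline code) ---------------------
# write_rarefy() is assumed to wrap QIIME 2's feature-table rarefy; a sketch
# of the command it likely produces (hypothetical paths and depth):
def _sketch_rarefy_cmd(qza: str, qza_out: str, depth: int) -> str:
    return ('qiime feature-table rarefy'
            ' --i-table %s'
            ' --p-sampling-depth %s'
            ' --o-rarefied-table %s' % (qza, depth, qza_out))
# e.g. _sketch_rarefy_cmd('data/tab_dataset1.qza',
#                         'rarefy/dataset1/tab_dataset1_raref1000.qza', 1000)
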
def run_volatility(i_datasets_folder: str, datasets: dict, p_longi_column: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   run_params: dict, filt_raref: str, jobs: bool,
                   chunkit: int) -> None:
    """
    Run volatility: Generate interactive volatility plot.
    https://docs.qiime2.org/2019.10/plugins/available/longitudinal/volatility/

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param p_longi_column: metadata column that is the time stratification.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'longitudinal')
    job_folder2 = get_job_folder(i_datasets_folder, 'longitudinal/chunks')
    main_written = 0
    first_print = 0
    first_print2 = 0
    to_chunk = []
    run_pbs = '%s/5_run_volatility_%s%s.sh' % (job_folder, prjct_nm,
                                               filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            out_sh = '%s/run_volatility_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                       dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    cur_raref = datasets_rarefs[dat][idx]
                    meta_alphas = '%s_alphas.tsv' % splitext(meta)[0]
                    if not isfile(meta_alphas):
                        if not first_print:
                            print(
                                '\nWarning: First make sure you run alpha and '
                                'then the alpha merge/export step '
                                '(2_run_merge_alphas.sh) before running '
                                'volatility\n\t(if you need the alpha '
                                'diversity as a response variable)!'
                            )
                            first_print += 1
                        continue
                    with open(meta) as f:
                        for line in f:
                            break
                    time_points = [
                        x for x in line.strip().split('\t')
                        if p_longi_column in x
                    ]
                    if not time_points:
                        # indexing an empty list here would raise IndexError
                        if not first_print2:
                            print('Variable %s not in metadata %s\n' %
                                  (p_longi_column, meta_alphas))
                            first_print2 += 1
                        continue
                    time_point = time_points[0]
                    odir = get_analysis_folder(
                        i_datasets_folder,
                        'longitudinal/%s%s' % (dat, cur_raref))
                    out_fp = '%s/%s_volatility.qzv' % (odir, dat)
                    if force or not isfile(out_fp):
                        write_longitudinal_volatility(out_fp, meta_alphas,
                                                      time_point, cur_sh)
                        written += 1
                        main_written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.vltlt.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'volatility', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Longitudinal change in alpha diversity indices', 'sh',
                      run_pbs, jobs)
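
# --- Illustrative aside (not part of the pipeline code) ---------------------
# write_longitudinal_volatility() is assumed to wrap QIIME 2's longitudinal
# volatility visualizer; a sketch with hypothetical arguments:
def _sketch_volatility_cmd(out_fp: str, meta_alphas: str, time_point: str) -> str:
    return ('qiime longitudinal volatility'
            ' --m-metadata-file %s'
            ' --p-state-column %s'
            ' --o-visualization %s' % (meta_alphas, time_point, out_fp))
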
def run_alpha_rarefaction(i_datasets_folder: str, datasets: dict,
                          datasets_rarefs: dict, datasets_phylo: dict,
                          trees: dict, force: bool, prjct_nm: str,
                          qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                          As: tuple, run_params: dict, filt_raref: str,
                          jobs: bool, chunkit: int) -> None:
    """
    Run alpha-rarefaction: Alpha rarefaction
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-rarefaction/
    """
    alpha_metrics = get_metrics('alpha_metrics', As)
    job_folder = get_job_folder(i_datasets_folder, 'alpha_rarefaction')
    job_folder2 = get_job_folder(i_datasets_folder, 'alpha_rarefaction/chunks')
    main_written = 0
    run_pbs = '%s/4_run_alpha_rarefaction_%s%s.sh' % (job_folder, prjct_nm,
                                                      filt_raref)
    to_chunk = []
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            out_sh = '%s/run_alpha_rarefaction_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    qza = '%s.qza' % splitext(tsv)[0]
                    cur_raref = datasets_rarefs[dat][idx]
                    odir = get_analysis_folder(
                        i_datasets_folder,
                        'alpha_rarefaction/%s%s' % (dat, cur_raref))
                    for metric in alpha_metrics:
                        out_fp = '%s/rarefcurve_%s%s_%s.qzv' % (
                            odir, dat, cur_raref, metric)
                        if force or not isfile(out_fp):
                            if write_diversity_alpha_rarefaction(
                                    out_fp, qza, metric, datasets_phylo, trees,
                                    dat, meta, cur_sh):
                                continue
                            written += 1
                            main_written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.lphrrf.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'alpha_rarefaction',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Compute rarefaction curve on alpha diversity indices',
                      'sh', run_pbs, jobs)
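
# --- Illustrative aside (not part of the pipeline code) ---------------------
# write_diversity_alpha_rarefaction() is assumed to wrap QIIME 2's diversity
# alpha-rarefaction; a sketch with hypothetical arguments:
def _sketch_alpha_rarefaction_cmd(out_fp: str, qza: str, metric: str,
                                  meta: str, max_depth: int = 10000) -> str:
    return ('qiime diversity alpha-rarefaction'
            ' --i-table %s'
            ' --p-metrics %s'
            ' --p-max-depth %s'
            ' --m-metadata-file %s'
            ' --o-visualization %s' % (qza, metric, max_depth, meta, out_fp))
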
def run_correlations(i_datasets_folder: str, datasets: dict, diversities: dict,
                     datasets_rarefs: dict, force: bool, prjct_nm: str,
                     qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                     run_params: dict, filt_raref: str, jobs: bool,
                     chunkit: int) -> None:
    """
    Run alpha-correlation: Alpha diversity correlation
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-correlation/

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param diversities: alpha diversity qiime2 Artifacts per dataset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder = get_job_folder(i_datasets_folder, 'alpha_correlations')
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha_correlations/chunks')
    main_written = 0
    run_pbs = '%s/4_run_alpha_correlation_%s%s.sh' % (job_folder, prjct_nm,
                                                      filt_raref)
    to_chunk = []
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            if dat not in diversities:
                continue
            written = 0
            out_sh = '%s/run_alpha_correlation_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    cur_raref = datasets_rarefs[dat][idx]
                    for method in ['spearman', 'pearson']:
                        for group, divs in diversities[dat][idx].items():
                            if group:
                                odir = get_analysis_folder(
                                    i_datasets_folder,
                                    'alpha_correlations/%s%s/%s' %
                                    (dat, cur_raref, group))
                            else:
                                odir = get_analysis_folder(
                                    i_datasets_folder,
                                    'alpha_correlations/%s%s' %
                                    (dat, cur_raref))
                            for qza in [x[0] for x in divs]:
                                out_fp = '%s/alpha_corr_%s' % (
                                    odir, basename(qza).replace(
                                        '.qza', '_%s.qzv' % method))
                                if force or not isfile(out_fp):
                                    write_diversity_alpha_correlation(
                                        out_fp, qza, method, meta, cur_sh)
                                    written += 1
                                    main_written += 1
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(out_sh, out_pbs,
                         '%s.lphcrr.%s%s' % (prjct_nm, dat, filt_raref),
                         qiime_env, run_params["time"], run_params["n_nodes"],
                         run_params["n_procs"], run_params["mem_num"],
                         run_params["mem_dim"], chmod, written, 'single', o,
                         noloc, slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'alpha_correlations',
                      prjct_nm, run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message(
            '# Correlate numeric metadata variables with alpha diversity indices',
            'sh', run_pbs, jobs)
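
# --- Illustrative aside (not part of the pipeline code) ---------------------
# write_diversity_alpha_correlation() is assumed to wrap QIIME 2's diversity
# alpha-correlation; a sketch with hypothetical arguments:
def _sketch_alpha_correlation_cmd(out_fp: str, qza: str, method: str,
                                  meta: str) -> str:
    return ('qiime diversity alpha-correlation'
            ' --i-alpha-diversity %s'
            ' --m-metadata-file %s'
            ' --p-method %s'
            ' --o-visualization %s' % (qza, meta, method, out_fp))
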
def run_alpha(i_datasets_folder: str, datasets: dict, datasets_read: dict,
              datasets_phylo: dict, datasets_rarefs: dict,
              p_alpha_subsets: str, trees: dict, force: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, slurm: bool, As: tuple,
              dropout: bool, run_params: dict, filt_raref: str,
              eval_depths: dict, jobs: bool, chunkit: int) -> dict:
    """
    Computes the alpha diversity vectors for each dataset.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv, meta]
    :param datasets_read: dataset -> [tsv table, meta table]
    :param datasets_phylo: to be updated with ('tree_to_use', 'corrected_or_not') per dataset.
    :param p_alpha_subsets: Subsets for alpha diversity.
    :param trees: to be updated with the tree to use for a dataset's phylogenetic analyses.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Short nick name for your project.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    :return: dataset -> list (one per rarefaction) of
             {subset: [(alpha diversity qza, metric), ...]}
    """
    evaluation = ''
    if len(eval_depths):
        evaluation = '_eval'
    alpha_metrics = get_metrics('alpha_metrics', As)
    alpha_subsets = read_yaml_file(p_alpha_subsets)
    job_folder = get_job_folder(i_datasets_folder, 'alpha%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha%s/chunks' % evaluation)
    diversities = {}
    run_pbs = '%s/1_run_alpha_%s%s%s.sh' % (job_folder, prjct_nm, evaluation,
                                            filt_raref)
    main_written = 0
    to_chunk = []
    with open(run_pbs, 'w') as o:
        for dat, tsv_meta_pds_ in datasets.items():
            written = 0
            diversities[dat] = []
            out_sh = '%s/run_alpha_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
                    tsv, meta = tsv_meta_pds
                    if not isinstance(
                            datasets_read[dat][idx][0], pd.DataFrame
                    ) and datasets_read[dat][idx][0] == 'raref':
                        if not isfile(tsv):
                            print(
                                'Must have run rarefaction to use it further...\nExiting'
                            )
                            sys.exit(0)
                        tsv_pd, meta_pd = get_raref_tab_meta_pds(meta, tsv)
                        datasets_read[dat][idx] = [tsv_pd, meta_pd]
                    else:
                        tsv_pd, meta_pd = datasets_read[dat][idx]
                    cur_raref = datasets_rarefs[dat][idx]
                    qza = '%s.qza' % splitext(tsv)[0]
                    divs = {}
                    for metric in alpha_metrics:
                        odir = get_analysis_folder(
                            i_datasets_folder, 'alpha/%s%s' % (dat, cur_raref))
                        out_fp = '%s/%s_%s.qza' % (
                            odir, basename(splitext(qza)[0]), metric)
                        out_tsv = '%s.tsv' % splitext(out_fp)[0]
                        if force or not isfile(out_fp):
                            ret_continue = write_diversity_alpha(
                                out_fp, datasets_phylo, trees, dat, qza,
                                metric, cur_sh, qiime_env)
                            if ret_continue:
                                continue
                            cmd = run_export(out_fp, out_tsv, '')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            written += 1
                            main_written += 1
                        divs.setdefault('', []).append((out_fp, metric))

                    if alpha_subsets and dat in alpha_subsets:
                        for subset, subset_regex in alpha_subsets[dat].items():
                            odir = get_analysis_folder(
                                i_datasets_folder,
                                'alpha/%s%s/%s' % (dat, cur_raref, subset))
                            if dropout:
                                qza_subset_ = '%s/%s_%s.qza' % (
                                    odir, basename(splitext(qza)[0]), subset)
                            else:
                                qza_subset_ = '%s/%s_%s_noDropout.qza' % (
                                    odir, basename(splitext(qza)[0]), subset)
                            feats_subset = '%s.meta' % splitext(qza_subset_)[0]
                            feats = get_subset(tsv_pd, subset_regex)
                            if not len(feats):
                                continue
                            subset_pd = pd.DataFrame({
                                'Feature ID':
                                feats,
                                'Subset': [subset] * len(feats)
                            })
                            subset_pd.to_csv(feats_subset,
                                             index=False,
                                             sep='\t')
                            write_filter_features(tsv_pd, feats, qza,
                                                  qza_subset_, feats_subset,
                                                  cur_sh, dropout)
                            for metric in alpha_metrics:

                                if metric in [
                                        'faith_pd'
                                ] and datasets_phylo[dat][1] and dat in trees:
                                    tree_in_qza = trees[dat][0]
                                    tree_in_tsv = '%s.tsv' % splitext(
                                        tree_in_qza)[0]
                                    if dropout:
                                        qza_subset = '%s/%s_%s.qza' % (
                                            odir,
                                            basename(splitext(tree_in_qza)[0]),
                                            subset)
                                    else:
                                        qza_subset = '%s/%s_%s_noDropout.qza' % (
                                            odir,
                                            basename(splitext(tree_in_qza)[0]),
                                            subset)
                                    write_filter_features(
                                        pd.read_csv(tree_in_tsv,
                                                    header=0,
                                                    index_col=0,
                                                    sep='\t'), feats,
                                        tree_in_qza, qza_subset, feats_subset,
                                        cur_sh, dropout)
                                else:
                                    qza_subset = qza_subset_

                                out_fp = '%s/%s__%s.qza' % (
                                    odir, basename(
                                        splitext(qza_subset)[0]), metric)
                                out_tsv = '%s.tsv' % splitext(out_fp)[0]

                                if force or not isfile(out_fp):
                                    ret_continue = write_diversity_alpha(
                                        out_fp, {dat: [1, 0]}, trees, dat,
                                        qza_subset, metric, cur_sh, qiime_env)
                                    if ret_continue:
                                        continue
                                    cmd = run_export(out_fp, out_tsv, '')
                                    cur_sh.write('echo "%s"\n' % cmd)
                                    cur_sh.write('%s\n\n' % cmd)
                                    written += 1
                                    main_written += 1
                                divs.setdefault(subset, []).append(
                                    (out_fp, metric))
                    diversities[dat].append(divs)
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(
                    out_sh, out_pbs, '%s.mg.lph%s.%s%s' %
                    (prjct_nm, evaluation, dat, filt_raref), qiime_env,
                    run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o, noloc,
                    slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'alpha', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Calculate alpha diversity indices', 'sh', run_pbs,
                      jobs)
    return diversities
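
# --- Illustrative aside (not part of the pipeline code) ---------------------
# write_diversity_alpha() is assumed to wrap QIIME 2's diversity alpha (or
# alpha-phylogenetic for metrics such as faith_pd); a sketch:
def _sketch_alpha_cmd(out_fp: str, qza: str, metric: str,
                      tree_qza: str = None) -> str:
    if tree_qza:  # phylogenetic metric, e.g. faith_pd
        return ('qiime diversity alpha-phylogenetic'
                ' --i-table %s --i-phylogeny %s'
                ' --p-metric %s --o-alpha-diversity %s'
                % (qza, tree_qza, metric, out_fp))
    return ('qiime diversity alpha'
            ' --i-table %s --p-metric %s'
            ' --o-alpha-diversity %s' % (qza, metric, out_fp))
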
def merge_meta_alpha(i_datasets_folder: str, datasets: dict,
                     datasets_rarefs: dict, diversities: dict, force: bool,
                     prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
                     slurm: bool, dropout: bool, run_params: dict,
                     filt_raref: str, eval_depths: dict, jobs: bool,
                     chunkit: int) -> dict:
    """
    Merges the alpha diversity vectors with the metadata and exports the
    tabulated results for each dataset.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: dataset -> [tsv/biom path, meta path]
    :param datasets_rarefs: dataset -> list of rarefaction labels.
    :param diversities: dataset -> list of {subset: [(alpha qza, metric), ...]} (from run_alpha).
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Short nick name for your project.
    :param qiime_env: name of your qiime2 conda environment (e.g. qiime2-2019.10).
    :param chmod: whether to change permission of output files (default: 775).
    :return: dataset -> list of exported alpha diversity tsv paths (per group).
    """
    evaluation = ''
    if len(eval_depths):
        evaluation = '_eval'

    job_folder = get_job_folder(i_datasets_folder, 'tabulate%s' % evaluation)
    job_folder2 = get_job_folder(i_datasets_folder,
                                 'tabulate%s/chunks' % evaluation)

    to_export = {}
    to_chunk = []
    main_written = 0
    run_pbs = '%s/2_run_merge_alphas_%s%s%s.sh' % (job_folder, prjct_nm,
                                                   evaluation, filt_raref)
    with open(run_pbs, 'w') as o:
        for dat, group_divs_list in diversities.items():
            written = 0
            to_export[dat] = []
            out_sh = '%s/run_merge_alpha_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, dat, filt_raref)
            if slurm:
                out_pbs = '%s.slm' % splitext(out_sh)[0]
            else:
                out_pbs = '%s.pbs' % splitext(out_sh)[0]
            with open(out_sh, 'w') as cur_sh:
                for idx, group_divs in enumerate(group_divs_list):
                    tsv, meta = datasets[dat][idx]
                    cur_raref = datasets_rarefs[dat][idx]
                    base = basename(splitext(tsv)[0])
                    if base.startswith('tab_'):
                        # note: .lstrip('tab_') would strip characters, not the 'tab_' prefix
                        base = base[len('tab_'):]
                    to_export_groups = []
                    for group, divs in group_divs.items():
                        if group:
                            output_folder = get_analysis_folder(
                                i_datasets_folder, 'tabulate%s/%s%s/%s' %
                                (evaluation, dat, cur_raref, group))
                        else:
                            output_folder = get_analysis_folder(
                                i_datasets_folder, 'tabulate%s/%s%s' %
                                (evaluation, dat, cur_raref))
                        if dropout:
                            out_fp = '%s/%s_alphas__%s.qzv' % (output_folder,
                                                               base, group)
                        else:
                            out_fp = '%s/%s_alphas_noDropout__%s.qzv' % (
                                output_folder, base, group)
                        out_fp_tsv = '%s.tsv' % splitext(out_fp)[0]
                        if isfile(out_fp_tsv):
                            with open(out_fp_tsv) as f:
                                for line in f:
                                    indices = line.strip().split('\t')[1:]
                                    break
                            divs_alphas = [x[1] for x in divs]
                            if len(indices) < len(divs_alphas):
                                force = True
                        to_export_groups.append(out_fp_tsv)
                        if force or not isfile(out_fp):
                            write_metadata_tabulate(out_fp, divs, meta, cur_sh)
                            cmd = run_export(out_fp, out_fp_tsv, '')
                            cur_sh.write('echo "%s"\n' % cmd)
                            cur_sh.write('%s\n\n' % cmd)
                            main_written += 1
                            written += 1
                    to_export[dat].append(to_export_groups)
            to_chunk.append(out_sh)
            if not chunkit:
                run_xpbs(
                    out_sh, out_pbs, '%s.mrg.lph%s.%s%s' %
                    (prjct_nm, evaluation, dat, filt_raref), qiime_env,
                    run_params["time"], run_params["n_nodes"],
                    run_params["n_procs"], run_params["mem_num"],
                    run_params["mem_dim"], chmod, written, 'single', o, noloc,
                    slurm, jobs)

    if to_chunk and chunkit:
        simple_chunks(run_pbs, job_folder2, to_chunk, 'tabulate', prjct_nm,
                      run_params["time"], run_params["n_nodes"],
                      run_params["n_procs"], run_params["mem_num"],
                      run_params["mem_dim"], qiime_env, chmod, noloc, slurm,
                      jobs, chunkit, None)

    if main_written:
        print_message('# Merge and export alpha diversity indices', 'sh',
                      run_pbs, jobs)
    return to_export
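
write_metadata_tabulate() is not shown here; for a QIIME 2 wrapper it
presumably emits a qiime metadata tabulate call that merges the sample
metadata with each alpha diversity artifact, along these lines (paths are
hypothetical placeholders):

meta = 'metadata/meta_dataset1.tsv'
divs_qzas = ['alpha/dataset1/tab_dataset1_shannon.qza',
             'alpha/dataset1/tab_dataset1_pielou_e.qza']
out_fp = 'tabulate/dataset1/tab_dataset1_alphas__.qzv'
cmd = 'qiime metadata tabulate'
for fp in [meta] + divs_qzas:  # the sample metadata plus each alpha vector
    cmd += ' --m-input-file %s' % fp
cmd += ' --o-visualization %s' % out_fp
print(cmd)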