Code example #1
def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, split: bool, run_params: dict,
                       filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):

    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)

    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                  filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in metrics_groups_metas_qzas_dms_trees.items(
            ):
                for group, metas_qzas_mat_qzas_trees in groups_metas_qzas_dms_trees.items(
                ):
                    for (meta, qza, mat_qza,
                         tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(case_vals,
                                                case_var).replace(' ', '_')
                                cur_sh = '%s/run_decay_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_raref, metric, group,
                                    case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh, mat_qza,
                                    case, modes, force, run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
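
The looper above funnels every per-case script into a single accumulator, all_sh_pbs, keyed by (dataset, per-dataset chunk script), so that each chunk script can later run all of its per-case scripts. A minimal, self-contained sketch of that idiom, with hypothetical dataset and case names:

all_sh_pbs = {}
for dat in ['datA', 'datB']:                     # hypothetical dataset names
    out_sh = '/jobs/run_decay_%s.sh' % dat       # per-dataset chunk script
    for case in ['sex_male', 'sex_female']:      # hypothetical subset cases
        cur_sh = '/jobs/run_decay_%s_%s.sh' % (dat, case)
        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
print(all_sh_pbs)
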
Code example #2
def run_sourcetracking(i_datasets_folder: str, datasets: dict,
                       p_sourcetracking_config: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, run_params: dict,
                       filt_raref: str, split: bool, jobs: bool,
                       chunkit: int) -> None:

    job_folder2 = get_job_folder(i_datasets_folder, 'sourcetracking/chunks')
    sourcetracking_dicts = get_sourcetracking_config(p_sourcetracking_config)
    sourcetracking_sourcesink = sourcetracking_dicts[0]
    sourcetracking_filtering = sourcetracking_dicts[1]
    sourcetracking_params = sourcetracking_dicts[2]
    main_cases_dict = sourcetracking_dicts[3]

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        if dat in sourcetracking_filtering:
            filters = sourcetracking_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'sourcetracking')
            cur_raref = datasets_rarefs[dat][idx]
            out_import_sh = '%s/run_import_sourcetracking_%s_%s%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            imports = set()
            odir = get_analysis_folder(i_datasets_folder,
                                       'sourcetracking/%s' % dat)
            for method in sourcetracking_params['method']:
                out_sh = '%s/run_sourcetracking_%s_%s%s%s_%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref, method)
                for case_var, case_vals_list in cases_dict.items():
                    for filt, (fp, fa) in filters.items():
                        cur_sh = '%s/run_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_sh = cur_sh.replace(' ', '-')
                        cur_import_sh = '%s/run_import_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_import_sh = cur_import_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                        all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                     []).append(cur_import_sh)
                        run_single_sourcetracking(
                            odir, tsv, meta_pd, case_var,
                            sourcetracking_params, method, imports,
                            sourcetracking_sourcesink, case_vals_list, cur_sh,
                            cur_import_sh, force, filt, cur_raref, fp, fa,
                            run_params["n_nodes"], run_params["n_procs"])

    job_folder = get_job_folder(i_datasets_folder, 'sourcetracking')
    main_sh = write_main_sh(
        job_folder,
        '3_run_import_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mpt.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# import sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# import sourcetracking')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(
        job_folder, '3_run_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.srctrk%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit,
        '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# sourcetracking')
        print_message('', 'sh', main_sh, jobs)
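
run_sourcetracking above looks up per-dataset filtering settings and falls back to a single "no filtering" entry when a dataset has no config. A minimal sketch of that lookup, with hypothetical config values; fp and fa are assumed here to be prevalence and abundance thresholds:

sourcetracking_filtering = {'datA': {'prev10_abund1': ['10', '1']}}  # hypothetical
for dat in ['datA', 'datB']:
    if dat in sourcetracking_filtering:
        filters = sourcetracking_filtering[dat]
    else:
        filters = {'0_0': ['0', '0']}   # fallback: no filtering
    for filt, (fp, fa) in filters.items():
        # '0'/'0' means no filtering is applied downstream
        print(dat, filt, fp, fa)
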
Code example #3
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str, jobs: bool,
                   chunkit: int) -> (dict, list, dict):

    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')

    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print(
            'Must provide the path to the Nestedness software '
            '(containing bin/Autocorrelation.jar)'
        )
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and isfile(
            nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print(
                'Must provide the path to the Nestedness software '
                '(containing bin/Autocorrelation.jar)'
            )
            return {}

    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)

    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                       dat, filt_raref)

        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)

        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in metrics_groups_metas_qzas_dms_trees.items(
            ):
                for group, metas_qzas_mat_qzas_trees in groups_metas_qzas_dms_trees.items(
                ):

                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            # print("case", case)
                            all_sh_pbs.setdefault((dat, out_sh),
                                                  []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd, nodfs,
                                nulls, modes, cur_sh, qza, case, case_var,
                                case_vals, binary, params, force)
                            nodfs_fps.setdefault(stats_tax_dat,
                                                 []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                break
            nestedness_res[dat].append(nestedness_raref)
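
The 'soft' entry of the nestedness config may point either at the Autocorrelation.jar file itself or at the folder that contains it under bin/. A self-contained sketch of that resolution logic (the function name is hypothetical):

from os.path import isfile

def resolve_autocorrelation_jar(soft: str) -> str:
    """Sketch of the binary resolution above: accept either the jar itself
    or the folder that contains bin/Autocorrelation.jar; return '' if the
    jar cannot be found."""
    if soft.endswith('Autocorrelation.jar') and isfile(soft):
        return soft
    binary = '%s/bin/Autocorrelation.jar' % soft
    return binary if isfile(binary) else ''
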
Code example #4
def run_doc(i_datasets_folder: str, datasets: dict, p_doc_config: str,
            datasets_rarefs: dict, force: bool, prjct_nm: str, qiime_env: str,
            chmod: str, noloc: bool, slurm: bool, run_params: dict,
            filt_raref: str, phates: dict, doc_phate: bool, split: bool,
            jobs: bool, chunkit: int) -> None:

    job_folder2 = get_job_folder(i_datasets_folder, 'doc/chunks')
    doc_filtering, doc_params, main_cases_dict = get_doc_config(p_doc_config)

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    dat_cases_tabs = {}
    need_to_run_phate = []
    need_to_run_less_phate = []
    for dat, tsv_meta_pds_ in datasets.items():
        dat_cases_tabs[dat] = {}
        if dat in doc_filtering:
            filters = doc_filtering[dat]
        else:
            filters = {'0-0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            dat_phates = []
            if dat in phates:
                dat_phates = phates[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'DOC')
            cur_raref = datasets_rarefs[dat][idx]
            dat_cases_tabs[dat][cur_raref] = {}
            if not split:
                out_sh = '%s/run_doc_%s%s%s.sh' % (job_folder2, dat,
                                                   filt_raref, cur_raref)
                out_import_sh = '%s/run_import_doc_%s%s%s.sh' % (
                    job_folder2, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
            for filt, (fp, fa) in filters.items():
                if split:
                    out_sh = '%s/run_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                    out_import_sh = '%s/run_import_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                 []).append(cur_import_sh)
                    cases = run_single_doc(
                        i_datasets_folder, odir, tsv, meta_pd, case_var,
                        doc_params, case_vals_list, cur_sh, cur_import_sh,
                        force, filt, cur_raref, fp, fa, run_params["n_nodes"],
                        run_params["n_procs"], dat_phates, doc_phate,
                        need_to_run_phate, need_to_run_less_phate)
                    dat_cases_tabs[dat][cur_raref].setdefault(case_var,
                                                              []).extend(cases)

    for need_to_run in need_to_run_phate:
        print(' -', need_to_run)

    job_folder = get_job_folder(i_datasets_folder, 'doc')
    main_sh = write_main_sh(job_folder, '3_run_import_doc%s' % filt_raref,
                            all_import_sh_pbs,
                            '%s.doc.mpt%s' % (prjct_nm, filt_raref), "4", "1",
                            "1", "500", "mb", qiime_env, chmod, noloc, slurm,
                            jobs, chunkit)
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# Import for DOC (groups config in %s)' % p_doc_config)
        else:
            print('# Import DOC')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(job_folder, '3_run_doc%s' % filt_raref, all_sh_pbs,
                            '%s.doc%s' % (prjct_nm, filt_raref),
                            run_params["time"], run_params["n_nodes"],
                            run_params["n_procs"], run_params["mem_num"],
                            run_params["mem_dim"], qiime_env, chmod, noloc,
                            slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# DOC (groups config in %s)' % p_doc_config)
        else:
            print('# DOC')
        print_message('', 'sh', main_sh, jobs)

    do_r = 1
    if do_r:
        job_folder = get_job_folder(i_datasets_folder, 'doc/R')
        job_folder2 = get_job_folder(i_datasets_folder, 'doc/R/chunks')
        main_written = 0
        main_sh = '%s/run_R_doc%s.sh' % (job_folder, filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, raref_case_var_cases in dat_cases_tabs.items():

                shs = []
                written = 0
                odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
                log_error = '%s/log.error' % odir
                for raref, case_var_cases in raref_case_var_cases.items():
                    for case_var, cases in case_var_cases.items():
                        for cdx, case in enumerate(cases):
                            plot = '%s_%s_%s_%s' % (dat, raref, case_var, cdx)
                            case_r = '%s/R' % case
                            pdf = '%s/plot.pdf' % case_r
                            do = '%s/DO.tsv' % case_r
                            if not isfile(pdf):
                                cur_r = '%s/run_R_doc_%s_%s_%s_vanilla.R' % (
                                    job_folder2, dat, case_var, cdx)
                                cur_sh = 'echo "*** %s" >> %s\n' % (plot,
                                                                    log_error)
                                cur_sh += 'R -f %s --vanilla 2>> %s\n' % (
                                    cur_r, log_error)
                                cur_sh += 'echo "end" >> %s\n' % log_error
                                shs.append(cur_sh)
                                with open(cur_r, 'w') as o:
                                    o.write("library(DOC)\n")
                                    o.write("library(ggplot2)\n")
                                    if not isfile(do):
                                        o.write(
                                            "otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, nrows=2)\n"
                                            % case)
                                        o.write(
                                            "index_name <- colnames(otu)[1]\n")
                                        o.write(
                                            "otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, row.names=index_name)\n"
                                            % case)
                                        o.write("if (dim(otu)[1] > 100) {\n")
                                        o.write("    res <- DOC(otu)\n")
                                        o.write(
                                            "    res.null <- DOC.null(otu)\n")
                                        o.write(
                                            "    write.table(x=res$DO, file='%s/DO.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$LME, file='%s/LME.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    colnames(res$NEG) <- c('Neg_Slope', 'Data')\n"
                                        )
                                        o.write(
                                            "    write.table(x=res$NEG, file='%s/NEG.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$FNS, file='%s/FNS.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$BOOT, file='%s/BOOT.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$CI, file='%s/CI.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$DO, file='%s/null_DO.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$LME, file='%s/null_LME.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    colnames(res.null$NEG) <- c('Neg_Slope', 'Data')\n"
                                        )
                                        o.write(
                                            "    write.table(x=res.null$NEG, file='%s/null_NEG.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$FNS, file='%s/null_FNS.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$BOOT, file='%s/null_BOOT.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$CI, file='%s/null_CI.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write("}\n")
                                    o.write(
                                        "res = list(BOOT=read.table('%s/BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/CI.tsv', h=T, sep='\\t'), DO=read.table('%s/DO.tsv', h=T, sep='\\t'), LME=read.table('%s/LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/NEG.tsv', h=T, sep='\\t'))\n"
                                        % (case_r, case_r, case_r, case_r,
                                           case_r, case_r))
                                    o.write(
                                        "res.null = list(BOOT=read.table('%s/null_BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/null_CI.tsv', h=T, sep='\\t'), DO=read.table('%s/null_DO.tsv', h=T, sep='\\t'), LME=read.table('%s/null_LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/null_FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/null_NEG.tsv', h=T, sep='\\t'))\n"
                                        % (case_r, case_r, case_r, case_r,
                                           case_r, case_r))
                                    o.write(
                                        "colnames(res$NEG) <- c('Neg.Slope', 'Data')\n"
                                    )
                                    o.write(
                                        "colnames(res.null$NEG) <- c('Neg.Slope', 'Data')\n"
                                    )
                                    o.write(
                                        "res$DO <- res$DO[which(res$DO$Overlap <= 1),]\n"
                                    )
                                    o.write(
                                        "res.null$DO <- res.null$DO[which(res.null$DO$Overlap <= 1),]\n"
                                    )
                                    o.write("pdf('%s')\n" % pdf)
                                    o.write(
                                        "merged <- DOC.merge(list(s_%s = res, s_%s=res.null))\n"
                                        % (plot, plot))
                                    o.write("plot(merged)\n")
                                    o.write("dev.off()\n")
                                    main_written += 1
                                    written += 1
                if written:
                    if chunkit and len(shs) >= chunkit:
                        chunks = [
                            list(x)
                            for x in np.array_split(np.array(shs), chunkit)
                        ]
                    elif split and len(shs) >= 3:
                        chunks = [
                            list(x) for x in np.array_split(np.array(shs), 3)
                        ]
                    else:
                        chunks = [shs]
                    for cdx, chunk in enumerate(chunks):
                        out_sh = '%s/run_R_doc_%s%s_%s.sh' % (job_folder2, dat,
                                                              filt_raref, cdx)
                        out_pbs = '%s.pbs' % splitext(out_sh)[0]
                        with open(out_sh, 'w') as o:
                            for c in chunk:
                                o.write('echo "%s"\n\n' % c)
                                o.write('%s\n\n' % c)
                        run_xpbs(
                            out_sh, out_pbs, '%s.doc.R.%s%s_%s' %
                            (prjct_nm, dat, filt_raref, cdx), 'xdoc',
                            run_params["time"], run_params["n_nodes"],
                            run_params["n_procs"], run_params["mem_num"],
                            run_params["mem_dim"], chmod, written, 'single',
                            main_o, noloc, slurm, jobs)
        if main_written:
            print_message('# DOC (R)', 'sh', main_sh, jobs)
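
The R plotting snippets collected above are split into chunk scripts before submission. A minimal sketch of that chunking, mirroring the chunkit/split logic in run_doc; the snippet strings are hypothetical:

import numpy as np

shs = ['R -f plot_%d.R --vanilla' % i for i in range(10)]  # hypothetical snippets
chunkit, split = 4, False
if chunkit and len(shs) >= chunkit:
    # split the snippets into `chunkit` roughly equal chunks
    chunks = [list(x) for x in np.array_split(np.array(shs), chunkit)]
elif split and len(shs) >= 3:
    chunks = [list(x) for x in np.array_split(np.array(shs), 3)]
else:
    chunks = [shs]
for cdx, chunk in enumerate(chunks):
    print(cdx, len(chunk))
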
Code example #5
def run_adonis(p_formulas: str, i_datasets_folder: str, betas: dict,
               datasets_rarefs: dict, p_perm_groups: str, force: bool,
               prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
               slurm: bool, split: bool, run_params: dict, filt_raref: str,
               jobs: bool, chunkit: int) -> None:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the ADONIS tests on beta diversity matrices.

    :param p_formulas: formulas to test.
    :param i_data_sets_folder: Path to the folder containing the data/metadata subfolders.
    :param data_sets: list of datasets.
    :param betas: beta diversity matrices.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (defalt: 775).
    """

    job_folder2 = get_job_folder(i_datasets_folder, 'adonis/chunks')

    main_cases_dict = get_main_cases_dict(p_perm_groups)
    formulas = get_formulas_dict(p_formulas)

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0

    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        if dat not in formulas:
            continue
        if not split:
            out_sh = '%s/run_adonis_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder, 'adonis/%s%s' % (dat, cur_depth))
            for metric, subset_files in metric_groups_metas_qzas_dms_trees.items():
                if split:
                    out_sh = '%s/run_adonis_%s_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, metric, filt_raref)

                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    for meta, qza, mat_qza, tree in metas_qzas_mat_qzas_trees:
                        if not isfile(mat_qza):
                            if not first_print:
                                print('Beta diversity, distances matrices must be generated already to automatise PERMANOVA\n'
                                      '\t(re-run this after steps "2_run_beta.sh" and "2x_run_beta_export.pbs" are done)')
                                first_print += 1
                            continue

                        if (dat, subset) not in metric_check:
                            meta_pd = read_meta_pd(meta).set_index('sample_name')
                            cases_dict = check_metadata_cases_dict(meta, meta_pd, dict(main_cases_dict), 'ADONIS')
                            formulas = check_metadata_formulas(meta, meta_pd, formulas[dat], 'ADONIS')
                            metric_check.add((dat, subset))

                        for fdx, form in enumerate(formulas[dat].keys()):
                            formula = formulas[dat][form]
                            for cdx, case_var in enumerate(cases_dict.keys()):
                                case_vals_list = cases_dict[case_var]
                                cur_sh = '%s/run_adonis_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric, fdx, cdx, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                                run_single_adonis(odir, subset, case_vals_list, metric, case_var,
                                                  form, formula, qza, mat_qza, meta_pd, cur_sh, force)

    job_folder = get_job_folder(i_datasets_folder, 'adonis')
    main_sh = write_main_sh(job_folder, '3_run_adonis_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
                            '%s.dns%s' % (prjct_nm, filt_raref),
                            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                            run_params["mem_num"], run_params["mem_dim"],
                            qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# Run Adonis (groups config in %s)" % p_perm_groups)
        else:
            print("# Run Adonis")
        print_message('', 'sh', main_sh, jobs)
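
run_adonis above enumerates every (formula, case variable) combination and derives one per-case script per pair, replacing spaces so the path stays shell-safe. A minimal sketch of that enumeration, with hypothetical formulas and cases:

formulas = {'sex': 'sex', 'sex_age': 'sex + age'}          # hypothetical formulas
cases_dict = {'ALL': [[]], 'body site': [['gut'], ['skin']]}  # hypothetical cases
for fdx, form in enumerate(formulas.keys()):
    for cdx, case_var in enumerate(cases_dict.keys()):
        cur_sh = '/jobs/run_adonis_datA_braycurtis_%s_%s.sh' % (fdx, cdx)
        cur_sh = cur_sh.replace(' ', '-')
        print(form, case_var, cur_sh)
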
Code example #6
def run_deicode(i_datasets_folder: str, datasets: dict, datasets_rarefs: dict,
                p_perm_groups: str, force: bool, prjct_nm: str, qiime_env: str,
                chmod: str, noloc: bool, slurm: bool, run_params: dict,
                filt_raref: str, jobs: bool, chunkit: int) -> None:
    """
    Performs robust center log-ratio transform robust PCA and
    ranks the features by the loadings of the resulting SVD.
    https://library.qiime2.org/plugins/deicode/19/
    Main per-dataset looper for the ADONIS tests on beta diversity matrices.

    :param i_data_sets_folder: Path to the folder containing the data/metadata subfolders.
    :param data_sets: list of data_sets.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (defalt: 775).
    """
    job_folder2 = get_job_folder(i_datasets_folder, 'deicode/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    # jobs = []
    all_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        out_sh = '%s/run_deicode_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                filt_raref)
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            cur_raref = datasets_rarefs[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_alphas = meta.replace('.tsv', '_alphas.tsv')
            meta_alphas_full = meta.replace('.tsv', '_alphas_full.tsv')
            if isfile(meta_alphas_full):
                meta = meta_alphas_full
            elif isfile(meta_alphas):
                meta = meta_alphas
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'DEICODE')
            odir = get_analysis_folder(i_datasets_folder,
                                       'deicode/%s%s' % (dat, cur_raref))
            for case_var, case_vals_list in cases_dict.items():
                cur_sh = '%s/run_beta_deicode_%s_%s%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, case_var,
                    filt_raref)
                cur_sh = cur_sh.replace(' ', '-')
                all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                run_single_deicode(odir, tsv, meta_pd, case_var,
                                   case_vals_list, cur_sh, force)

    job_folder = get_job_folder(i_datasets_folder, 'deicode')
    main_sh = write_main_sh(
        job_folder, '3_run_beta_deicode_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.dcd%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            if p_perm_groups.startswith('/panfs'):
                p_perm_groups = p_perm_groups.replace(os.getcwd(), '')
            print('# DEICODE (groups config in %s)' % p_perm_groups)
        else:
            print('# DEICODE')
        print_message('', 'sh', main_sh, jobs)
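
run_deicode above prefers metadata files that were already enriched with alpha diversity columns, when such files exist on disk. A self-contained sketch of that fallback (the function name is hypothetical):

from os.path import isfile

def pick_metadata(meta: str) -> str:
    """Sketch of the metadata fallback above: prefer the fully enriched
    '_alphas_full.tsv' file, then '_alphas.tsv', then the original file."""
    meta_alphas = meta.replace('.tsv', '_alphas.tsv')
    meta_alphas_full = meta.replace('.tsv', '_alphas_full.tsv')
    if isfile(meta_alphas_full):
        return meta_alphas_full
    if isfile(meta_alphas):
        return meta_alphas
    return meta
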
Code example #7
def run_permanova(i_datasets_folder: str, betas: dict,
                  main_testing_groups: tuple, p_perm_tests_min: int,
                  p_beta_type: tuple, datasets_rarefs: dict,
                  p_perm_groups: str, force: bool, prjct_nm: str,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  split: bool, run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> dict:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the PERMANOVA tests on beta diversity matrices.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param betas: beta diversity matrices.
    :param main_testing_groups: groups to test.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nickname for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    permanovas = {}
    job_folder2 = get_job_folder(i_datasets_folder, 'permanova/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)

    npermutations = 999

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        permanovas[dat] = []
        if not split:
            out_sh = '%s/run_beta_group_significance_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'permanova/%s%s' % (dat, cur_depth))
            for metric, subset_files in metric_groups_metas_qzas_dms_trees.items(
            ):
                permanovas.setdefault(dat, []).append(metric)
                if split:
                    out_sh = '%s/run_beta_group_significance_%s_%s_%s%s.sh' % (
                        job_folder2, prjct_nm, dat, metric, filt_raref)
                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    (meta, qza, mat_qza, tree) = metas_qzas_mat_qzas_trees[0]
                    if not isfile(mat_qza):
                        if not first_print:
                            print(
                                'Beta diversity, distances matrices must be generated already to automatise PERMANOVA\n'
                                '\t(re-run this after steps "2_run_beta.sh" and "2x_run_beta_export.pbs" are done)'
                            )
                            first_print += 1
                        continue
                    if (dat, subset) not in metric_check:
                        meta_pd = read_meta_pd(meta)
                        meta_pd = meta_pd.set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(main_cases_dict), 'PERMANOVA')
                        testing_groups = check_metadata_testing_groups(
                            meta, meta_pd, main_testing_groups,
                            p_perm_tests_min, 'PERMANOVA')
                        metric_check.add((dat, subset))

                    for case_var, case_vals_list in cases_dict.items():
                        testing_groups_case_var = list(
                            set(testing_groups + [case_var]))
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            for testing_group in testing_groups_case_var:
                                if testing_group == 'ALL':
                                    continue
                                cur_sh = '%s/run_beta_group_significance_%s%s_%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric,
                                    subset, case, testing_group, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                run_single_perm(odir, subset, meta_pd, cur_sh,
                                                metric, case, testing_group,
                                                p_perm_tests_min, p_beta_type,
                                                qza, mat_qza, case_var,
                                                case_vals, npermutations,
                                                force)
Code example #8
def run_mantel(i_datasets_folder: str, datasets_filt: dict, p_mantel: str,
               betas: dict, force: bool, prjct_nm: str, qiime_env: str,
               chmod: str, noloc: bool, slurm: bool, split: bool,
               run_params: dict, filt_raref: str, filt_only: bool,
               eval_depths: dict, jobs: bool, chunkit: int) -> None:
    """
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        mantel_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                mantel_pairs['%s_%s' % (n0, n1)] = [x, y]
        mantel_subsets = {'ALL': [[]]}
    else:
        mantel_pairs, mantel_subsets = get_procrustes_mantel_dicts(p_mantel)

    get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)

    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in mantel_pairs.items():

        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)

        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'mantel%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_mantel_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in metrics_groups_metas_qzas_dms_trees1.items(
        ):
            if split:
                out_sh = '%s/run_mantel_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = metrics_groups_metas_qzas_dms_trees2[
                metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_

                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]

                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print(
                        '[Mantels] One desired rarefaction depth not run (pair %s)'
                        % pair)
                    continue

                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue

                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(meta1, meta_pd,
                                                       dict(mantel_subsets),
                                                       'mantel')
                odir = get_analysis_folder(
                    i_datasets_folder, 'mantel%s/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder, 'mantel%s/chunks/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))

                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_mantel%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)

                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        mantel_out = '%s/mantel%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel('mantel', odir, dm1, dm2,
                                                     meta_pd, dm_out1, dm_out2,
                                                     mantel_out, cur_sh, cur,
                                                     case_var, case_vals,
                                                     force)

    job_folder = get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)
    main_sh = write_main_sh(
        job_folder, '4_run_mantel_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.mntl%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_mantel and p_mantel != 1:
            if p_mantel.startswith('/panfs'):
                p_mantel = p_mantel.replace(os.getcwd(), '')
            print('# Mantels (pairs and samples subsets config in %s)' %
                  p_mantel)
        else:
            print('# Mantels')
        print_message('', 'sh', main_sh, jobs)
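
In evaluation mode, run_mantel (and run_procrustes below) builds its pairs by sorting a dataset's rarefaction labels by depth and comparing each depth with the next one. A minimal sketch of that pairing, with hypothetical depth labels:

eval_depths = {'datA': ['datA_raref_2000', 'datA_raref_500', 'datA_raref_1000']}
mantel_pairs = {}
for dat, depths in eval_depths.items():
    # sort labels by their trailing numeric depth, then pair consecutive ones
    sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
    for idx, x in enumerate(sorted_depths[:-1]):
        y = sorted_depths[idx + 1]
        mantel_pairs['%s_%s' % (x.split('_')[-1], y.split('_')[-1])] = [x, y]
print(mantel_pairs)   # {'500_1000': [...], '1000_2000': [...]}
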
Code example #9
def run_procrustes(i_datasets_folder: str, datasets_filt: dict,
                   p_procrustes: str, betas: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str,
                   filt_only: bool, eval_depths: dict, jobs: bool,
                   chunkit: int) -> None:
    """
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        procrustes_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                procrustes_pairs['%s_%s' % (n0, n1)] = [x, y]
        procrustes_subsets = {'ALL': [[]]}
    else:
        procrustes_pairs, procrustes_subsets = get_procrustes_mantel_dicts(
            p_procrustes)
    get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    dms_tab = []
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in procrustes_pairs.items():

        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)

        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'procrustes%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_procrustes_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in metrics_groups_metas_qzas_dms_trees1.items(
        ):
            if split:
                out_sh = '%s/run_procrustes_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = metrics_groups_metas_qzas_dms_trees2[
                metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_

                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]

                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print(
                        '[Procrustes] One desired rarefaction depth not run (pair %s)'
                        % pair)
                    continue

                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue

                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(procrustes_subsets), 'procrustes')
                odir = get_analysis_folder(
                    i_datasets_folder, 'procrustes%s/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder, 'procrustes%s/chunks/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_procrustes%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)

                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        dm_out1_tsv = '%s.tsv' % splitext(dm_out1)[0]
                        dm_out2_tsv = '%s.tsv' % splitext(dm_out2)[0]
                        biplot = '%s/procrustes%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel('procrustes', odir, dm1,
                                                     dm2, meta_pd, dm_out1,
                                                     dm_out2, biplot, cur_sh,
                                                     cur, case_var, case_vals,
                                                     force)
                        dms_tab.append([
                            pair, dat1_, dat2_, group1, group2, case_, metric,
                            dm_out1_tsv, dm_out2_tsv
                        ])

    job_folder = get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_procrustes_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.prcst%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_procrustes and p_procrustes != 1:
            if p_procrustes.startswith('/panfs'):
                p_procrustes = p_procrustes.replace(os.getcwd(), '')
            print('# Procrustes (pairs and samples subsets config in %s)' %
                  p_procrustes)
        else:
            print('# Procrustes')
        print_message('', 'sh', main_sh, jobs)

    dms_tab_pd = pd.DataFrame(dms_tab,
                              columns=[
                                  'pair',
                                  'dat1',
                                  'dat2',
                                  'group1',
                                  'group2',
                                  'case',
                                  'metric',
                                  'dm_out1',
                                  'dm_out2',
                              ])

    odir = get_analysis_folder(i_datasets_folder,
                               'procrustes%s/R' % evaluation)
    out_Rs = glob.glob('%s/pairs_proscrustes_results%s%s*.tsv' %
                       (odir, evaluation, filt_raref))
    if len(out_Rs):
        done_R = pd.concat([pd.read_table(x, sep=' ') for x in out_Rs])
        dms_tab_pd = dms_tab_pd.loc[~dms_tab_pd[['dm_out1', 'dm_out2']].sum(1).
                                    isin(done_R[['f1', 'f2']].sum(1))]

    if dms_tab_pd.shape[0]:
        fp_num = 0
        if len(out_Rs):
            last = sorted(
                out_Rs, key=lambda fp: int(fp.split('.tsv')[0].split('_')[-1]))
            fp_num = int(last[-1].split('.tsv')[0].split('_')[-1]) + 1

        dms_tab_fp = '%s/pairs%s%s_%s.tsv' % (odir, evaluation, filt_raref,
                                              fp_num)
        dms_tab_pd.to_csv(dms_tab_fp, index=False, sep='\t')
        out_R = '%s/pairs_proscrustes_results%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)
        job_folder = get_job_folder(i_datasets_folder, 'procrustes/R')
        R_script = '%s/4_run_procrustes_%s%s.R' % (job_folder, prjct_nm,
                                                   filt_raref)
        with open(R_script, 'w') as o:
            o.write("library(vegan)\n")
            o.write("dms_files <- read.table('%s', h=T)\n" % dms_tab_fp)
            o.write(
                "cols <- c('pair', 'd1', 'd2', 'g1', 'g2', 'case', 'metric', 'f1', 'f2', 'samples', 'M2', 'p-value')\n"
            )
            o.write(
                "res <- setNames(data.frame(matrix(ncol = 12, nrow = 0)), cols)\n"
            )
            o.write("for (i in seq(1, dim(dms_files)[1])) {\n")
            o.write("    row <- as.vector(unlist(dms_files[i,]))\n")
            o.write("    pair <- row[1]\n")
            o.write("    d1 <- row[2]\n")
            o.write("    d2 <- row[3]\n")
            o.write("    group1 <- row[4]\n")
            o.write("    group2 <- row[5]\n")
            o.write("    case <- row[6]\n")
            o.write("    metric <- row[7]\n")
            o.write("    f1 <- row[8]\n")
            o.write("    f2 <- row[9]\n")
            o.write("    if (sum(file.exists(f1, f2)) == 2) {\n")
            o.write(
                "        filin_tsv_pd1 <- read.csv(f1, header = TRUE, check.names=FALSE,\n"
            )
            o.write(
                "                                  row.names = 1, colClasses = 'character', sep = '\\t')\n"
            )
            o.write(
                "        filin_tsv_pd2 <- read.csv(f2, header = TRUE, check.names=FALSE,\n"
            )
            o.write(
                "                                  row.names = 1, colClasses = 'character', sep = '\\t')\n"
            )
            o.write("        filin_tsv_pd1 <- data.matrix(filin_tsv_pd1)\n")
            o.write("        filin_tsv_pd2 <- data.matrix(filin_tsv_pd2)\n")
            o.write(
                "        filin_tsv_pd1 <- filin_tsv_pd1[rownames(filin_tsv_pd2), rownames(filin_tsv_pd2)]\n"
            )
            o.write(
                "        # procrustes12 <- procrustes(filin_tsv_pd1, filin_tsv_pd2, kind=2, permutations=999)\n"
            )
            o.write(
                "        prtst <- protest(filin_tsv_pd1, filin_tsv_pd2, permutations = 999)\n"
            )
            o.write("        n <- dim(filin_tsv_pd1)[1]\n")
            o.write(
                "        res[i,] <- c(pair, d1, d2, group1, group2, case, metric, f1, f2, n, prtst$ss, prtst$signif)\n"
            )
            o.write("    }\n")
            o.write("}\n")
            o.write("write.table(x = res, file = '%s')\n" % out_R)

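        # Shell wrapper for the R script, submitted or printed via run_xpbs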
        out_sh = '%s/4_run_procrustes_%s%s_R%s.sh' % (job_folder, prjct_nm,
                                                      evaluation, filt_raref)
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
        with open(out_sh, 'w') as o:
            o.write('R -f %s --vanilla\n' % R_script)

        run_xpbs(
            out_sh, out_pbs,
            '%s.prcrt%s.R%s' % (prjct_nm, evaluation, filt_raref), 'renv',
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"], chmod, 1,
            '# Procrustes for stats in R (pairs and samples subsets config in %s)'
            % p_procrustes, None, False, jobs)
Code example #10
0
def run_phate(p_phate_config: str, i_datasets_folder: str, datasets: dict,
              datasets_rarefs: dict, force: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, slurm: bool,
              split: bool, run_params: dict, filt_raref: str, jobs: bool,
              chunkit: int) -> dict:

    job_folder2 = get_job_folder(i_datasets_folder, 'phate/chunks')
    phate_dicts = get_phate_dicts(p_phate_config)
    phate_filtering, phate_labels, phate_params, main_cases_dict = phate_dicts

    phates = {}
    all_sh_pbs = {}
    all_import_sh_pbs = {}
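    # One PHATE run per dataset, rarefaction depth, filter and metadata subset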
    for dat, tsv_meta_pds_ in datasets.items():
        phates[dat] = []
        if dat in phate_filtering:
            filters = phate_filtering[dat]
        else:
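            # no filtering configured for this dataset: default '0'/'0' (fp, fa) thresholds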
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'phate')
            cur_raref = datasets_rarefs[dat][idx]
            if not split:
                out_sh = '%s/run_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
                out_import_sh = '%s/run_import_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'phate/%s' % dat)
            raref_phates = {}
            for filt, (fp, fa) in filters.items():
                raref_phates[filt] = {}
                if split:
                    out_sh = '%s/run_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                    out_import_sh = '%s/run_import_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                 []).append(cur_import_sh)
                    phate = run_single_phate(dat, odir, tsv, meta_pd, case_var,
                                             phate_labels, phate_params,
                                             run_params, case_vals_list,
                                             cur_sh, cur_import_sh, force,
                                             filt, cur_raref, fp, fa)
                    raref_phates[filt][case_var] = phate
            phates[dat].append(raref_phates)

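    # Two launcher scripts: one importing the PHATE inputs, one running PHATE itself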
    job_folder = get_job_folder(i_datasets_folder, 'phate')
    main_sh = write_main_sh(
        job_folder, '3_run_import_phate_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mrt.pht%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# Import for PHATE (groups config in %s)' % p_phate_config)
        else:
            print('# Import for PHATE')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(
        job_folder, '3_run_phate_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.pht%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], 'xphate', chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# PHATE (groups config in %s)' % p_phate_config)
        else:
            print('# PHATE')
        print_message('', 'sh', main_sh, jobs)
    return phates
Code example #11
0
def run_alpha_group_significance(i_datasets_folder: str, datasets: dict,
                                 diversities: dict, datasets_rarefs: dict,
                                 p_perm_groups: str, force: bool,
                                 prjct_nm: str, qiime_env: str, chmod: str,
                                 noloc: bool, slurm: bool, As: tuple,
                                 split: bool, run_params: dict,
                                 filt_raref: str, jobs: bool,
                                 chunkit: int) -> None:
    """
    Run alpha-group-significance: Alpha diversity comparisons.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-group-significance/
    Main per-dataset looper for the Kruskal-Wallis tests on alpha diversity vectors.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: list of datasets.
    :param diversities: alpha diversity qiime2 Arfetact per dataset.
    :param p_perm_groups: path to the subsets file.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (defalt: 775).
    """

    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha_group_significance/chunks')
    # alpha_metrics = get_metrics('alpha_metrics', As)
    main_cases_dict = get_main_cases_dict(p_perm_groups)

    procs = []
    all_sh_pbs = {}
    first_print = 0

    for dat, tsv_meta_pds_ in datasets.items():
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            meta = tsv_meta_pds[1]
            cur_raref = datasets_rarefs[dat][idx]
            raref_diversities = diversities[dat][idx]

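            # The alpha diversity vectors must already exist on disk (step "1_run_alpha.sh")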
            presence_mat = [
                1 for (qza, metric) in raref_diversities[''] if isfile(qza)
            ]
            if not presence_mat:
                if not first_print:
                    print(
                        'Alpha diversity must already be computed to automatise Kruskal-Wallis tests\n'
                        '\t(re-run this after step "1_run_alpha.sh" is done)')
                    first_print += 1
                continue

            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'alpha Kruskal-Wallis')

            odir = get_analysis_folder(
                i_datasets_folder,
                'alpha_group_significance/%s%s' % (dat, cur_raref))
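            # One output script per alpha metric; one command script per metadata subset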
            for (qza, metric) in raref_diversities['']:
                # metric = get_metric(alpha_metrics, qza)
                div_tsv = '%s.tsv' % splitext(qza)[0]
                if not isfile(qza) or not isfile(div_tsv):
                    print(
                        '  [KRUSKAL-WALLIS] metric %s not calculated\nSkipping it...'
                        % metric)
                    continue
                out_sh = '%s/run_alpha_group_significance_%s%s_%s%s.sh' % (
                    job_folder2, dat, cur_raref, metric, filt_raref)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_alpha_group_significance_%s%s_%s_%s%s.sh' % (
                        job_folder2, dat, cur_raref, metric, case_var,
                        filt_raref)
                    cur_sh = cur_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
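                    # write the Kruskal-Wallis commands for this subset in a separate process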
                    p = multiprocessing.Process(target=run_multi_kw,
                                                args=(odir, meta_pd, qza,
                                                      case_vals_list, case_var,
                                                      cur_sh, force))
                    p.start()
                    procs.append(p)
    for p in procs:
        p.join()

    job_folder = get_job_folder(i_datasets_folder, 'alpha_group_significance')
    main_sh = write_main_sh(
        job_folder,
        '6_run_alpha_group_significance_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.kv%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# Kruskal-Wallis on alpha diversity (groups config in %s)" %
                  p_perm_groups)
        else:
            print("# Kruskal-Wallis on alpha diversity")
        print_message('', 'sh', main_sh, jobs)