コード例 #1
0
def main():
    '''Command-line entry point: parse arguments and run sequence placement.

    Intermediate files go to ``args.intermediate`` when that option is set
    (the directory is created first); otherwise they go to a temporary
    directory that only exists for the duration of the pipeline call.'''

    args = parser.parse_args()

    def _place_into(work_dir):
        # Single call site for the pipeline; only the directory receiving the
        # intermediate files differs between the two cases below.
        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_dir=args.ref_dir,
                            out_tree=args.out_tree,
                            threads=args.processes,
                            out_dir=work_dir,
                            chunk_size=args.chunk_size,
                            print_cmds=args.print_cmds)

    # No intermediate directory requested: use a throwaway one.
    if not args.intermediate:
        with TemporaryDirectory() as scratch_dir:
            _place_into(scratch_dir)
        return

    # Intermediate directory requested: create it and keep the files there.
    make_output_dir(args.intermediate)
    _place_into(args.intermediate)
コード例 #2
0
def main():
    '''Command-line entry point: validate inputs and run sequence placement.

    The study FASTA, reference MSA and tree files are checked for existence
    before anything else happens. Intermediate files go to
    ``args.intermediate`` when set (created first), otherwise to a temporary
    directory that only exists while the pipeline runs.'''

    args = parser.parse_args()

    # Check that input filenames exist.
    check_files_exist([args.study_fasta, args.ref_msa, args.tree])

    def _place_into(work_dir):
        # Only the directory for intermediate files varies between branches.
        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=args.out_tree,
                            threads=args.threads,
                            papara_output=args.papara_output,
                            out_dir=work_dir,
                            chunk_size=args.chunk_size,
                            print_cmds=args.print_cmds)

    if args.intermediate:
        # Keep intermediate files in the user-specified directory.
        make_output_dir(args.intermediate)
        _place_into(args.intermediate)
    else:
        # Otherwise work in a temporary directory that is removed afterwards.
        with TemporaryDirectory() as scratch_dir:
            _place_into(scratch_dir)
コード例 #3
0
def run_epa_ng(tree: str,
               ref_msa_fastafile: str,
               study_msa_fastafile: str,
               model: str,
               out_dir: str,
               chunk_size=5000,
               threads=1,
               print_cmds=False):
    '''Run EPA-NG on the given tree, reference MSA and study sequence MSA.

    out_dir is created if needed and passed to epa-ng as its working
    directory (-w). After the run, "epa_result.jplace" in out_dir is passed
    through parse_jplace to produce "epa_result_parsed.jplace" alongside it.'''

    make_output_dir(out_dir)

    # Assemble the command option by option; argument order is preserved.
    epa_ng_command = ["epa-ng"]
    epa_ng_command += ["--tree", tree]
    epa_ng_command += ["--ref-msa", ref_msa_fastafile]
    epa_ng_command += ["--query", study_msa_fastafile]
    epa_ng_command += ["--chunk-size", str(chunk_size)]
    epa_ng_command += ["-T", str(threads)]
    epa_ng_command += ["-m", model]
    epa_ng_command += ["-w", out_dir]
    epa_ng_command += ["--filter-acc-lwr", "0.99"]
    epa_ng_command += ["--filter-max", "100"]

    system_call_check(epa_ng_command, print_out=print_cmds)

    # Parse jplace file so that output is reproducible.
    parse_jplace(path.join(out_dir, "epa_result.jplace"),
                 path.join(out_dir, "epa_result_parsed.jplace"))
コード例 #4
0
ファイル: place_seqs.py プロジェクト: arghya1611/picrust2
def run_epa_ng(tree: str,
               ref_msa_fastafile: str,
               study_msa_fastafile: str,
               out_dir: str,
               chunk_size=5000,
               threads=1,
               print_cmds=False):
    '''Run EPA-NG on specified tree, reference MSA, and study sequence MSA.

    out_dir is created if needed and handed to epa-ng as its working
    directory (-w), which is where its output is expected to be written.'''

    make_output_dir(out_dir)

    # The command is passed as one space-separated string, so none of the
    # paths may contain whitespace.
    epa_ng_cmd = " ".join([
        "epa-ng",
        "--tree", tree,
        "--ref-msa", ref_msa_fastafile,
        "--query", study_msa_fastafile,
        "--chunk-size", str(chunk_size),
        "-T", str(threads),
        "-w", out_dir,
    ])

    system_call_check(epa_ng_cmd, print_out=print_cmds)
コード例 #5
0
def main():
    '''Write one FASTA file per sample with each ASV repeated once per count.

    Reads the full FASTA (--fasta) and the BIOM table (--biom), then for every
    column (sample) of the table writes "<outdir>/<sample>.fasta". Each row
    id (ASV) with a positive count in that sample is written int(count) times
    under the header ">asv_sample_i". Exits with a message if the FASTA
    contains no sequences.'''

    parser = argparse.ArgumentParser(
        description="Creates output FASTA for each sample with each ASV repeated for every count in that sample.",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f", "--fasta", metavar="FASTA", type=str,
                        help="Path to full FASTA file.", required=True)

    parser.add_argument("-b", "--biom", metavar="BIOM", type=str,
                        help="Path to BIOM table.", required=True)

    parser.add_argument("-o", "--outdir", metavar="PATH", type=str,
                        help="Name of folder to make for output files.", required=True)

    args = parser.parse_args()

    in_fasta = read_fasta(args.fasta)

    in_table = biom_to_pandas_df(biom.load_table(args.biom))

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences in file.")

    make_output_dir(args.outdir)

    for sample in in_table.columns:
        sample_outfile = args.outdir + "/" + sample + ".fasta"

        # Use a context manager so the handle is closed even if a lookup or
        # write below raises (e.g. an ASV id that is absent from the FASTA).
        with open(sample_outfile, 'wt') as sample_outfh:

            for asv in in_table.index.values:
                asv_count = in_table.loc[asv, sample]
                if asv_count > 0:
                    # One record per count; the index suffix keeps the
                    # headers unique within the sample file.
                    for i in range(int(asv_count)):
                        print(">" + asv + "_" + sample + "_" + str(i), file=sample_outfh)
                        print(in_fasta[asv], file=sample_outfh)
コード例 #6
0
def main():
    '''Command-line entry point: run the MinPath pipeline and write results.

    Intermediate files go to ``args.intermediate`` when set (created first),
    otherwise to a temporary directory. The two tables returned by the
    pipeline are written to "<out_prefix>_unstrat_path.tsv" and
    "<out_prefix>_strat_path.tsv" as tab-separated files.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    def _run_in(work_dir):
        # Only the directory for intermediate files differs between branches.
        return run_minpath_pipeline(inputfile=args.input,
                                    mapfile=args.map,
                                    proc=args.proc,
                                    out_dir=work_dir,
                                    print_cmds=args.print_cmds)

    if args.intermediate:
        # Keep intermediate files in the requested directory.
        make_output_dir(args.intermediate)
        unstrat_out, strat_out = _run_in(args.intermediate)
    else:
        # Otherwise use a temporary directory for the intermediate files.
        with TemporaryDirectory() as scratch_dir:
            unstrat_out, strat_out = _run_in(scratch_dir)

    # Write output files.
    out_prefix = args.out_prefix

    unstrat_out.to_csv(path_or_buf=out_prefix + "_unstrat_path.tsv",
                       sep="\t",
                       index_label="pathway")

    strat_out.to_csv(path_or_buf=out_prefix + "_strat_path.tsv",
                     sep="\t",
                     index=False)
コード例 #7
0
ファイル: pathway_pipeline.py プロジェクト: semir2/picrust2
def main():
    '''Command-line entry point: run the pathway pipeline and write tables.

    Intermediate files go to ``args.intermediate`` when set (created first),
    otherwise to a temporary directory. Results are written as gzipped,
    tab-separated files inside ``args.out_dir``: the unstratified abundance
    table always, the coverage tables only when ``args.coverage`` is set, and
    the stratified tables only when the pipeline returned something other
    than None for them.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    # The command line exposes "off" switches; the pipeline takes "on" flags.
    gap_fill_opt = not args.no_gap_fill
    run_minpath_opt = not args.skip_minpath

    def _run_in(work_dir):
        # Only the directory for intermediate files differs between branches.
        return pathway_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=args.regroup_map,
            proc=args.proc,
            out_dir=work_dir,
            run_minpath=run_minpath_opt,
            coverage=args.coverage,
            gap_fill_on=gap_fill_opt,
            no_regroup=args.no_regroup,
            per_sequence_contrib=args.per_sequence_contrib,
            per_sequence_abun=args.per_sequence_abun,
            per_sequence_function=args.per_sequence_function,
            print_cmds=args.print_cmds)

    if args.intermediate:
        # Keep intermediate files in the requested directory.
        make_output_dir(args.intermediate)
        unstrat_abun, unstrat_cov, strat_abun, strat_cov = _run_in(
            args.intermediate)
    else:
        # Otherwise use a temporary directory for the intermediate files.
        with TemporaryDirectory() as scratch_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = _run_in(
                scratch_dir)

    make_output_dir(args.out_dir)

    def _write_gz(table, basename, **csv_opts):
        # Every output shares the tab separator and gzip compression; only
        # the index handling differs between table types.
        table.to_csv(path_or_buf=path.join(args.out_dir, basename),
                     sep="\t",
                     compression="gzip",
                     **csv_opts)

    # Write output files.
    _write_gz(unstrat_abun, "path_abun_unstrat.tsv.gz", index_label="pathway")

    if args.coverage:
        _write_gz(unstrat_cov, "path_cov_unstrat.tsv.gz",
                  index_label="pathway")

    # Write stratified output only if something besides None was returned.
    if strat_abun is not None:
        _write_gz(strat_abun, "path_abun_strat.tsv.gz", index=False)

    if args.coverage and strat_cov is not None:
        _write_gz(strat_cov, "path_cov_strat.tsv.gz", index=False)
コード例 #8
0
def pathway_pipeline(inputfile,
                     mapfile,
                     out_dir,
                     proc=1,
                     run_minpath=True,
                     coverage=False,
                     no_regroup=False,
                     regroup_mapfile=None,
                     gap_fill_on=True,
                     per_sequence_contrib=False,
                     per_sequence_abun=None,
                     per_sequence_function=None,
                     print_cmds=False):
    '''Full pipeline for reading input files and making calls to functions
    that run MinPath and calculate pathway abundances and coverages.

    Intermediate files are written under out_dir ("regrouped_infile.tsv" when
    regrouping, "parsed_mapfile.tsv" when run_minpath is set, and the
    "minpath_running" / "minpath_running_per_seq" sub-directories).

    Exits via sys.exit if per_sequence_contrib is set but per_sequence_abun
    or per_sequence_function is missing.

    Returns a 4-tuple:
    (1) unstratified pathway abundances (always computed),
    (2) unstratified pathway coverages (None unless coverage is set),
    (3) stratified pathway abundances (None when the input table is
        unstratified and per_sequence_contrib is not set),
    (4) stratified pathway coverages (None unless per_sequence_contrib is
        set, in which case it is whatever per_sequence_contrib_levels
        returns).'''

    # If no regrouping flag set then set input regrouping mapfile to be None.
    if no_regroup:
        regroup_mapfile = None

    # Read in table of gene family abundances and determine if in stratified
    # format or not.
    in_metagenome, strat_format = read_metagenome_input(inputfile)

    # Basic checks if --per_sequence_contrib set.
    if per_sequence_contrib:

        # Throw error if --per_sequence_contrib set, but --per_sequence_abun
        # and/or --per_sequence_function not set.
        if not per_sequence_abun or not per_sequence_function:
            sys.exit("Error: \"per_sequence_contrib\" option set, but at "
                     "least one of \"per_sequence_abun\" or "
                     "\"per_sequence_function\" were not set. These input "
                     "arguments need to be specified when "
                     "\"per_sequence_contrib\" is used")

        # Make sure that the input files for --per_sequence_contrib exist.
        check_files_exist([per_sequence_abun, per_sequence_function])

    # Remove 'description' column if it exists.
    if "description" in in_metagenome.columns:
        in_metagenome.drop("description", axis=1, inplace=True)

    # Get list of sample ids (every column that is not an id column).
    samples = [
        col for col in in_metagenome.columns
        if col not in ["function", "sequence"]
    ]

    # Initialize reactions to be empty unless regroup mapfile given.
    reactions = []

    # Regroup functions in input table to different ids if regroup mapfile is
    # provided. The regrouped table is also written to out_dir.
    if regroup_mapfile:
        reactions = read_reaction_names(regroup_mapfile)

        in_metagenome = regroup_func_ids(in_metagenome, strat_format,
                                         regroup_mapfile, proc)
        regrouped_outfile = path.join(out_dir, "regrouped_infile.tsv")
        in_metagenome.to_csv(path_or_buf=regrouped_outfile,
                             sep="\t",
                             index=False)

    # Read in pathway structures.
    pathways_in = PathwaysDatabase(database=mapfile, reaction_names=reactions)

    # Write out mapfile with all structure removed (only needed by MinPath).
    if run_minpath:
        minpath_mapfile = path.join(out_dir, "parsed_mapfile.tsv")
        with open(minpath_mapfile, "w") as out_map:
            out_map.write(pathways_in.get_database())
    else:
        minpath_mapfile = None

    # Subset input table of reactions to only those found in pathway database.
    in_metagenome = in_metagenome[in_metagenome.function.isin(
        pathways_in.reaction_list())]

    # Initialize output objects to be None (except for unstratified abundance,
    # which is always assigned in one of the branches below).
    path_cov_unstrat = None
    path_cov_strat = None
    path_abun_strat = None

    minpath_out_dir = path.join(out_dir, "minpath_running")
    make_output_dir(minpath_out_dir)

    # Run minpath wrapper on all samples if table is stratified. Note that
    # input stratified table is subsetted to required columns only.
    if strat_format:

        # Get unstratified and stratified pathway levels.
        # Note that stratified tables will only be returned by this step if
        # per_sequence_contrib=False (extra step required below).
        path_out_raw = Parallel(n_jobs=proc)(
            delayed(basic_strat_pathway_levels)
            (sample_id, in_metagenome[["function", "sequence", sample_id]],
             minpath_mapfile, minpath_out_dir, pathways_in, run_minpath,
             coverage, gap_fill_on, per_sequence_contrib, print_cmds)
            for sample_id in samples)

        # Split the output into unstratified and stratified.
        path_raw_abun_unstrat = []
        path_raw_cov_unstrat = []

        if not per_sequence_contrib:
            path_raw_abun_strat = []

            # Each per-sample result is indexed as (unstratified abundance,
            # unstratified coverage, stratified abundance).
            for sample_output in path_out_raw:
                path_raw_abun_unstrat += [sample_output[0]]
                path_raw_cov_unstrat += [sample_output[1]]
                path_raw_abun_strat += [sample_output[2]]

            # If --per_sequence_contrib not set then prep output stratified
            # table the same as the unstratified table(s) below.
            path_abun_strat = prep_pathway_df_out(path_raw_abun_strat,
                                                  strat_index=True)

            path_abun_strat.columns = ["pathway", "sequence"] + samples

            path_abun_strat.sort_values(['pathway', 'sequence'], inplace=True)

        else:

            # Only the first two elements are used here; the stratified
            # tables are computed by the per-sequence step further down.
            for sample_output in path_out_raw:
                path_raw_abun_unstrat += [sample_output[0]]
                path_raw_cov_unstrat += [sample_output[1]]

        # Prep unstratified output tables.
        path_abun_unstrat = prep_pathway_df_out(path_raw_abun_unstrat)
        path_abun_unstrat.columns = samples

        if coverage:
            path_cov_unstrat = prep_pathway_df_out(path_raw_cov_unstrat,
                                                   num_digits=10)
            path_cov_unstrat.columns = samples

    # Otherwise the data is in unstratified format, which is more straight-
    # forward to process.
    else:
        path_raw_unstrat = Parallel(n_jobs=proc)(
            delayed(unstrat_pathway_levels)(sample_id, in_metagenome[[
                "function", sample_id
            ]], minpath_mapfile, minpath_out_dir, pathways_in, run_minpath,
                                            coverage, gap_fill_on, print_cmds)
            for sample_id in samples)

        # Prep output df. Each per-sample result is indexed as (abundance,
        # coverage).
        path_raw_abun_unstrat = []
        path_raw_cov_unstrat = []

        for sample_output in path_raw_unstrat:
            path_raw_abun_unstrat += [sample_output[0]]
            path_raw_cov_unstrat += [sample_output[1]]

        path_abun_unstrat = prep_pathway_df_out(path_raw_abun_unstrat)

        if coverage:
            path_cov_unstrat = prep_pathway_df_out(path_raw_cov_unstrat,
                                                   num_digits=10)
            path_cov_unstrat.columns = samples
        else:
            # Redundant with the initialization above; kept as-is.
            path_cov_unstrat = None

        # Set column labels of unstratified dataframe to be sample names.
        path_abun_unstrat.columns = samples

    # Sort unstratified output tables by index name.
    path_abun_unstrat.sort_index(axis=0, inplace=True)

    if coverage:
        path_cov_unstrat.sort_index(axis=0, inplace=True)

    # Calculate pathway levels for each individual sequence (in parallel)
    # and then multiply this table by the abundance of each sequence
    # within each sample (using same approach as in metagenome pipeline).
    # This runs regardless of whether the input table was stratified.
    if per_sequence_contrib:

        per_seq_out_dir = path.join(out_dir, "minpath_running_per_seq")
        make_output_dir(per_seq_out_dir)

        path_abun_strat, path_cov_strat = per_sequence_contrib_levels(
            sequence_abun=per_sequence_abun,
            sequence_func=per_sequence_function,
            minpath_map=minpath_mapfile,
            per_seq_out_dir=per_seq_out_dir,
            pathway_db=pathways_in,
            run_minpath=run_minpath,
            calc_coverage=coverage,
            gap_fill_on=gap_fill_on,
            nproc=proc,
            regroup_map=regroup_mapfile,
            print_opt=print_cmds)

    return (path_abun_unstrat, path_cov_unstrat, path_abun_strat,
            path_cov_strat)
コード例 #9
0
def run_metagenome_pipeline(input_seqabun,
                            function,
                            max_nsti,
                            marker=None,
                            min_reads=1,
                            min_samples=1,
                            strat_out=False,
                            wide_table=False,
                            skip_norm=False,
                            out_dir='metagenome_out'):
    '''Main function to run the full metagenome pipeline.

    Exactly one of a marker table or skip_norm must be given, otherwise the
    function exits via sys.exit. Sequence abundances are normalized by the
    predicted marker copy numbers unless skip_norm is set, and the
    normalized-table and weighted-NSTI filenames are placed in out_dir.

    Returns (None, unstratified table) when strat_out is False, (metagenome
    contributions, unstratified table) when strat_out is set without
    wide_table, and the result of strat_funcs_by_samples when both strat_out
    and wide_table are set.'''

    # A marker table and --skip_norm are mutually exclusive, and one of the
    # two is required.
    if not (marker or skip_norm):
        sys.exit("Table of predicted marker gene copy numbers is required "
                 "unless --skip_norm is specified.")
    if marker and skip_norm:
        sys.exit("Table of predicted marker gene copy numbers should not be "
                 "specified when --skip_norm option is set.")

    make_output_dir(out_dir)

    # Stays empty unless one of the input tables carries an NSTI column.
    nsti_val = pd.DataFrame()

    study_seq_counts = read_seqabun(input_seqabun)

    pred_function = pd.read_csv(function, sep="\t", dtype={'sequence': str})
    pred_function.set_index('sequence', drop=True, inplace=True)

    # If NSTI column present then remove all rows with value above specified
    # max value. Also, remove NSTI column (in both dataframes).
    if 'metadata_NSTI' in pred_function.columns:
        pred_function, nsti_val = drop_tips_by_nsti(tab=pred_function,
                                                    nsti_col='metadata_NSTI',
                                                    max_nsti=max_nsti)

    if skip_norm:
        # No marker table: just align the two inputs on their shared ids.
        shared_ids = pred_function.index.intersection(
            study_seq_counts.index).sort_values()

        if len(shared_ids) == 0:
            sys.exit("No sequence ids overlap between both input files.")

        pred_function = pred_function.reindex(shared_ids)
        study_seq_counts = study_seq_counts.reindex(shared_ids)
    else:
        check_files_exist([marker])
        pred_marker = pd.read_csv(marker, sep="\t", dtype={'sequence': str})
        pred_marker.set_index('sequence', drop=True, inplace=True)

        if 'metadata_NSTI' in pred_marker.columns:
            pred_marker, nsti_val = drop_tips_by_nsti(tab=pred_marker,
                                                      nsti_col='metadata_NSTI',
                                                      max_nsti=max_nsti)

        # Re-order predicted abundance tables to be in same order as study
        # seqs. Also, drop any sequence ids that don't overlap across all
        # dataframes.
        study_seq_counts, pred_function, pred_marker = \
            three_df_index_overlap_sort(study_seq_counts, pred_function,
                                        pred_marker)

        # Normalize input study sequence abundances by predicted abundance of
        # marker genes and output normalized table.
        study_seq_counts = norm_by_marker_copies(
            input_seq_counts=study_seq_counts,
            input_marker_num=pred_marker,
            norm_filename=path.join(out_dir, "seqtab_norm.tsv.gz"))

    # If NSTI column input then output weighted NSTI values.
    if not nsti_val.empty:
        calc_weighted_nsti(seq_counts=study_seq_counts,
                           nsti_input=nsti_val,
                           outfile=path.join(out_dir, "weighted_nsti.tsv.gz"))

    # Unstratified-only output needs no "RARE" bookkeeping.
    if not strat_out:
        return (None,
                unstrat_funcs_only_by_samples(pred_function, study_seq_counts))

    # Determine which sequences should be in the "RARE" category; only
    # computed when one of the cut-offs differs from its default of 1.
    rare_seqs = []
    if min_reads != 1 or min_samples != 1:
        rare_seqs = id_rare_seqs(in_counts=study_seq_counts,
                                 min_reads=min_reads,
                                 min_samples=min_samples)

    if wide_table:
        return strat_funcs_by_samples(pred_function, study_seq_counts,
                                      rare_seqs)

    return (metagenome_contributions(pred_function, study_seq_counts,
                                     rare_seqs),
            unstrat_funcs_only_by_samples(pred_function, study_seq_counts))
コード例 #10
0
def run_metagenome_pipeline(input_seqabun,
                            function,
                            marker,
                            max_nsti,
                            min_reads=1,
                            min_samples=1,
                            metagenome_contrib=False,
                            strat_out=False,
                            out_dir='metagenome_out'):
    '''Main function to run the full metagenome pipeline.

    Reads the sequence abundance, predicted function and predicted marker
    tables, filters by NSTI when that column is present, normalizes the
    sequence abundances by marker copy number and, when metagenome_contrib
    is set, writes "metagenome_contrib.tsv.gz" into out_dir.

    Returns the result of strat_funcs_by_samples when strat_out is set,
    otherwise (None, unstratified table).'''

    # Read in input table of sequence abundances.
    study_seq_counts = read_seqabun(input_seqabun)

    # Read in predicted function and marker gene abundances.
    pred_function = pd.read_csv(function, sep="\t", index_col="sequence")
    pred_marker = pd.read_csv(marker, sep="\t", index_col="sequence")

    # Force sequence ids to strings in both tables.
    pred_function.index = pred_function.index.astype(str)
    pred_marker.index = pred_marker.index.astype(str)

    # Stays empty unless one of the tables carries an NSTI column.
    nsti_val = pd.DataFrame()

    # If NSTI column present then remove all rows with value above specified
    # max value. Also, remove NSTI column (in both dataframes).
    nsti_col = 'metadata_NSTI'

    if nsti_col in pred_function.columns:
        pred_function, nsti_val = drop_tips_by_nsti(tab=pred_function,
                                                    nsti_col=nsti_col,
                                                    max_nsti=max_nsti)

    if nsti_col in pred_marker.columns:
        pred_marker, nsti_val = drop_tips_by_nsti(tab=pred_marker,
                                                  nsti_col=nsti_col,
                                                  max_nsti=max_nsti)

    # Re-order predicted abundance tables to be in same order as study seqs.
    # Also, drop any sequence ids that don't overlap across all dataframes.
    study_seq_counts, pred_function, pred_marker = three_df_index_overlap_sort(
        study_seq_counts, pred_function, pred_marker)

    # Create output directory if it does not already exist.
    make_output_dir(out_dir)

    # Normalize input study sequence abundances by predicted abundance of
    # marker genes and output normalized table.
    study_seq_counts = norm_by_marker_copies(
        input_seq_counts=study_seq_counts,
        input_marker_num=pred_marker,
        norm_filename=path.join(out_dir, "seqtab_norm.tsv.gz"))

    # If NSTI column input then output weighted NSTI values.
    if not nsti_val.empty:
        calc_weighted_nsti(seq_counts=study_seq_counts,
                           nsti_input=nsti_val,
                           outfile=path.join(out_dir, "weighted_nsti.tsv.gz"))

    # Determine which sequences should be in the "RARE" category if either
    # the metagenome contributions table or stratified table is requested.
    if metagenome_contrib or strat_out:
        rare_seqs = []

        if min_reads != 1 or min_samples != 1:
            rare_seqs = id_rare_seqs(in_counts=study_seq_counts,
                                     min_reads=min_reads,
                                     min_samples=min_samples)

    # Output metagenome contributions table if specified.
    if metagenome_contrib:
        contrib_table = metagenome_contributions(func_abun=pred_function,
                                                 sample_abun=study_seq_counts,
                                                 rare_seqs=rare_seqs)

        contrib_table.to_csv(
            path_or_buf=path.join(out_dir, "metagenome_contrib.tsv.gz"),
            sep="\t",
            index=False,
            compression="gzip")

    # Generate tables of functions by sample and return (either stratified or
    # not).
    if not strat_out:
        return (None,
                unstrat_funcs_only_by_samples(pred_function, study_seq_counts))

    return strat_funcs_by_samples(pred_function, study_seq_counts, rare_seqs)
コード例 #11
0
ファイル: run_minpath.py プロジェクト: misazaa/picrust2
def minpath_wrapper(sample_id,
                    unstrat_input,
                    minpath_map,
                    out_dir,
                    print_opt=False,
                    extra_str=""):
    '''Run MinPath based on gene abundances in a single sample.

    Writes the non-zero reaction counts of column sample_id of unstrat_input
    to a MinPath input file under "<out_dir>/minpath_running", runs
    MinPath12hmp.py with minpath_map as its mapping file, and captures the
    command's stdout in "<out_dir>/<sample_id>_minpath_out.txt".

    Returns a tuple of (1) the result of identify_minpath_present on the
    MinPath report and (2) a defaultdict mapping each reaction id with a
    non-zero count to that count.'''

    # Make output directory for MinPath intermediate files.
    minpath_dir = path.join(out_dir, "minpath_running")
    make_output_dir(minpath_dir)

    # Define MinPath input and output filenames.
    file_prefix = sample_id + extra_str

    minpath_in = path.join(minpath_dir, file_prefix + "_minpath_in.txt")

    minpath_report = path.join(minpath_dir,
                               file_prefix + "_minpath_report.txt")

    minpath_details = path.join(minpath_dir,
                                file_prefix + "_minpath_details.txt")

    minpath_mps = path.join(minpath_dir, file_prefix + "_minpath.mps")

    # NOTE(review): unlike the files above, this name omits extra_str, so
    # calls that differ only in extra_str share one stdout file. Left
    # unchanged because other code may rely on the existing filename.
    minpath_stdout_file = path.join(out_dir, sample_id + "_minpath_out.txt")

    # Initialize dictionary for keeping track of reaction abundances.
    reaction_abun = defaultdict(int)

    # Loop over all reactions (which are the index labels in unstrat table
    # unless regrouped). The context manager guarantees the input file is
    # flushed and closed before MinPath reads it, even if a lookup raises.
    with open(minpath_in, "w") as id_minpath_fh:
        for reaction_id in unstrat_input.index.values:
            # Get count of each reaction in sample and write that reaction
            # out along with count if non-zero abundance.
            reaction_count = unstrat_input.loc[reaction_id, sample_id]

            # If 0 then skip.
            if reaction_count == 0:
                continue

            id_minpath_fh.write(reaction_id + "\t" + str(reaction_count) +
                                "\n")

            reaction_abun[reaction_id] = reaction_count

    # Run MinPath on this sample.
    path2minpath = path.join(get_picrust_project_dir(), 'picrust2', 'MinPath',
                             'MinPath12hmp.py')

    minpath_cmd = path2minpath + " -any " + minpath_in + " -map " +\
                  minpath_map + " -report " + minpath_report +\
                  " -details " + minpath_details + " -mps " + minpath_mps

    # Previously this handle was opened and never closed; scope it to the
    # command so it is released as soon as MinPath finishes (or fails).
    with open(minpath_stdout_file, "w") as minpath_output:
        system_call_check(minpath_cmd, print_out=print_opt,
                          stdout=minpath_output)

    # Read through MinPath report and keep track of pathways identified
    # to be present.
    path_present = identify_minpath_present(minpath_report)

    # Return list of which pathways are present and the abundances of all gene
    # families.
    return (path_present, reaction_abun)
コード例 #12
0
def full_pipeline(study_fasta, input_table, output_folder, threads, ref_msa,
                  tree, hmm, in_traits, custom_trait_tables, marker_gene_table,
                  pathway_map, no_pathways, regroup_map, no_regroup,
                  stratified, alignment_tool, max_nsti, min_reads, min_samples,
                  hsp_method, calculate_NSTI, confidence, seed, no_gap_fill,
                  per_sequence_contrib, no_descrip, verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    The steps run, in order, are: sequence placement, hidden-state prediction
    for every function table (plus the marker table), the metagenome pipeline
    for every function and, unless no_pathways is set, pathway inference.

    Returns a tuple (func_output, pathway_outfiles). func_output maps each
    function id to the stratified result of metagenome_pipeline_steps when
    stratified is set and to the unstratified result otherwise.
    pathway_outfiles is None when no_pathways is set; otherwise it is a
    dictionary with the keys "unstrat_abun", "unstrat_cov", "strat_abun" and
    "strat_cov", where the stratified entries are None if the corresponding
    table was not returned by run_minpath_pipeline.

    Calls sys.exit if output_folder already exists or if a category in
    in_traits is not one of the default categories.'''

    # Check that input files exist.
    check_files_exist([study_fasta, input_table])

    if path.exists(output_folder):
        sys.exit("Stopping - output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the marker table is added below and that entry
        # must not leak into the shared module-level default_tables mapping.
        func_tables = dict(default_tables)

    else:

        # Description columns are never added when custom tables are used.
        no_descrip = True

        funcs = []
        func_tables = {}

        # The basename (minus its last extension) of each custom table path
        # is used as the function id.
        for custom in custom_trait_tables.split(","):

            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table listed is the one pathways are inferred from.
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = set(['emp_prob', 'mp'])

    ci_setting = bool(confidence) and hsp_method in discrete_set

    gap_fill_opt = not no_gap_fill

    if verbose:
        print("Placing sequences onto reference tree.")

    # Define folders for intermediate files.
    intermediate_dir = path.join(output_folder, "intermediate")
    place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
    make_output_dir(intermediate_dir)
    make_output_dir(place_seqs_intermediate)

    place_seqs_pipeline(study_fasta=study_fasta,
                        ref_msa=ref_msa,
                        tree=tree,
                        hmm=hmm,
                        out_tree=out_tree,
                        alignment_tool=alignment_tool,
                        threads=threads,
                        out_dir=place_seqs_intermediate,
                        chunk_size=5000,
                        print_cmds=verbose)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        count_outfile = hsp_pipeline_steps(func=func,
                                           calculate_NSTI=calculate_NSTI,
                                           out_tree=out_tree,
                                           func_table_in=func_tables[func],
                                           hsp_method=hsp_method,
                                           ci_setting=ci_setting,
                                           threads=threads,
                                           seed=seed,
                                           output_folder=output_folder,
                                           verbose=verbose)

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

    marker_infile = predicted_funcs["marker"]

    # Initialize dictionary of function names to metagenome output to return.
    # Each value is the stratified output if stratified is set and the
    # unstratified output otherwise.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker predictions are only used for normalization.
        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func)

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        func_map = None

        if func in default_map:
            func_map = default_map[func]

        func_strat_out, func_unstrat_out = metagenome_pipeline_steps(
            input_table=input_table,
            func_infile=func_infile,
            marker_infile=marker_infile,
            func_output_dir=func_output_dir,
            no_descrip=no_descrip,
            max_nsti=max_nsti,
            min_reads=min_reads,
            min_samples=min_samples,
            stratified=stratified,
            threads=threads,
            func_map=func_map,
            verbose=verbose)

        if stratified:
            func_output[func] = func_strat_out
        else:
            func_output[func] = func_unstrat_out

    pathway_outfiles = None

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not no_pathways:

        pathways_intermediate = path.join(intermediate_dir, "pathways")
        make_output_dir(pathways_intermediate)

        if verbose:
            print("Inferring pathways from predicted " + rxn_func)

        predicted_rxn = func_output[rxn_func]

        # Set regrouping mapfile to be empty if no_regroup set.
        if no_regroup:
            regroup_map = None

        unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
            inputfile=predicted_rxn,
            mapfile=pathway_map,
            regroup_mapfile=regroup_map,
            proc=threads,
            out_dir=pathways_intermediate,
            gap_fill=gap_fill_opt,
            per_sequence_contrib=per_sequence_contrib,
            print_cmds=verbose)

        pathways_out = path.join(output_folder, "pathways_out")

        # Turn the pathway index into a regular "pathway" column so that the
        # tables can be written with index=False below.
        unstrat_abun.index.name = 'pathway'
        unstrat_cov.index.name = 'pathway'
        unstrat_abun.reset_index(inplace=True)
        unstrat_cov.reset_index(inplace=True)

        pathway_outfiles = {}

        if not no_descrip:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        if verbose:
            print("Writing predicted pathway abundances and coverages to " +
                  pathways_out)

        make_output_dir(pathways_out)

        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile,
                            sep="\t",
                            index=False)
        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile,
                           sep="\t",
                           index=False)

        pathway_outfiles["unstrat_abun"] = unstrat_abun_outfile
        pathway_outfiles["unstrat_cov"] = unstrat_cov_outfile

        strat_abun_outfile = None
        strat_cov_outfile = None

        # Write stratified output only if something besides None was returned.
        if strat_abun is not None:

            if not no_descrip:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)
            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")
            strat_abun.to_csv(path_or_buf=strat_abun_outfile,
                              sep="\t",
                              index=False)

        if strat_cov is not None:

            if not no_descrip:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")
            strat_cov.to_csv(path_or_buf=strat_cov_outfile,
                             sep="\t",
                             index=False)

        pathway_outfiles["strat_abun"] = strat_abun_outfile
        pathway_outfiles["strat_cov"] = strat_cov_outfile

    return (func_output, pathway_outfiles)
コード例 #13
0
def main():
    '''Command-line entry point: run the pathway pipeline on the parsed
    arguments and write the resulting tables into args.out_dir as gzipped
    tab-delimited files.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    def run_pipeline(work_dir):
        '''Run pathway_pipeline with intermediate files kept in work_dir.'''
        return pathway_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=args.regroup_map,
            proc=args.processes,
            out_dir=work_dir,
            run_minpath=not args.skip_minpath,
            coverage=args.coverage,
            gap_fill_on=not args.no_gap_fill,
            no_regroup=args.no_regroup,
            per_sequence_contrib=args.per_sequence_contrib,
            per_sequence_abun=args.per_sequence_abun,
            per_sequence_function=args.per_sequence_function,
            wide_table=args.wide_table,
            print_cmds=args.print_cmds)

    def write_table(table, basename, **to_csv_opts):
        '''Write table to basename inside args.out_dir as a gzipped TSV.'''
        table.to_csv(path_or_buf=path.join(args.out_dir, basename),
                     sep="\t",
                     compression="gzip",
                     **to_csv_opts)

    # Keep intermediate files in the requested directory if one was given;
    # otherwise use a temporary directory that is removed afterwards.
    if args.intermediate:
        make_output_dir(args.intermediate)
        results = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            results = run_pipeline(temp_dir)

    (unstrat_abun, unstrat_cov, strat_abun, strat_cov,
     path_abun_by_seq, path_cov_by_seq, unstrat_abun_per_seq) = results

    make_output_dir(args.out_dir)

    # The unstratified abundance table is always written; every other table
    # is written only when it applies to this run.
    write_table(unstrat_abun, "path_abun_unstrat.tsv.gz",
                index_label="pathway")

    if args.coverage:
        write_table(unstrat_cov, "path_cov_unstrat.tsv.gz",
                    index_label="pathway")

    # Stratified tables are named "strat" in wide format and "contrib"
    # otherwise.
    if strat_abun is not None:
        if args.wide_table:
            strat_abun_name = "path_abun_strat.tsv.gz"
        else:
            strat_abun_name = "path_abun_contrib.tsv.gz"

        write_table(strat_abun, strat_abun_name, index=False)

    if args.coverage and strat_cov is not None:
        if args.wide_table:
            strat_cov_name = "path_cov_strat.tsv.gz"
        else:
            strat_cov_name = "path_cov_contrib.tsv.gz"

        write_table(strat_cov, strat_cov_name, index=False)

    if path_abun_by_seq is not None:
        write_table(path_abun_by_seq, "path_abun_predictions.tsv.gz",
                    index=True,
                    index_label="sequence")

    if args.coverage and path_cov_by_seq is not None:
        write_table(path_cov_by_seq, "path_cov_predictions.tsv.gz",
                    index=True,
                    index_label="sequence")

    if unstrat_abun_per_seq is not None:
        write_table(unstrat_abun_per_seq, "path_abun_unstrat_per_seq.tsv.gz",
                    index_label="pathway")
def full_pipeline(study_fasta, input_table, output_folder, processes, ref_dir,
                  in_traits, custom_trait_tables, marker_gene_table,
                  pathway_map, rxn_func, no_pathways, regroup_map, no_regroup,
                  stratified, max_nsti, min_reads, min_samples, hsp_method,
                  min_align, skip_nsti, skip_minpath, no_gap_fill, coverage,
                  per_sequence_contrib, wide_table, skip_norm,
                  remove_intermediate, verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    The steps are run as separate commands (place_seqs.py, hsp.py,
    metagenome_pipeline.py and, unless no_pathways is set,
    pathway_pipeline.py) through system_call_check.

    Returns a tuple (func_output, pathway_outfiles). func_output maps each
    function id to a two-element list: the expected unstratified metagenome
    output path and the expected stratified output path (None unless
    stratified is set). pathway_outfiles is None when no_pathways is set;
    otherwise it is a dictionary of expected pathway output paths with the
    keys "unstrat_abun", "unstrat_cov", "strat_abun" and "strat_cov" (the
    stratified entries may be None).

    Calls sys.exit if output_folder already exists or if a category in
    in_traits is not one of the default categories.'''

    # Throw warning if --per_sequence_contrib set but --stratified unset.
    if per_sequence_contrib and not stratified:
        print(
            "\nThe option --per_sequence_contrib was set, but not the option "
            "--stratified. This means that a stratified pathway table will "
            "be output only (i.e. a stratified metagenome table will NOT "
            "be output).\n",
            file=sys.stderr)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func + " is not " +
                         "one of the default categories.")

        # Work on a copy: entries are added below and must not leak into the
        # shared module-level default_tables mapping.
        func_tables = dict(default_tables)

    else:
        # Split paths to input custom trait tables and take the basename to be
        # the function id.
        funcs = []
        func_tables = {}

        for custom in custom_trait_tables.split(","):

            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

    # Add reaction function to be in set of gene families if it is not already
    # and as long as pathways are also to be predicted.
    if rxn_func not in funcs and not no_pathways:
        orig_rxn_func = rxn_func
        rxn_func = path.splitext(path.basename(rxn_func))[0]

        # rxn_func may have been given as a path whose basename is already a
        # listed function; do not run that function twice.
        if rxn_func not in funcs:
            funcs.append(rxn_func)

        if rxn_func not in func_tables:
            func_tables[rxn_func] = orig_rxn_func

    if not skip_norm:
        # Append marker as well, since this also needs to be run.
        funcs.append("marker")
        func_tables["marker"] = marker_gene_table

    # Check that all input files exist.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)
    files2check = [study_fasta, input_table, ref_msa, tree, hmm, model] + list(
        func_tables.values())

    if not no_pathways:
        files2check.append(pathway_map)

        # Throw warning if default pathway mapfile used with non-default
        # reference files.
        if pathway_map == default_pathway_map and ref_dir != default_ref_dir:
            print(
                "Warning - non-default reference files specified with "
                "default pathway mapfile of prokaryote-specific MetaCyc "
                "pathways (--pathway_map option). This usage may be "
                "unintended.",
                file=sys.stderr)

        if not no_regroup:
            files2check.append(regroup_map)

    # This will throw an error if any input files are not found.
    check_files_exist(files2check)

    # Check that sequence names in FASTA overlap with input table.
    check_overlapping_seqs(study_fasta, input_table, verbose)

    if path.exists(output_folder):
        sys.exit("Stopping since output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    if verbose:
        print("Placing sequences onto reference tree", file=sys.stderr)

    # Define folders for intermediate files (unless --remove_intermediate set).
    if remove_intermediate:
        place_seqs_intermediate = ""
        pathways_intermediate = ""
    else:
        intermediate_dir = path.join(output_folder, "intermediate")
        make_output_dir(intermediate_dir)
        place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
        pathways_intermediate = path.join(intermediate_dir, "pathways")

    # Run place_seqs.py.
    place_seqs_cmd = [
        "place_seqs.py", "--study_fasta", study_fasta, "--ref_dir", ref_dir,
        "--out_tree", out_tree, "--processes",
        str(processes), "--intermediate", place_seqs_intermediate,
        "--min_align",
        str(min_align), "--chunk_size",
        str(5000)
    ]

    if verbose:
        place_seqs_cmd.append("--verbose")

    system_call_check(place_seqs_cmd,
                      print_command=verbose,
                      print_stdout=verbose,
                      print_stderr=True)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree,
              file=sys.stderr)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    if not skip_norm:
        # Make sure marker database is first in the list. This is because this will
        # be run on a single core and so will be easier to identify any errors
        # if the program exits when working on this function type.
        funcs.insert(0, funcs.pop(funcs.index("marker")))

    for func in funcs:
        # NSTI is calculated with the marker table, or with every table when
        # normalization (and therefore the marker table) is skipped.
        with_nsti = not skip_nsti and (func == "marker" or skip_norm)

        # Change output filename for NSTI and non-NSTI containing files.
        hsp_outfile = path.join(output_folder, func + "_predicted")

        if with_nsti:
            hsp_outfile = hsp_outfile + "_and_nsti.tsv.gz"
        else:
            hsp_outfile = hsp_outfile + ".tsv.gz"

        # Keep track of output filename for next step of pipeline.
        predicted_funcs[func] = hsp_outfile

        # Run hsp.py for each function database.
        hsp_cmd = [
            "hsp.py", "--tree", out_tree, "--output", hsp_outfile,
            "--observed_trait_table", func_tables[func], "--hsp_method",
            hsp_method, "--seed", "100"
        ]

        # Add flags to command if specified.
        if with_nsti:
            hsp_cmd.append("--calculate_NSTI")

        # Run marker on only 1 processor.
        if func == "marker":
            hsp_cmd += ["--processes", "1"]
        else:
            hsp_cmd += ["--processes", str(processes)]

        if verbose:
            hsp_cmd.append("--verbose")

        system_call_check(hsp_cmd,
                          print_command=verbose,
                          print_stdout=verbose,
                          print_stderr=True)

    # Now run metagenome pipeline commands.
    # Initialize dictionary of function names --> metagenome output files.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker predictions are only passed in via --marker below.
        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func, file=sys.stderr)

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        metagenome_pipeline_cmd = [
            "metagenome_pipeline.py", "--input", input_table, "--function",
            predicted_funcs[func], "--min_reads",
            str(min_reads), "--min_samples",
            str(min_samples), "--out_dir", func_output_dir
        ]

        # Initialize two-element list as value for each function.
        # First value will be unstratified output and second will be
        # stratified output.
        func_output[func] = [None, None]

        func_output[func][0] = path.join(func_output_dir,
                                         "pred_metagenome_unstrat.tsv.gz")

        if wide_table:
            metagenome_pipeline_cmd.append("--wide_table")

        if not skip_nsti:
            metagenome_pipeline_cmd += ["--max_nsti", str(max_nsti)]

        if skip_norm:
            metagenome_pipeline_cmd.append("--skip_norm")
        else:
            metagenome_pipeline_cmd += ["--marker", predicted_funcs["marker"]]

        if stratified:
            metagenome_pipeline_cmd.append("--strat_out")

            if wide_table:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_strat.tsv.gz")
            else:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_contrib.tsv.gz")

        system_call_check(metagenome_pipeline_cmd,
                          print_command=verbose,
                          print_stdout=verbose,
                          print_stderr=True)

    # Now infer pathway abundances and coverages unless --no_pathways set.
    pathway_outfiles = None

    if not no_pathways:

        path_output_dir = path.join(output_folder, "pathways_out")

        if verbose:
            # Sent to stderr like every other progress message here.
            print("Inferring pathways from predicted " + rxn_func,
                  file=sys.stderr)

        # Determine whether stratified or unstratified table should be input.
        if not stratified or per_sequence_contrib:
            rxn_input_metagenome = func_output[rxn_func][0]
        else:
            rxn_input_metagenome = func_output[rxn_func][1]

        pathway_pipeline_cmd = [
            "pathway_pipeline.py", "--input", rxn_input_metagenome,
            "--out_dir", path_output_dir, "--map", pathway_map,
            "--intermediate", pathways_intermediate, "--proc",
            str(processes)
        ]

        if no_gap_fill:
            pathway_pipeline_cmd.append("--no_gap_fill")

        if skip_minpath:
            pathway_pipeline_cmd.append("--skip_minpath")

        if coverage:
            pathway_pipeline_cmd.append("--coverage")

        if no_regroup:
            pathway_pipeline_cmd.append("--no_regroup")
        else:
            pathway_pipeline_cmd += ["--regroup_map", regroup_map]

        if wide_table:
            pathway_pipeline_cmd.append("--wide_table")

        if per_sequence_contrib:
            pathway_pipeline_cmd.append("--per_sequence_contrib")

            # Without normalization the raw input table is used as the
            # per-sequence abundance table.
            if skip_norm:
                norm_sequence_abun = input_table
            else:
                norm_sequence_abun = path.join(output_folder,
                                               rxn_func + "_metagenome_out",
                                               "seqtab_norm.tsv.gz")

            pathway_pipeline_cmd += ["--per_sequence_abun", norm_sequence_abun]

            pathway_pipeline_cmd += [
                "--per_sequence_function", predicted_funcs[rxn_func]
            ]

        if verbose:
            pathway_pipeline_cmd.append("--verbose")

        system_call_check(pathway_pipeline_cmd,
                          print_command=verbose,
                          print_stdout=False,
                          print_stderr=True)

        if verbose:
            print("Wrote predicted pathway abundances and coverages to " +
                  path_output_dir,
                  file=sys.stderr)

        # Keep track of output filenames if this function is being used in
        # a non-default way (e.g. with a QIIME2 plugin).
        pathway_outfiles = {}

        pathway_outfiles["unstrat_abun"] = path.join(
            path_output_dir, "path_abun_unstrat.tsv.gz")
        pathway_outfiles["unstrat_cov"] = path.join(path_output_dir,
                                                    "path_cov_unstrat.tsv.gz")

        pathway_outfiles["strat_abun"] = None
        pathway_outfiles["strat_cov"] = None

        if stratified or per_sequence_contrib:
            if wide_table:
                pathway_outfiles["strat_abun"] = path.join(
                    path_output_dir, "path_abun_strat.tsv.gz")

                if per_sequence_contrib:
                    pathway_outfiles["strat_cov"] = path.join(
                        path_output_dir, "path_cov_strat.tsv.gz")

            else:
                pathway_outfiles["strat_abun"] = path.join(
                    path_output_dir, "path_abun_contrib.tsv.gz")
                if per_sequence_contrib:
                    pathway_outfiles["strat_cov"] = path.join(
                        path_output_dir, "path_cov_contrib.tsv.gz")

    return (func_output, pathway_outfiles)
コード例 #15
0
def run_metagenome_pipeline(input_biom,
                            function,
                            marker,
                            max_nsti,
                            out_dir='metagenome_out',
                            proc=1,
                            output_normfile=False):
    '''Main function to run full metagenome pipeline. Meant to run modular
    functions largely listed below. Will return predicted metagenomes
    stratified and unstratified by contributing genomes (i.e. taxa).

    input_biom is loaded with biom.load_table; function and marker are
    tab-delimited tables with a "sequence" column that is used as the index.
    Rows whose "metadata_NSTI" value (if that column is present) is above
    max_nsti are dropped. out_dir is created if needed and receives
    "weighted_nsti.tsv" when NSTI values were found and, if output_normfile
    is set, "seqtab_norm.tsv". proc is passed on to funcs_by_sample.

    Returns whatever funcs_by_sample returns for the normalized sequence
    counts and the predicted function table.'''

    # Read in input table of sequence abundances and convert to pandas df.
    study_seq_counts = biom_to_pandas_df(biom.load_table(input_biom))

    # Read in predicted function and marker gene abundances.
    pred_function = pd.read_table(function, sep="\t", index_col="sequence")
    pred_marker = pd.read_table(marker, sep="\t", index_col="sequence")

    pred_function.index = pred_function.index.astype(str)
    pred_marker.index = pred_marker.index.astype(str)

    # Initialize empty pandas dataframe to contain NSTI values.
    nsti_val = pd.DataFrame()

    # If NSTI column present then remove all rows with value above specified
    # max value. Also, remove NSTI column (in both dataframes). The column is
    # dropped by reassignment rather than in place, since an in-place drop on
    # a filtered frame triggers pandas' SettingWithCopyWarning.
    if "metadata_NSTI" in pred_function.columns:

        pred_function = pred_function[pred_function['metadata_NSTI'] <= max_nsti]
        nsti_val = pred_function[['metadata_NSTI']]
        pred_function = pred_function.drop('metadata_NSTI', axis=1)

    # If both tables carry NSTI values, the ones from the marker table win.
    if "metadata_NSTI" in pred_marker.columns:

        pred_marker = pred_marker[pred_marker['metadata_NSTI'] <= max_nsti]
        nsti_val = pred_marker[['metadata_NSTI']]
        pred_marker = pred_marker.drop('metadata_NSTI', axis=1)

    # Re-order predicted abundance tables to be in same order as study seqs.
    # Also, drop any sequence ids that don't overlap across all dataframes.
    study_seq_counts, pred_function, pred_marker = three_df_index_overlap_sort(
        study_seq_counts, pred_function, pred_marker)

    # Create output directory if it does not already exist.
    make_output_dir(out_dir)

    # Create normalized sequence abundance filename if outfile specified.
    if output_normfile:
        norm_output = path.join(out_dir, "seqtab_norm.tsv")
    else:
        norm_output = None

    # Normalize input study sequence abundances by predicted abundance of
    # marker genes and output normalized table if specified.
    study_seq_counts = norm_by_marker_copies(input_seq_counts=study_seq_counts,
                                             input_marker_num=pred_marker,
                                             norm_filename=norm_output)

    # If NSTI column input then output weighted NSTI values. The call is made
    # for the file it writes; its return value is not needed here.
    if not nsti_val.empty:
        weighted_nsti_out = path.join(out_dir, "weighted_nsti.tsv")
        calc_weighted_nsti(seq_counts=study_seq_counts,
                           nsti_input=nsti_val,
                           outfile=weighted_nsti_out)

    # Get predicted function counts by sample, stratified by contributing
    # genomes and also separately unstratified.
    return funcs_by_sample(input_seq_counts=study_seq_counts,
                           input_function_num=pred_function,
                           proc=proc)
コード例 #16
0
ファイル: run_minpath.py プロジェクト: misazaa/picrust2
def main():
    '''Command-line entry point: run the MinPath pipeline on the parsed
    arguments and write the resulting pathway tables into args.out_dir as
    tab-delimited files.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    # No regrouping mapfile is passed on when the no-regroup flag is set.
    regroup_mapfile = None if args.no_regroup else args.regroup_map

    def run_pipeline(work_dir):
        '''Run run_minpath_pipeline with intermediate files in work_dir.'''
        return run_minpath_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=regroup_mapfile,
            proc=args.proc,
            out_dir=work_dir,
            gap_fill=not args.no_gap_fill,
            per_sequence_contrib=args.per_sequence_contrib,
            print_cmds=args.print_cmds)

    # Keep intermediate files in the requested directory if one was given;
    # otherwise use a temporary directory that is removed afterwards.
    if args.intermediate:
        make_output_dir(args.intermediate)
        results = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            results = run_pipeline(temp_dir)

    unstrat_abun, unstrat_cov, strat_abun, strat_cov = results

    make_output_dir(args.out_dir)

    # Unstratified tables are always written, with the index labelled
    # "pathway".
    for table, basename in ((unstrat_abun, "path_abun_unstrat.tsv"),
                            (unstrat_cov, "path_cov_unstrat.tsv")):
        table.to_csv(path_or_buf=path.join(args.out_dir, basename),
                     sep="\t",
                     index_label="pathway")

    # Stratified tables are written without the index, and only if something
    # besides None was returned.
    for table, basename in ((strat_abun, "path_abun_strat.tsv"),
                            (strat_cov, "path_cov_strat.tsv")):
        if table is None:
            continue

        table.to_csv(path_or_buf=path.join(args.out_dir, basename),
                     sep="\t",
                     index=False)
コード例 #17
0
ファイル: picrust2_pipeline.py プロジェクト: misazaa/picrust2
def main():
    """Run the full PICRUSt2 pipeline from the parsed command-line options.

    The stages run in this order:

    1. Place the study sequences onto the reference tree; the resulting tree
       is written to "out.tre" inside the output folder.
    2. Run hidden-state prediction for every requested function table and
       for the marker gene table, writing one predicted table per function
       (plus a "_predicted_ci.tsv" table when CIs are enabled).
    3. Run the metagenome pipeline for every function except the marker,
       writing the tables into "<func>_metagenome_out".
    4. Unless args.no_pathways is set, infer pathway abundances and
       coverages and write them into "pathways_out".

    Exits through sys.exit() when a category in args.in_traits is not one of
    the default categories. Prints the elapsed time when finished.
    """

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    if args.custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the "marker" entry added below must not leak into
        # the shared default_tables mapping.
        func_tables = dict(default_tables)

    else:
        funcs = []
        func_tables = {}

        # Each custom table is identified by its file name minus extension.
        for custom in args.custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table is the one used for pathway inference.
        # str.split always yields at least one element, so this is safe.
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}

    ci_setting = bool(args.confidence and args.hsp_method in discrete_set)

    gap_fill_opt = not args.no_gap_fill

    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # Only output NSTI in 16S table.
        nsti_setting = bool(func == "marker" and args.calculate_NSTI)

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        # Add "_nsti" to filename if NSTI values are output.
        if nsti_setting:
            count_suffix = "_nsti_predicted.tsv"
        else:
            count_suffix = "_predicted.tsv"

        count_outfile = path.join(args.output, func + count_suffix)

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile,
                         index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile,
                            index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker table is only an input to this step, never a target.
        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Infer metagenome abundances per-sample: pass arguments to key
        # function and get predicted functions stratified and unstratified
        # by genomes. The pipeline is handed func_output_dir directly, so no
        # temporary directory is needed here.
        strat_pred, unstrat_pred = run_metagenome_pipeline(
            input_biom=args.input,
            function=func_infile,
            marker=marker_infile,
            out_dir=func_output_dir,
            max_nsti=args.max_nsti,
            min_reads=args.min_reads,
            min_samples=args.min_samples,
            strat_out=args.stratified,
            proc=args.threads,
            output_normfile=True)

        print("Writing metagenome output files for " + func + " to: " +
              func_output_dir)

        # Generate output table filepaths and write out pandas dataframe.
        unstrat_outfile = path.join(func_output_dir,
                                    "pred_metagenome_unstrat.tsv")

        unstrat_pred.index.name = "function"
        unstrat_pred.reset_index(inplace=True)

        # Descriptions are only available for the default function tables.
        if args.custom_trait_tables is None:
            unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                           mapfile=default_map[func],
                                           in_df=True)

        unstrat_pred.to_csv(path_or_buf=unstrat_outfile,
                            sep="\t",
                            index=False)

        # Write out stratified table only if that option was specified.
        if args.stratified:
            strat_outfile = path.join(func_output_dir,
                                      "pred_metagenome_strat.tsv")
            strat_pred.reset_index(inplace=True)

            if args.custom_trait_tables is None:
                strat_pred = add_descrip_col(inputfile=strat_pred,
                                             mapfile=default_map[func],
                                             in_df=True)

            strat_pred.to_csv(path_or_buf=strat_outfile,
                              sep="\t",
                              index=False)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        # Pathways are inferred from the metagenome table written above for
        # the reaction function (stratified table if that was requested).
        if args.stratified:
            metagenome_table = "pred_metagenome_strat.tsv"
        else:
            metagenome_table = "pred_metagenome_unstrat.tsv"

        in_metagenome = path.join(args.output,
                                  rxn_func + "_metagenome_out",
                                  metagenome_table)

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
                inputfile=in_metagenome,
                mapfile=default_pathway_map,
                regroup_mapfile=default_regroup_map,
                proc=args.threads,
                out_dir=temp_dir,
                gap_fill=gap_fill_opt,
                per_sequence_contrib=args.per_sequence_contrib,
                print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        # Pathway descriptions are only added for the default tables.
        add_descrip = args.custom_trait_tables is None

        # Write output files.
        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_abun.reset_index(inplace=True)

        if add_descrip:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile,
                            sep="\t",
                            index=False)

        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")
        unstrat_cov.reset_index(inplace=True)

        if add_descrip:
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile,
                           sep="\t",
                           index=False)

        # Write stratified output only if something besides None was returned.
        if strat_abun is not None:
            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")

            if add_descrip:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun.to_csv(path_or_buf=strat_abun_outfile,
                              sep="\t",
                              index=False)

        if strat_cov is not None:
            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")

            if add_descrip:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov.to_csv(path_or_buf=strat_cov_outfile,
                             sep="\t",
                             index=False)

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in " + "%.2f" % elapsed_time +
          " seconds.")