コード例 #1
0
def main():
    '''Parse command-line arguments, run the metagenome pipeline and write
    the predicted function tables as tab-separated files in args.out_dir.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.function, args.marker])

    # Gather the pipeline settings in one place, then get the predicted
    # functions stratified and unstratified by genomes.
    pipeline_opts = dict(input_biom=args.input,
                         function=args.function,
                         marker=args.marker,
                         out_dir=args.out_dir,
                         max_nsti=args.max_nsti,
                         min_reads=args.min_reads,
                         min_samples=args.min_samples,
                         strat_out=args.strat_out,
                         proc=args.proc,
                         output_normfile=True)
    strat_pred, unstrat_pred = run_metagenome_pipeline(**pipeline_opts)

    # The unstratified table is always written, with a labelled index column.
    unstrat_pred.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_unstrat.tsv"),
        sep="\t",
        index=True,
        index_label="function")

    # Nothing more to do unless stratified output was requested.
    if not args.strat_out:
        return

    strat_pred.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_strat.tsv"),
        sep="\t",
        index=True)
コード例 #2
0
    def test_full_pipeline_unstrat_tsv(self):
        '''Compare the unstratified table that run_metagenome_pipeline
        returns for tsv input seqtab with the expected table.'''

        with TemporaryDirectory() as work_dir:
            # Only the second (unstratified) return value is checked here.
            _, observed_unstrat = run_metagenome_pipeline(
                input_biom=seqtab_tsv,
                function=func_predict,
                marker=marker_predict,
                max_nsti=2,
                out_dir=work_dir)

        pd.testing.assert_frame_equal(observed_unstrat, exp_unstrat_in)
コード例 #3
0
    def test_full_pipeline_strat_biom(self):
        '''Test that run_metagenome_pipeline returns the expected stratified
        table on biom input seqtab.'''

        with TemporaryDirectory() as temp_dir:
            strat_out, unstrat_out = run_metagenome_pipeline(input_biom=seqtab_biom,
                                                             function=func_predict,
                                                             marker=marker_predict,
                                                             max_nsti=2,
                                                             out_dir=temp_dir)

        # Need to reset index names since these aren't in output files.
        # The positional index is sized from the table itself, so a change in
        # the number of rows is reported by the frame comparison below rather
        # than as a length-mismatch error on this assignment.
        strat_out.index = range(len(strat_out))

        pd.testing.assert_frame_equal(strat_out, exp_strat_in)
コード例 #4
0
def metagenome_pipeline_steps(input_table, func_infile, marker_infile,
                              func_output_dir, no_descrip, max_nsti, min_reads,
                              min_samples, stratified, threads, func_map,
                              verbose):
    '''Steps wrapping metagenome pipeline moved to separate function to
    decrease memory usage.

    Runs run_metagenome_pipeline for a single function table and writes the
    resulting table(s) as tab-separated files in func_output_dir.

    Parameters:
        input_table: passed to run_metagenome_pipeline as input_biom.
        func_infile: passed to run_metagenome_pipeline as function.
        marker_infile: passed to run_metagenome_pipeline as marker.
        func_output_dir: passed as out_dir; also the folder where the
            output tables are written.
        no_descrip: if true, no description column is added.
        max_nsti, min_reads, min_samples: forwarded unchanged.
        stratified: passed as strat_out; the stratified table is only
            written when this is true.
        threads: passed to run_metagenome_pipeline as proc.
        func_map: mapfile given to add_descrip_col; when empty or None no
            description column is added.
        verbose: if true, print a message before writing the output files.

    Returns:
        Tuple (strat_outfile, unstrat_outfile) of output paths, where
        strat_outfile is None when stratified is false.
    '''

    # Infer metagenome abundances per-sample.
    # Pass arguments to key function and get predicted functions
    # stratified and unstratified by genomes.
    strat_pred, unstrat_pred = run_metagenome_pipeline(input_biom=input_table,
                                                       function=func_infile,
                                                       marker=marker_infile,
                                                       out_dir=func_output_dir,
                                                       max_nsti=max_nsti,
                                                       min_reads=min_reads,
                                                       min_samples=min_samples,
                                                       strat_out=stratified,
                                                       proc=threads,
                                                       output_normfile=True)

    # Move the function ids from the index into a regular "function" column.
    unstrat_pred.index.name = "function"
    unstrat_pred.reset_index(inplace=True)

    if not no_descrip and func_map:
        unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                       mapfile=func_map,
                                       in_df=True)

    # Prepare the stratified table only if that option was specified.
    if stratified:
        strat_pred.reset_index(inplace=True)

        if not no_descrip and func_map:
            strat_pred = add_descrip_col(inputfile=strat_pred,
                                         mapfile=func_map,
                                         in_df=True)

    if verbose:
        # No function name is passed to this helper, so the input function
        # table identifies which run is being written.
        print("Writing metagenome output files for " + str(func_infile) +
              " to: " + func_output_dir)

    unstrat_outfile = path.join(func_output_dir, "pred_metagenome_unstrat.tsv")
    unstrat_pred.to_csv(path_or_buf=unstrat_outfile, sep="\t", index=False)

    strat_outfile = None
    if stratified:
        strat_outfile = path.join(func_output_dir, "pred_metagenome_strat.tsv")
        strat_pred.to_csv(path_or_buf=strat_outfile, sep="\t", index=False)

    # Return output filenames.
    return (strat_outfile, unstrat_outfile)
    def test_full_pipeline_unstrat_biom(self):
        '''Check that run_metagenome_pipeline creates the correct
        unstratified output on biom input seqtab.'''

        with TemporaryDirectory() as work_dir:
            # The first return value is not checked in this test.
            _, observed_unstrat = run_metagenome_pipeline(
                input_seqabun=seqtab_biom,
                function=func_predict,
                marker=marker_predict,
                max_nsti=2.1,
                out_dir=work_dir,
                strat_out=False)

        pd.testing.assert_frame_equal(observed_unstrat,
                                      exp_unstrat_in,
                                      check_like=True)
    def test_full_pipeline_unstrat_msf_when_no_strat(self):
        '''Check the unstratified output of run_metagenome_pipeline on
        mothur shared file input seqtab when strat_out=False.'''

        with TemporaryDirectory() as work_dir:
            # The first return value is not checked in this test.
            _, observed_unstrat = run_metagenome_pipeline(
                input_seqabun=seqtab_msf,
                function=func_predict,
                marker=marker_predict,
                max_nsti=2.1,
                out_dir=work_dir,
                strat_out=False)

        pd.testing.assert_frame_equal(observed_unstrat,
                                      exp_unstrat_in,
                                      check_like=True)
    def test_full_pipeline_strat_wide_biom(self):
        '''Check the stratified output of run_metagenome_pipeline on biom
        input seqtab against the expected wide-format table.'''

        with TemporaryDirectory() as work_dir:
            # Only the first (stratified) return value is checked here.
            observed_strat, _ = run_metagenome_pipeline(
                input_seqabun=seqtab_biom,
                function=func_predict,
                marker=marker_predict,
                max_nsti=2.0,
                out_dir=work_dir,
                strat_out=True,
                wide_table=True)

        pd.testing.assert_frame_equal(observed_strat,
                                      exp_strat_wide_in,
                                      check_like=True)
コード例 #8
0
    def test_full_pipeline_strat_tsv_2proc(self):
        '''Check the stratified output of run_metagenome_pipeline on tsv
        input seqtab when proc=2 is requested.'''

        with TemporaryDirectory() as work_dir:
            pipeline_kwargs = dict(input_biom=seqtab_tsv,
                                   function=func_predict,
                                   marker=marker_predict,
                                   max_nsti=2,
                                   out_dir=work_dir,
                                   proc=2,
                                   strat_out=True)

            # Only the first (stratified) return value is checked here.
            observed_strat, _ = run_metagenome_pipeline(**pipeline_kwargs)

        pd.testing.assert_frame_equal(observed_strat,
                                      exp_strat_in,
                                      check_like=True)
    def test_full_pipeline_strat_tsv(self):
        '''Check both tables returned by run_metagenome_pipeline on the
        simple tsv input seqtab with wide_table=False.'''

        with TemporaryDirectory() as work_dir:
            observed_strat, observed_unstrat = run_metagenome_pipeline(
                input_seqabun=seqtab_tsv_simple,
                function=func_simple_in,
                marker=marker_simple_in,
                max_nsti=1.9,
                out_dir=work_dir,
                strat_out=True,
                wide_table=False)

        # Compare the stratified tables on a fresh positional index.
        observed_rows = observed_strat.reset_index(drop=True)
        expected_rows = exp_strat_simple_in.reset_index(drop=True)
        pd.testing.assert_frame_equal(observed_rows,
                                      expected_rows,
                                      check_like=True)

        pd.testing.assert_frame_equal(observed_unstrat,
                                      exp_unstrat_simple_in,
                                      check_like=True)
    def test_full_pipeline_strat_wide_tsv(self):
        '''Check both tables returned by run_metagenome_pipeline on tsv input
        seqtab, comparing the stratified one with the wide-format table.'''

        with TemporaryDirectory() as work_dir:
            observed_strat, observed_unstrat = run_metagenome_pipeline(
                input_seqabun=seqtab_tsv,
                function=func_predict,
                marker=marker_predict,
                max_nsti=1.9,
                out_dir=work_dir,
                strat_out=True,
                wide_table=True)

        # Stratified table first, then the unstratified one.
        pd.testing.assert_frame_equal(observed_strat,
                                      exp_strat_wide_in,
                                      check_like=True)
        pd.testing.assert_frame_equal(observed_unstrat,
                                      exp_unstrat_in,
                                      check_like=True)
    def test_full_pipeline_strat_rare_category_tsv(self):
        '''Check both tables returned by run_metagenome_pipeline on the
        simple tsv input seqtab when rare seqs are collapsed into the RARE
        category (min_reads=10, min_samples=2).'''

        with TemporaryDirectory() as work_dir:
            observed_strat, observed_unstrat = run_metagenome_pipeline(
                input_seqabun=seqtab_tsv_simple,
                function=func_simple_in,
                marker=marker_simple_in,
                max_nsti=2.1,
                min_reads=10,
                min_samples=2,
                out_dir=work_dir,
                strat_out=True,
                wide_table=False)

        # Compare the stratified tables on a fresh positional index.
        observed_rows = observed_strat.reset_index(drop=True)
        expected_rows = exp_strat_simple_rare_in.reset_index(drop=True)
        pd.testing.assert_frame_equal(observed_rows,
                                      expected_rows,
                                      check_like=True)

        pd.testing.assert_frame_equal(observed_unstrat,
                                      exp_unstrat_simple_in,
                                      check_like=True)
コード例 #12
0
def main():
    '''Parse command-line arguments, run the metagenome pipeline and write
    the predicted tables as gzip-compressed, tab-separated files in
    args.out_dir.'''

    args = parser.parse_args()

    check_files_exist([args.input, args.function])

    strat_pred, unstrat_pred = run_metagenome_pipeline(
        input_seqabun=args.input,
        function=args.function,
        max_nsti=args.max_nsti,
        marker=args.marker,
        out_dir=args.out_dir,
        min_reads=args.min_reads,
        min_samples=args.min_samples,
        strat_out=args.strat_out,
        wide_table=args.wide_table,
        skip_norm=args.skip_norm)

    # Settings shared by every table written below.
    shared_opts = {"sep": "\t", "compression": "gzip"}

    # The unstratified table is always written.
    unstrat_pred.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_unstrat.tsv.gz"),
        index=True,
        index_label="function",
        **shared_opts)

    # Nothing more to write unless stratified output was requested.
    if not args.strat_out:
        return

    # The wide-format table keeps its index; the contributional table is
    # written without one, under a different filename.
    if args.wide_table:
        strat_name, keep_index = "pred_metagenome_strat.tsv.gz", True
    else:
        strat_name, keep_index = "pred_metagenome_contrib.tsv.gz", False

    strat_pred.to_csv(path_or_buf=path.join(args.out_dir, strat_name),
                      index=keep_index,
                      **shared_opts)
コード例 #13
0
    def test_full_pipeline_strat_tsv_rare_category(self):
        '''Check both tables returned by run_metagenome_pipeline on tsv input
        seqtab when rare seqs are collapsed into the RARE category
        (min_reads=4, min_samples=2).'''

        with TemporaryDirectory() as work_dir:
            pipeline_kwargs = dict(input_seqabun=seqtab_tsv,
                                   function=func_predict,
                                   marker=marker_predict,
                                   max_nsti=2.1,
                                   min_reads=4,
                                   min_samples=2,
                                   out_dir=work_dir,
                                   strat_out=True)

            observed_strat, observed_unstrat = run_metagenome_pipeline(
                **pipeline_kwargs)

        # Stratified table first, then the unstratified one.
        pd.testing.assert_frame_equal(observed_strat, exp_strat_in_rare,
                                      check_like=True)
        pd.testing.assert_frame_equal(observed_unstrat, exp_unstrat_in,
                                      check_like=True)
コード例 #14
0
def main():
    '''Parse command-line arguments, run the metagenome pipeline and write
    both predicted tables as tab-separated files in args.out_dir.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.function, args.marker])

    # Get predicted functions stratified and unstratified by genomes.
    strat_pred, unstrat_pred = run_metagenome_pipeline(
        input_biom=args.input,
        function=args.function,
        marker=args.marker,
        out_dir=args.out_dir,
        max_nsti=args.max_nsti,
        proc=args.proc,
        output_normfile=True)

    # Each entry: (table, output filename, extra to_csv options). The
    # stratified table is written without its index; the unstratified table
    # uses the to_csv defaults.
    outputs = (
        (strat_pred, "pred_metagenome_strat.tsv", {"index": False}),
        (unstrat_pred, "pred_metagenome_unstrat.tsv", {}),
    )

    for table, filename, extra_opts in outputs:
        table.to_csv(path_or_buf=path.join(args.out_dir, filename),
                     sep="\t",
                     **extra_opts)
コード例 #15
0
ファイル: picrust2_pipeline.py プロジェクト: misazaa/picrust2
def main():
    '''Run the full PICRUSt2 pipeline from command-line arguments.

    The steps are: place study sequences onto the reference tree, run
    hidden-state prediction for every requested trait table plus the marker
    table, run the metagenome pipeline for every trait table and, unless
    --no_pathways is set, infer pathway abundances and coverages. All output
    files are written under args.output.
    '''

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    if args.custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the marker table is added to this mapping below,
        # which would otherwise modify the default_tables object defined
        # outside this function.
        func_tables = dict(default_tables)

    else:
        funcs = []
        func_tables = {}

        # Each custom table is identified by its filename without extension.
        for custom in args.custom_trait_tables.split(","):

            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table is the one used for pathway inference.
        # (str.split always returns at least one element.)
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}

    ci_setting = bool(args.confidence and args.hsp_method in discrete_set)

    gap_fill_opt = not args.no_gap_fill

    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # Only output NSTI in 16S table.
        nsti_setting = bool(func == "marker" and args.calculate_NSTI)

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        # Add "_nsti" to filename if output.
        if nsti_setting:
            count_outfile = path.join(args.output,
                                      func + "_nsti_predicted.tsv")
        else:
            count_outfile = path.join(args.output, func + "_predicted.tsv")

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile,
                         index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile,
                            index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker predictions are only an input to the other runs.
        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Infer metagenome abundances per-sample.
        # Pass arguments to key function and get predicted functions
        # stratified and unstratified by genomes.
        strat_pred, unstrat_pred = run_metagenome_pipeline(
            input_biom=args.input,
            function=func_infile,
            marker=marker_infile,
            out_dir=func_output_dir,
            max_nsti=args.max_nsti,
            min_reads=args.min_reads,
            min_samples=args.min_samples,
            strat_out=args.stratified,
            proc=args.threads,
            output_normfile=True)

        print("Writing metagenome output files for " + func + " to: " +
              func_output_dir)

        # Generate output table filepaths and write out pandas dataframe.
        unstrat_outfile = path.join(func_output_dir,
                                    "pred_metagenome_unstrat.tsv")

        # Move the function ids from the index into a "function" column.
        unstrat_pred.index.name = "function"
        unstrat_pred.reset_index(inplace=True)

        # Descriptions are only added for the default trait tables.
        if args.custom_trait_tables is None:
            unstrat_pred = add_descrip_col(inputfile=unstrat_pred,
                                           mapfile=default_map[func],
                                           in_df=True)

        unstrat_pred.to_csv(path_or_buf=unstrat_outfile,
                            sep="\t",
                            index=False)

        # Write out stratified table only if that option was specified.
        if args.stratified:
            strat_outfile = path.join(func_output_dir,
                                      "pred_metagenome_strat.tsv")
            strat_pred.reset_index(inplace=True)

            if args.custom_trait_tables is None:
                strat_pred = add_descrip_col(inputfile=strat_pred,
                                             mapfile=default_map[func],
                                             in_df=True)

            strat_pred.to_csv(path_or_buf=strat_outfile,
                              sep="\t",
                              index=False)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        # Pathway inference reads the metagenome table written above for the
        # reaction function (stratified version if that was requested).
        if args.stratified:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_strat.tsv")
        else:
            in_metagenome = path.join(args.output,
                                      rxn_func + "_metagenome_out",
                                      "pred_metagenome_unstrat.tsv")

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
                inputfile=in_metagenome,
                mapfile=default_pathway_map,
                regroup_mapfile=default_regroup_map,
                proc=args.threads,
                out_dir=temp_dir,
                gap_fill=gap_fill_opt,
                per_sequence_contrib=args.per_sequence_contrib,
                print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        # Write output files.
        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_abun.reset_index(inplace=True)

        if args.custom_trait_tables is None:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile,
                            sep="\t",
                            index=False)

        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")
        unstrat_cov.reset_index(inplace=True)

        if args.custom_trait_tables is None:
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile,
                           sep="\t",
                           index=False)

        # Write stratified output only if something besides None was returned.
        if strat_abun is not None:
            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")

            if args.custom_trait_tables is None:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)
            strat_abun.to_csv(path_or_buf=strat_abun_outfile,
                              sep="\t",
                              index=False)

        if strat_cov is not None:
            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")

            if args.custom_trait_tables is None:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov.to_csv(path_or_buf=strat_cov_outfile,
                             sep="\t",
                             index=False)

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in " + "%.2f" % elapsed_time +
          " seconds.")