コード例 #1
0
def main():
    '''Parse the command-line arguments, run the metagenome pipeline and
    write the predicted function tables as tab-delimited files into the
    output directory.'''

    args = parser.parse_args()

    # Check that input files exist.
    required_inputs = [args.input, args.function, args.marker]
    check_files_exist(required_inputs)

    # The pipeline hands back the stratified table first and the
    # unstratified table second.
    stratified, unstratified = run_metagenome_pipeline(
        input_biom=args.input,
        function=args.function,
        marker=args.marker,
        out_dir=args.out_dir,
        max_nsti=args.max_nsti,
        min_reads=args.min_reads,
        min_samples=args.min_samples,
        strat_out=args.strat_out,
        proc=args.proc,
        output_normfile=True)

    # The unstratified table is always written, with its index column
    # labelled "function".
    unstratified.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_unstrat.tsv"),
        sep="\t",
        index=True,
        index_label="function")

    # Nothing more to do unless the stratified table was requested.
    if not args.strat_out:
        return

    stratified.to_csv(
        path_or_buf=path.join(args.out_dir, "pred_metagenome_strat.tsv"),
        sep="\t",
        index=True)
コード例 #2
0
def main():
    '''Parse the command-line arguments and run the table conversion that
    was selected with the conversion option. Unrecognised conversion names
    result in no conversion being run.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist(args.input_files)

    mode = args.conversion

    if mode == 'contrib_to_legacy':
        # The relative abundance flag is the inverse of the raw abundance
        # option.
        rel_abun_set = not args.raw_abun
        contrib_to_legacy(args.input_files, args.output, rel_abun_set)

    elif mode in ('humann2_unstrat_to_picrust2',
                  'humann2_strat_to_picrust2'):
        # The last argument is True only for the stratified conversion.
        convert_humann2_to_picrust2(args.input_files, args.output,
                                    mode == 'humann2_strat_to_picrust2')

    elif mode in ('picrust2_unstrat_to_humann2_split',
                  'picrust2_strat_to_humann2_split'):
        # The last argument is True only for the stratified conversion.
        convert_picrust2_to_humann2(args.input_files, args.output,
                                    mode == 'picrust2_strat_to_humann2_split')

    elif mode == 'picrust2_to_humann2_merged':
        convert_picrust2_to_humann2_merged(args.input_files, args.output)
コード例 #3
0
def main():
    '''Parse the command-line arguments and run the sequence placement
    pipeline, keeping the intermediate files in the requested directory or
    in a temporary directory when none was given.'''

    args = parser.parse_args()

    # Check that input filenames exist.
    check_files_exist([args.study_fasta, args.ref_msa, args.tree])

    def run_placement(work_dir):
        '''Run the placement pipeline with work_dir as its output
        directory.'''
        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=args.out_tree,
                            threads=args.threads,
                            papara_output=args.papara_output,
                            out_dir=work_dir,
                            chunk_size=args.chunk_size,
                            print_cmds=args.print_cmds)

    # Without an intermediate directory the files go into a temporary
    # directory that only exists while the pipeline runs.
    if not args.intermediate:
        with TemporaryDirectory() as temp_dir:
            run_placement(temp_dir)
        return

    # Otherwise create the requested directory and output there.
    make_output_dir(args.intermediate)
    run_placement(args.intermediate)
コード例 #4
0
def main():
    '''Parse the command-line arguments, run the castor hidden-state
    prediction workflow on the chosen trait table and write the predicted
    table (plus the confidence interval table when enabled) as tab-delimited
    files named after the output prefix.'''

    args = parser.parse_args()

    # One of a default or a custom trait table has to be given.
    if not (args.in_trait or args.observed_trait_table):
        raise RuntimeError(
            "A default input trait table needs to be specified with the "
            "--in_trait option, or alternatively a custom table can be "
            "specified with the --observed_trait_table option")

    # A default table takes precedence over a custom one.
    if args.in_trait:
        trait_table = default_tables[args.in_trait]
    else:
        trait_table = args.observed_trait_table

    # Check that input filenames exist.
    check_files_exist([args.tree, trait_table])

    # Confidence intervals are only calculated when requested and when the
    # prediction method is one of the discrete trait methods.
    ci_setting = bool(args.confidence and
                      args.hsp_method in {'emp_prob', 'mp'})

    predictions_path = args.output_prefix + ".tsv"
    intervals_path = args.output_prefix + "_ci.tsv"

    predictions, intervals = castor_hsp_workflow(
        tree_path=args.tree,
        trait_table_path=trait_table,
        hsp_method=args.hsp_method,
        chunk_size=args.chunk_size,
        calc_nsti=args.calculate_NSTI,
        calc_ci=ci_setting,
        check_input=args.check,
        num_proc=args.processes,
        ran_seed=args.seed)

    # Output the table to file.
    make_output_dir_for_file(predictions_path)
    predictions.to_csv(path_or_buf=predictions_path,
                       index_label="sequence",
                       sep="\t")

    # The CI table is only written when it was calculated.
    if not ci_setting:
        return

    make_output_dir_for_file(intervals_path)
    intervals.to_csv(path_or_buf=intervals_path,
                     index_label="sequence",
                     sep="\t")
コード例 #5
0
ファイル: run_minpath.py プロジェクト: misazaa/picrust2
    def __init__(self, database=None, reaction_names=None):
        '''Load in the pathways data from the database file.

        database: path to a tab-delimited pathways file. When None, no file
            is read and the internal dictionaries are left empty.
        reaction_names: reaction names handed to _set_pathways_structure()
            when the database contains structured pathways. Defaults to an
            empty list.'''

        # A list literal as the default value would be shared between all
        # calls, so a new list is created here instead.
        if reaction_names is None:
            reaction_names = []

        self.__pathways_to_reactions = {}
        self.__reactions_to_pathways = {}
        self.__pathways_structure = {}
        self.__key_reactions = {}

        if database is not None:

            # Check that database file exists.
            check_files_exist([database])

            # The database is expected to contain a single line per pathway.
            # This line begins with the pathway name and is followed by all
            # reactions. Alternatively it could also contain a single pathway
            # and reaction link per line if the pathways aren't structured.
            reactions = defaultdict(list)
            structured_pathway = False

            # The context manager closes the file even if parsing raises.
            with open(database, "rt") as file_handle:
                for line in file_handle:
                    data = line.strip().split("\t")

                    # Lines with fewer than two fields are ignored.
                    if len(data) > 1:

                        # Remove pathway from this list.
                        pathway = data.pop(0)

                        # Add the remaining fields to the reactions already
                        # recorded for this pathway.
                        reactions[pathway] += data

                        # Check to see if this pathway has structure.
                        if "(" in data[0]:
                            structured_pathway = True

            # If this is a structured pathways set, then store the structure.
            if structured_pathway:
                reactions = self._set_pathways_structure(
                    reactions, reaction_names)

            self._store_pathways(reactions)
コード例 #6
0
ファイル: run_minpath.py プロジェクト: misazaa/picrust2
def read_reaction_names(reactions_database):
    '''Read in the reactions from a table that contains links between reactions
    and gene family ids. Will return a list of reactions (which are assumed to
    be the first field of each line after splitting by " "). Lines that contain
    only a single field are skipped.'''

    # Check that the input file exists.
    check_files_exist([reactions_database])

    reactions = []

    with open(reactions_database, "rt") as infile:
        for line in infile:
            line_split = line.strip().split(" ")
            if len(line_split) > 1:
                # Append the whole reaction id. Using "+=" with a string
                # would extend the list with its individual characters.
                reactions.append(line_split[0])

    return reactions
def main():
    '''Parse the command-line arguments, run the castor hidden-state
    prediction workflow on the chosen trait table and write the predicted
    table as a tab-delimited file to the requested output path.'''

    args = parser.parse_args()

    # One of a default or a custom trait table has to be given.
    if not args.in_trait and not args.observed_trait_table:
        raise RuntimeError(
            "A default input trait table needs to be specified with the "
            "--in_trait option, or alternatively a custom table can be "
            "specified with the --observed_trait_table option")

    # A default table takes precedence over a custom one.
    if args.in_trait:
        trait_table = default_tables[args.in_trait]
    else:
        trait_table = args.observed_trait_table

    # Check that input filenames exist.
    check_files_exist([args.tree, trait_table])

    # No longer support outputting CIs with this script, so CI calculation
    # is always switched off and the second returned table is not used.
    predictions, _ = castor_hsp_workflow(tree_path=args.tree,
                                         trait_table_path=trait_table,
                                         hsp_method=args.hsp_method,
                                         chunk_size=args.chunk_size,
                                         calc_nsti=args.calculate_NSTI,
                                         calc_ci=False,
                                         check_input=args.check,
                                         num_proc=args.processes,
                                         ran_seed=args.seed,
                                         verbose=args.verbose)

    # Output the table to file; compression is inferred by pandas from the
    # output filename.
    make_output_dir_for_file(args.output)
    predictions.to_csv(path_or_buf=args.output,
                       index_label="sequence",
                       sep="\t",
                       compression="infer")
コード例 #8
0
def main():
    '''Parse the command-line arguments, run the metagenome pipeline and
    write the predicted function tables as gzipped tab-delimited files into
    the output directory.'''

    args = parser.parse_args()

    check_files_exist([args.input, args.function])

    # The pipeline hands back the stratified table first and the
    # unstratified table second.
    stratified, unstratified = run_metagenome_pipeline(
        input_seqabun=args.input,
        function=args.function,
        max_nsti=args.max_nsti,
        marker=args.marker,
        out_dir=args.out_dir,
        min_reads=args.min_reads,
        min_samples=args.min_samples,
        strat_out=args.strat_out,
        wide_table=args.wide_table,
        skip_norm=args.skip_norm)

    def write_gz(table, basename, **csv_opts):
        '''Write table as a gzipped tab-delimited file in the output
        directory.'''
        table.to_csv(path_or_buf=path.join(args.out_dir, basename),
                     sep="\t",
                     compression="gzip",
                     **csv_opts)

    # The unstratified table is always written.
    write_gz(unstratified, "pred_metagenome_unstrat.tsv.gz",
             index=True, index_label="function")

    # The stratified table is optional; the wide format keeps the index
    # while the contributions format is written without it.
    if args.strat_out:
        if args.wide_table:
            write_gz(stratified, "pred_metagenome_strat.tsv.gz", index=True)
        else:
            write_gz(stratified, "pred_metagenome_contrib.tsv.gz",
                     index=False)
コード例 #9
0
def main():
    '''Parse the command-line arguments, run the metagenome pipeline and
    write both the stratified and the unstratified predicted function
    tables as tab-delimited files into the output directory.'''

    args = parser.parse_args()

    # Check that input files exist.
    required_inputs = [args.input, args.function, args.marker]
    check_files_exist(required_inputs)

    # The pipeline hands back the stratified table first and the
    # unstratified table second.
    stratified, unstratified = run_metagenome_pipeline(
        input_biom=args.input,
        function=args.function,
        marker=args.marker,
        out_dir=args.out_dir,
        max_nsti=args.max_nsti,
        proc=args.proc,
        output_normfile=True)

    # Both output paths live directly in the output directory.
    stratified_path = path.join(args.out_dir, "pred_metagenome_strat.tsv")
    unstratified_path = path.join(args.out_dir,
                                  "pred_metagenome_unstrat.tsv")

    # The stratified table is written without its index, the unstratified
    # table with it.
    stratified.to_csv(path_or_buf=stratified_path, sep="\t", index=False)
    unstratified.to_csv(path_or_buf=unstratified_path, sep="\t")
コード例 #10
0
def main():
    '''Parse the command-line arguments, run the MinPath pipeline and write
    the unstratified and stratified pathway tables as tab-delimited files
    named after the output prefix.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    def run_pipeline(work_dir):
        '''Run the MinPath pipeline with work_dir as its output directory
        and return its result.'''
        return run_minpath_pipeline(inputfile=args.input,
                                    mapfile=args.map,
                                    proc=args.proc,
                                    out_dir=work_dir,
                                    print_cmds=args.print_cmds)

    # Intermediate files go into the requested directory when one was
    # given, and into a temporary directory otherwise.
    if args.intermediate:
        make_output_dir(args.intermediate)
        unstrat_table, strat_table = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            unstrat_table, strat_table = run_pipeline(temp_dir)

    # Write output files.
    unstrat_path = args.out_prefix + "_unstrat_path.tsv"
    strat_path = args.out_prefix + "_strat_path.tsv"

    unstrat_table.to_csv(path_or_buf=unstrat_path,
                         sep="\t",
                         index_label="pathway")

    # The stratified table is written without its index.
    strat_table.to_csv(path_or_buf=strat_path, sep="\t", index=False)
コード例 #11
0
def main():
    '''Parse the command-line arguments and run the table conversion that
    was selected with the conversion option. Unrecognised conversion names
    result in no conversion being run.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist(args.input_files)

    mode = args.conversion

    if mode in ('humann2_unstrat_to_picrust2', 'humann2_strat_to_picrust2'):
        # The last argument is True only for the stratified conversion.
        convert_humann2_to_picrust2(args.input_files, args.output,
                                    mode == 'humann2_strat_to_picrust2')

    elif mode in ('picrust2_unstrat_to_humann2_split',
                  'picrust2_strat_to_humann2_split'):
        # The last argument is True only for the stratified conversion.
        convert_picrust2_to_humann2(args.input_files, args.output,
                                    mode == 'picrust2_strat_to_humann2_split')

    elif mode == 'picrust2_to_humann2_merged':
        convert_picrust2_to_humann2_merged(args.input_files, args.output)
コード例 #12
0
def run_metagenome_pipeline(input_seqabun,
                            function,
                            max_nsti,
                            marker=None,
                            min_reads=1,
                            min_samples=1,
                            strat_out=False,
                            wide_table=False,
                            skip_norm=False,
                            out_dir='metagenome_out'):
    '''Run the full metagenome pipeline by chaining the modular helper
    functions. Returns the predicted metagenome stratified by contributing
    sequence followed by the unstratified prediction. When strat_out is not
    set the first returned element is None. When both strat_out and
    wide_table are set the return value of strat_funcs_by_samples() is
    passed through unchanged. Exits if the marker table and the skip_norm
    option are either both given or both missing.'''

    # Exactly one of the marker table and the skip_norm option is allowed.
    if skip_norm:
        if marker:
            sys.exit("Table of predicted marker gene copy numbers should "
                     "not be specified when --skip_norm option is set.")
    elif not marker:
        sys.exit("Table of predicted marker gene copy numbers is required "
                 "unless --skip_norm is specified.")

    make_output_dir(out_dir)

    # Stays empty unless one of the prediction tables has an NSTI column.
    nsti_table = pd.DataFrame()

    seq_counts = read_seqabun(input_seqabun)

    func_table = pd.read_csv(function, sep="\t", dtype={'sequence': str})
    func_table.set_index('sequence', drop=True, inplace=True)

    nsti_column = 'metadata_NSTI'

    # When the NSTI column is present, drop_tips_by_nsti() is applied with
    # the given maximum and hands back the table plus the NSTI values.
    if nsti_column in func_table.columns:
        func_table, nsti_table = drop_tips_by_nsti(tab=func_table,
                                                   nsti_col=nsti_column,
                                                   max_nsti=max_nsti)

    if skip_norm:
        # Without normalization only the function table and the sequence
        # counts need to be restricted to their shared, sorted ids.
        shared_ids = func_table.index.intersection(
            seq_counts.index).sort_values()

        if len(shared_ids) == 0:
            sys.exit("No sequence ids overlap between both input files.")

        func_table = func_table.reindex(shared_ids)
        seq_counts = seq_counts.reindex(shared_ids)
    else:
        check_files_exist([marker])
        marker_table = pd.read_csv(marker, sep="\t",
                                   dtype={'sequence': str})
        marker_table.set_index('sequence', drop=True, inplace=True)

        # NSTI values taken from the marker table replace any taken from
        # the function table.
        if nsti_column in marker_table.columns:
            marker_table, nsti_table = drop_tips_by_nsti(
                tab=marker_table,
                nsti_col=nsti_column,
                max_nsti=max_nsti)

        # Bring all three tables onto the same sequence ids and ordering.
        seq_counts, func_table, marker_table = three_df_index_overlap_sort(
            seq_counts, func_table, marker_table)

        # Normalize the study sequence abundances by the predicted marker
        # gene copies; the filename for the normalized table is passed on.
        seq_counts = norm_by_marker_copies(
            input_seq_counts=seq_counts,
            input_marker_num=marker_table,
            norm_filename=path.join(out_dir, "seqtab_norm.tsv.gz"))

    # If NSTI values were collected then output weighted NSTI values.
    if not nsti_table.empty:
        calc_weighted_nsti(seq_counts=seq_counts,
                           nsti_input=nsti_table,
                           outfile=path.join(out_dir,
                                             "weighted_nsti.tsv.gz"))

    # Without stratified output only the unstratified table is generated.
    if not strat_out:
        return (None,
                unstrat_funcs_only_by_samples(func_table, seq_counts))

    # Rare sequences are only identified when at least one of the cutoffs
    # differs from its default of 1.
    rare_seqs = []

    if min_reads != 1 or min_samples != 1:
        rare_seqs = id_rare_seqs(in_counts=seq_counts,
                                 min_reads=min_reads,
                                 min_samples=min_samples)

    if wide_table:
        return strat_funcs_by_samples(func_table, seq_counts, rare_seqs)

    return (metagenome_contributions(func_table, seq_counts, rare_seqs),
            unstrat_funcs_only_by_samples(func_table, seq_counts))
コード例 #13
0
def main():
    '''Parse the command-line arguments, run the pathway pipeline and write
    every table it produced as a gzipped tab-delimited file into the output
    directory.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    # The pipeline takes positive flags, the command line negative ones.
    gap_fill_opt = not args.no_gap_fill
    run_minpath_opt = not args.skip_minpath

    def run_pipeline(work_dir):
        '''Run the pathway pipeline with work_dir as its output directory
        and return its result.'''
        return pathway_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=args.regroup_map,
            proc=args.processes,
            out_dir=work_dir,
            run_minpath=run_minpath_opt,
            coverage=args.coverage,
            gap_fill_on=gap_fill_opt,
            no_regroup=args.no_regroup,
            per_sequence_contrib=args.per_sequence_contrib,
            per_sequence_abun=args.per_sequence_abun,
            per_sequence_function=args.per_sequence_function,
            wide_table=args.wide_table,
            print_cmds=args.print_cmds)

    # Intermediate files go into the requested directory when one was
    # given, and into a temporary directory otherwise.
    if args.intermediate:
        make_output_dir(args.intermediate)
        results = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            results = run_pipeline(temp_dir)

    (unstrat_abun, unstrat_cov, strat_abun, strat_cov,
     path_abun_by_seq, path_cov_by_seq, unstrat_abun_per_seq) = results

    make_output_dir(args.out_dir)

    def write_gz(table, basename, **csv_opts):
        '''Write table as a gzipped tab-delimited file in the output
        directory.'''
        table.to_csv(path_or_buf=path.join(args.out_dir, basename),
                     sep="\t",
                     compression="gzip",
                     **csv_opts)

    # The unstratified abundance table will always be written, but the
    # other files will only be written if applicable.
    write_gz(unstrat_abun, "path_abun_unstrat.tsv.gz",
             index_label="pathway")

    if args.coverage:
        write_gz(unstrat_cov, "path_cov_unstrat.tsv.gz",
                 index_label="pathway")

    # The filename of the stratified tables depends on the table format.
    if strat_abun is not None:
        if args.wide_table:
            strat_abun_name = "path_abun_strat.tsv.gz"
        else:
            strat_abun_name = "path_abun_contrib.tsv.gz"

        write_gz(strat_abun, strat_abun_name, index=False)

    if args.coverage and strat_cov is not None:
        if args.wide_table:
            strat_cov_name = "path_cov_strat.tsv.gz"
        else:
            strat_cov_name = "path_cov_contrib.tsv.gz"

        write_gz(strat_cov, strat_cov_name, index=False)

    # Per-sequence tables are written with their index labelled "sequence".
    if path_abun_by_seq is not None:
        write_gz(path_abun_by_seq, "path_abun_predictions.tsv.gz",
                 index=True, index_label="sequence")

    if args.coverage and path_cov_by_seq is not None:
        write_gz(path_cov_by_seq, "path_cov_predictions.tsv.gz",
                 index=True, index_label="sequence")

    if unstrat_abun_per_seq is not None:
        write_gz(unstrat_abun_per_seq, "path_abun_unstrat_per_seq.tsv.gz",
                 index_label="pathway")
コード例 #14
0
ファイル: pathway_pipeline.py プロジェクト: semir2/picrust2
def main():
    '''Parse the command-line arguments, run the pathway pipeline and write
    the returned abundance and coverage tables as gzipped tab-delimited
    files into the output directory.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    # The pipeline takes positive flags, the command line negative ones.
    gap_fill_opt = not args.no_gap_fill
    run_minpath_opt = not args.skip_minpath

    def run_pipeline(work_dir):
        '''Run the pathway pipeline with work_dir as its output directory
        and return its result.'''
        return pathway_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=args.regroup_map,
            proc=args.proc,
            out_dir=work_dir,
            run_minpath=run_minpath_opt,
            coverage=args.coverage,
            gap_fill_on=gap_fill_opt,
            no_regroup=args.no_regroup,
            per_sequence_contrib=args.per_sequence_contrib,
            per_sequence_abun=args.per_sequence_abun,
            per_sequence_function=args.per_sequence_function,
            print_cmds=args.print_cmds)

    # Intermediate files go into the requested directory when one was
    # given, and into a temporary directory otherwise.
    if args.intermediate:
        make_output_dir(args.intermediate)
        results = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            results = run_pipeline(temp_dir)

    unstrat_abun, unstrat_cov, strat_abun, strat_cov = results

    make_output_dir(args.out_dir)

    def write_gz(table, basename, **csv_opts):
        '''Write table as a gzipped tab-delimited file in the output
        directory.'''
        table.to_csv(path_or_buf=path.join(args.out_dir, basename),
                     sep="\t",
                     compression="gzip",
                     **csv_opts)

    # The unstratified abundance table is always written.
    write_gz(unstrat_abun, "path_abun_unstrat.tsv.gz",
             index_label="pathway")

    if args.coverage:
        write_gz(unstrat_cov, "path_cov_unstrat.tsv.gz",
                 index_label="pathway")

    # Write stratified output only if something besides None was returned.
    if strat_abun is not None:
        write_gz(strat_abun, "path_abun_strat.tsv.gz", index=False)

    if args.coverage and strat_cov is not None:
        write_gz(strat_cov, "path_cov_strat.tsv.gz", index=False)
コード例 #15
0
def pathway_pipeline(inputfile,
                     mapfile,
                     out_dir,
                     proc=1,
                     run_minpath=True,
                     coverage=False,
                     no_regroup=False,
                     regroup_mapfile=None,
                     gap_fill_on=True,
                     per_sequence_contrib=False,
                     per_sequence_abun=None,
                     per_sequence_function=None,
                     print_cmds=False):
    '''Read the input function table, optionally regroup its function ids,
    and compute per-sample pathway abundances (and coverages if requested).

    Intermediate files are written under out_dir: "regrouped_infile.tsv"
    when a regroup mapfile is used, "parsed_mapfile.tsv" when run_minpath is
    set, plus the "minpath_running" working directory (and
    "minpath_running_per_seq" when per_sequence_contrib is set).

    Returns a 4-tuple:
      (1) unstratified pathway abundances,
      (2) unstratified pathway coverages (None unless coverage is set),
      (3) stratified pathway abundances (None unless the input table is
          stratified or per_sequence_contrib is set),
      (4) stratified pathway coverages (None unless per_sequence_contrib is
          set; in that case whatever per_sequence_contrib_levels returns).

    Calls sys.exit() if per_sequence_contrib is set without both
    per_sequence_abun and per_sequence_function.'''

    # The no_regroup flag overrides any regrouping mapfile that was passed.
    if no_regroup:
        regroup_mapfile = None

    # Load the gene family abundance table; the reader also reports whether
    # the table is in stratified format.
    in_metagenome, strat_format = read_metagenome_input(inputfile)

    # Sanity checks that only apply when --per_sequence_contrib is set.
    if per_sequence_contrib:

        # Both per-sequence input files are mandatory in this mode.
        if not (per_sequence_abun and per_sequence_function):
            sys.exit('Error: "per_sequence_contrib" option set, but at '
                     'least one of "per_sequence_abun" or '
                     '"per_sequence_function" were not set. These input '
                     'arguments need to be specified when '
                     '"per_sequence_contrib" is used')

        # ... and they must exist on disk.
        check_files_exist([per_sequence_abun, per_sequence_function])

    # The optional 'description' column is not a sample, so discard it.
    if "description" in in_metagenome.columns:
        in_metagenome.drop("description", axis=1, inplace=True)

    # Every remaining column other than the id columns is a sample.
    id_columns = ("function", "sequence")
    samples = [
        column for column in in_metagenome.columns
        if column not in id_columns
    ]

    # Regroup functions in the input table to different ids if a regroup
    # mapfile is provided; otherwise no reaction names are passed on.
    if regroup_mapfile:
        reactions = read_reaction_names(regroup_mapfile)

        in_metagenome = regroup_func_ids(in_metagenome, strat_format,
                                         regroup_mapfile, proc)

        # Keep a copy of the regrouped table alongside the other
        # intermediate files.
        in_metagenome.to_csv(
            path_or_buf=path.join(out_dir, "regrouped_infile.tsv"),
            sep="\t",
            index=False)
    else:
        reactions = []

    # Read in pathway structures.
    pathways_in = PathwaysDatabase(database=mapfile, reaction_names=reactions)

    # MinPath needs the mapfile with all structure removed; it is only
    # written when MinPath will actually be run.
    minpath_mapfile = None

    if run_minpath:
        minpath_mapfile = path.join(out_dir, "parsed_mapfile.tsv")
        with open(minpath_mapfile, "w") as out_map:
            out_map.write(pathways_in.get_database())

    # Keep only the rows whose function is present in the pathway database.
    in_database = in_metagenome.function.isin(pathways_in.reaction_list())
    in_metagenome = in_metagenome[in_database]

    # All outputs except the unstratified abundance default to None.
    path_cov_unstrat = None
    path_cov_strat = None
    path_abun_strat = None

    minpath_out_dir = path.join(out_dir, "minpath_running")
    make_output_dir(minpath_out_dir)

    if strat_format:

        # Stratified input: each sample is processed on the id columns plus
        # its own abundance column. Stratified tables are only returned by
        # this step if per_sequence_contrib=False (extra step below).
        per_sample_out = Parallel(n_jobs=proc)(
            delayed(basic_strat_pathway_levels)(
                sample_id,
                in_metagenome[["function", "sequence", sample_id]],
                minpath_mapfile, minpath_out_dir, pathways_in, run_minpath,
                coverage, gap_fill_on, per_sequence_contrib, print_cmds)
            for sample_id in samples)

        if not per_sequence_contrib:
            # Third element of each per-sample result is the stratified
            # abundance; prep it the same way as the unstratified tables.
            raw_abun_strat = [result[2] for result in per_sample_out]

            path_abun_strat = prep_pathway_df_out(raw_abun_strat,
                                                  strat_index=True)
            path_abun_strat.columns = ["pathway", "sequence"] + samples
            path_abun_strat.sort_values(['pathway', 'sequence'], inplace=True)

    else:

        # Unstratified input is more straightforward to process.
        per_sample_out = Parallel(n_jobs=proc)(
            delayed(unstrat_pathway_levels)(
                sample_id,
                in_metagenome[["function", sample_id]],
                minpath_mapfile, minpath_out_dir, pathways_in, run_minpath,
                coverage, gap_fill_on, print_cmds)
            for sample_id in samples)

    # In both formats the first two elements of each per-sample result are
    # the unstratified abundance and coverage respectively.
    raw_abun_unstrat = [result[0] for result in per_sample_out]
    raw_cov_unstrat = [result[1] for result in per_sample_out]

    # Build the unstratified abundance table: one column per sample, rows
    # sorted by index name.
    path_abun_unstrat = prep_pathway_df_out(raw_abun_unstrat)
    path_abun_unstrat.columns = samples
    path_abun_unstrat.sort_index(axis=0, inplace=True)

    # Same for the coverage table, but only when coverage was requested.
    if coverage:
        path_cov_unstrat = prep_pathway_df_out(raw_cov_unstrat,
                                               num_digits=10)
        path_cov_unstrat.columns = samples
        path_cov_unstrat.sort_index(axis=0, inplace=True)

    # Calculate pathway levels for each individual sequence (in parallel)
    # and then multiply this table by the abundance of each sequence
    # within each sample (using same approach as in metagenome pipeline).
    if per_sequence_contrib:

        per_seq_out_dir = path.join(out_dir, "minpath_running_per_seq")
        make_output_dir(per_seq_out_dir)

        path_abun_strat, path_cov_strat = per_sequence_contrib_levels(
            sequence_abun=per_sequence_abun,
            sequence_func=per_sequence_function,
            minpath_map=minpath_mapfile,
            per_seq_out_dir=per_seq_out_dir,
            pathway_db=pathways_in,
            run_minpath=run_minpath,
            calc_coverage=coverage,
            gap_fill_on=gap_fill_on,
            nproc=proc,
            regroup_map=regroup_mapfile,
            print_opt=print_cmds)

    return (path_abun_unstrat, path_cov_unstrat, path_abun_strat,
            path_cov_strat)
def full_pipeline(study_fasta, input_table, output_folder, processes, ref_dir,
                  in_traits, custom_trait_tables, marker_gene_table,
                  pathway_map, rxn_func, no_pathways, regroup_map, no_regroup,
                  stratified, max_nsti, min_reads, min_samples, hsp_method,
                  min_align, skip_nsti, skip_minpath, no_gap_fill, coverage,
                  per_sequence_contrib, wide_table, skip_norm,
                  remove_intermediate, verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Each step is run by calling the corresponding command-line script
    (place_seqs.py, hsp.py, metagenome_pipeline.py and pathway_pipeline.py).
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    Returns a tuple (func_output, pathway_outfiles):

    func_output -- dict mapping each function id (excluding "marker") to a
        two-element list: [unstratified outfile path, stratified outfile
        path or None if stratified is not set].
    pathway_outfiles -- None if no_pathways is set. Otherwise a dict with
        the keys "unstrat_abun", "unstrat_cov", "strat_abun" and
        "strat_cov" whose values are expected output paths (or None).

    Calls sys.exit() if an unknown default trait category is specified or
    if output_folder already exists.'''

    # Throw warning if --per_sequence_contrib set but --stratified unset.
    if per_sequence_contrib and not stratified:
        print(
            "\nThe option --per_sequence_contrib was set, but not the option "
            "--stratified. This means that a stratified pathway table will "
            "be output only (i.e. a stratified metagenome table will NOT "
            "be output).\n",
            file=sys.stderr)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func + " is not " +
                         "one of the default categories.")

        # Work on a copy: entries are added to func_tables below and the
        # shared module-level default_tables must not be modified.
        func_tables = dict(default_tables)

    else:
        # Split paths to input custom trait tables and take the basename to be
        # the function id.
        funcs = []
        func_tables = {}

        for custom in custom_trait_tables.split(","):

            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

    # Add reaction function to be in set of gene families if it is not already
    # and as long as pathways are also to be predicted.
    if rxn_func not in funcs and not no_pathways:
        orig_rxn_func = rxn_func
        rxn_func = path.splitext(path.basename(rxn_func))[0]

        # rxn_func may have been given as a path whose basename matches an
        # id that is already listed; do not list it (and run it) twice.
        if rxn_func not in funcs:
            funcs.append(rxn_func)

        if rxn_func not in func_tables:
            func_tables[rxn_func] = orig_rxn_func

    if not skip_norm:
        # Append marker as well, since this also needs to be run.
        funcs.append("marker")
        func_tables["marker"] = marker_gene_table

    # Check that all input files exist.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)
    files2check = [study_fasta, input_table, ref_msa, tree, hmm, model] + list(
        func_tables.values())

    if not no_pathways:
        files2check.append(pathway_map)

        # Throw warning if default pathway mapfile used with non-default
        # reference files.
        if pathway_map == default_pathway_map and ref_dir != default_ref_dir:
            print(
                "Warning - non-default reference files specified with "
                "default pathway mapfile of prokaryote-specific MetaCyc "
                "pathways (--pathway_map option). This usage may be "
                "unintended.",
                file=sys.stderr)

        if not no_regroup:
            files2check.append(regroup_map)

    # This will throw an error if any input files are not found.
    check_files_exist(files2check)

    # Check that sequence names in FASTA overlap with input table.
    check_overlapping_seqs(study_fasta, input_table, verbose)

    if path.exists(output_folder):
        sys.exit("Stopping since output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    if verbose:
        print("Placing sequences onto reference tree", file=sys.stderr)

    # Define folders for intermediate files (unless --remove_intermediate set).
    # An empty string is passed on to the scripts when intermediate files
    # are not to be kept.
    if remove_intermediate:
        place_seqs_intermediate = ""
        pathways_intermediate = ""
    else:
        intermediate_dir = path.join(output_folder, "intermediate")
        make_output_dir(intermediate_dir)
        place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
        pathways_intermediate = path.join(intermediate_dir, "pathways")

    # Run place_seqs.py.
    place_seqs_cmd = [
        "place_seqs.py", "--study_fasta", study_fasta, "--ref_dir", ref_dir,
        "--out_tree", out_tree, "--processes",
        str(processes), "--intermediate", place_seqs_intermediate,
        "--min_align",
        str(min_align), "--chunk_size",
        str(5000)
    ]

    if verbose:
        place_seqs_cmd.append("--verbose")

    system_call_check(place_seqs_cmd,
                      print_command=verbose,
                      print_stdout=verbose,
                      print_stderr=True)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree,
              file=sys.stderr)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    if not skip_norm:
        # Make sure marker database is first in the list. This is because this will
        # be run on a single core and so will be easier to identify any errors
        # if the program exits when working on this function type.
        funcs.insert(0, funcs.pop(funcs.index("marker")))

    for func in funcs:
        # NSTI is calculated alongside the marker predictions, or alongside
        # every function when normalization (and so the marker) is skipped.
        calc_nsti = not skip_nsti and (func == "marker" or skip_norm)

        # Change output filename for NSTI and non-NSTI containing files.
        hsp_outfile = path.join(output_folder, func + "_predicted")

        if calc_nsti:
            hsp_outfile = hsp_outfile + "_and_nsti.tsv.gz"
        else:
            hsp_outfile = hsp_outfile + ".tsv.gz"

        # Keep track of output filename for next step of pipeline.
        predicted_funcs[func] = hsp_outfile

        # Run hsp.py for each function database.
        hsp_cmd = [
            "hsp.py", "--tree", out_tree, "--output", hsp_outfile,
            "--observed_trait_table", func_tables[func], "--hsp_method",
            hsp_method, "--seed", "100"
        ]

        # Add flags to command if specified.
        if calc_nsti:
            hsp_cmd.append("--calculate_NSTI")

        # Run marker on only 1 processor.
        if func == "marker":
            hsp_cmd += ["--processes", "1"]
        else:
            hsp_cmd += ["--processes", str(processes)]

        if verbose:
            hsp_cmd.append("--verbose")

        system_call_check(hsp_cmd,
                          print_command=verbose,
                          print_stdout=verbose,
                          print_stderr=True)

    # Now run metagenome pipeline commands.
    # Initialize dictionary of function names --> metagenome output files.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker predictions are only an input for normalization.
        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func, file=sys.stderr)

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        metagenome_pipeline_cmd = [
            "metagenome_pipeline.py", "--input", input_table, "--function",
            predicted_funcs[func], "--min_reads",
            str(min_reads), "--min_samples",
            str(min_samples), "--out_dir", func_output_dir
        ]

        # Two-element list as value for each function: first value is the
        # unstratified output and second is the stratified output (left as
        # None unless stratified output was requested).
        func_output[func] = [
            path.join(func_output_dir, "pred_metagenome_unstrat.tsv.gz"),
            None
        ]

        if wide_table:
            metagenome_pipeline_cmd.append("--wide_table")

        if not skip_nsti:
            metagenome_pipeline_cmd += ["--max_nsti", str(max_nsti)]

        if skip_norm:
            metagenome_pipeline_cmd.append("--skip_norm")
        else:
            metagenome_pipeline_cmd += ["--marker", predicted_funcs["marker"]]

        if stratified:
            metagenome_pipeline_cmd.append("--strat_out")

            # The stratified filename depends on the table layout.
            if wide_table:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_strat.tsv.gz")
            else:
                func_output[func][1] = path.join(
                    func_output_dir, "pred_metagenome_contrib.tsv.gz")

        system_call_check(metagenome_pipeline_cmd,
                          print_command=verbose,
                          print_stdout=verbose,
                          print_stderr=True)

    # Now infer pathway abundances and coverages unless --no_pathways set.
    pathway_outfiles = None

    if not no_pathways:

        path_output_dir = path.join(output_folder, "pathways_out")

        if verbose:
            # Status messages go to stderr like all others in this function.
            print("Inferring pathways from predicted " + rxn_func,
                  file=sys.stderr)

        # Determine whether stratified or unstratified table should be input.
        if not stratified or per_sequence_contrib:
            rxn_input_metagenome = func_output[rxn_func][0]
        else:
            rxn_input_metagenome = func_output[rxn_func][1]

        pathway_pipeline_cmd = [
            "pathway_pipeline.py", "--input", rxn_input_metagenome,
            "--out_dir", path_output_dir, "--map", pathway_map,
            "--intermediate", pathways_intermediate, "--proc",
            str(processes)
        ]

        if no_gap_fill:
            pathway_pipeline_cmd.append("--no_gap_fill")

        if skip_minpath:
            pathway_pipeline_cmd.append("--skip_minpath")

        if coverage:
            pathway_pipeline_cmd.append("--coverage")

        if no_regroup:
            pathway_pipeline_cmd.append("--no_regroup")
        else:
            pathway_pipeline_cmd += ["--regroup_map", regroup_map]

        if wide_table:
            pathway_pipeline_cmd.append("--wide_table")

        if per_sequence_contrib:
            pathway_pipeline_cmd.append("--per_sequence_contrib")

            # Use the raw input table as the per-sequence abundances when
            # normalization was skipped, otherwise the normalized table in
            # the reaction function's metagenome output folder.
            if skip_norm:
                norm_sequence_abun = input_table
            else:
                norm_sequence_abun = path.join(output_folder,
                                               rxn_func + "_metagenome_out",
                                               "seqtab_norm.tsv.gz")

            pathway_pipeline_cmd += ["--per_sequence_abun", norm_sequence_abun]

            pathway_pipeline_cmd += [
                "--per_sequence_function", predicted_funcs[rxn_func]
            ]

        if verbose:
            pathway_pipeline_cmd.append("--verbose")

        system_call_check(pathway_pipeline_cmd,
                          print_command=verbose,
                          print_stdout=False,
                          print_stderr=True)

        if verbose:
            print("Wrote predicted pathway abundances and coverages to " +
                  path_output_dir,
                  file=sys.stderr)

        # Keep track of output filenames if this function is being used in
        # a non-default way (e.g. with a QIIME2 plugin).
        # NOTE(review): the coverage paths are recorded regardless of the
        # "coverage" flag - confirm that callers cope with these files
        # possibly not having been written.
        pathway_outfiles = {
            "unstrat_abun": path.join(path_output_dir,
                                      "path_abun_unstrat.tsv.gz"),
            "unstrat_cov": path.join(path_output_dir,
                                     "path_cov_unstrat.tsv.gz"),
            "strat_abun": None,
            "strat_cov": None
        }

        if stratified or per_sequence_contrib:
            # Stratified filenames end in "strat" for wide tables and in
            # "contrib" otherwise.
            if wide_table:
                strat_label = "strat"
            else:
                strat_label = "contrib"

            pathway_outfiles["strat_abun"] = path.join(
                path_output_dir, "path_abun_" + strat_label + ".tsv.gz")

            if per_sequence_contrib:
                pathway_outfiles["strat_cov"] = path.join(
                    path_output_dir, "path_cov_" + strat_label + ".tsv.gz")

    return (func_output, pathway_outfiles)
コード例 #17
0
def full_pipeline(study_fasta, input_table, output_folder, threads, ref_msa,
                  tree, hmm, in_traits, custom_trait_tables, marker_gene_table,
                  pathway_map, no_pathways, regroup_map, no_regroup,
                  stratified, alignment_tool, max_nsti, min_reads, min_samples,
                  hsp_method, calculate_NSTI, confidence, seed, no_gap_fill,
                  per_sequence_contrib, no_descrip, verbose):
    '''Function that contains wrapper commands for full PICRUSt2 pipeline.
    Descriptions of all of these input arguments/options are given in the
    picrust2_pipeline.py script.

    Returns a tuple (func_output, pathway_outfiles):

    func_output -- dict mapping each function id (excluding "marker") to the
        stratified output of metagenome_pipeline_steps if stratified is set
        and to the unstratified output otherwise.
    pathway_outfiles -- None if no_pathways is set. Otherwise a dict with
        the keys "unstrat_abun", "unstrat_cov", "strat_abun" and
        "strat_cov" giving the paths of the written pathway tables (the
        stratified entries are None when no such table was produced).

    Calls sys.exit() if an unknown default trait category is specified or
    if output_folder already exists.'''

    # Check that input files exist.
    check_files_exist([study_fasta, input_table])

    if path.exists(output_folder):
        sys.exit("Stopping - output directory " + output_folder +
                 " already exists.")

    # Make output folder.
    make_output_dir(output_folder)

    out_tree = path.join(output_folder, "out.tre")

    if custom_trait_tables is None:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the marker table is added to func_tables below
        # and the shared module-level default_tables must not be modified.
        func_tables = dict(default_tables)

    else:

        # Descriptions are never added when custom trait tables are used.
        no_descrip = True

        funcs = []
        func_tables = {}

        # The basename (without extension) of each table is its function id.
        for custom in custom_trait_tables.split(","):

            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table is the one used for pathway inference.
        # (str.split always yields at least one element.)
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = set(['emp_prob', 'mp'])

    # Confidence intervals are only enabled for those methods.
    ci_setting = bool(confidence) and hsp_method in discrete_set

    gap_fill_opt = not no_gap_fill

    if verbose:
        print("Placing sequences onto reference tree.")

    # Define folders for intermediate files.
    intermediate_dir = path.join(output_folder, "intermediate")
    place_seqs_intermediate = path.join(intermediate_dir, "place_seqs")
    make_output_dir(intermediate_dir)
    make_output_dir(place_seqs_intermediate)

    place_seqs_pipeline(study_fasta=study_fasta,
                        ref_msa=ref_msa,
                        tree=tree,
                        hmm=hmm,
                        out_tree=out_tree,
                        alignment_tool=alignment_tool,
                        threads=threads,
                        out_dir=place_seqs_intermediate,
                        chunk_size=5000,
                        print_cmds=verbose)

    if verbose:
        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        count_outfile = hsp_pipeline_steps(func=func,
                                           calculate_NSTI=calculate_NSTI,
                                           out_tree=out_tree,
                                           func_table_in=func_tables[func],
                                           hsp_method=hsp_method,
                                           ci_setting=ci_setting,
                                           threads=threads,
                                           seed=seed,
                                           output_folder=output_folder,
                                           verbose=verbose)

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

    marker_infile = predicted_funcs["marker"]

    # Initialize dictionary of function names to metagenome pipeline output
    # to return. Each value is the stratified output if stratified is set
    # and the unstratified output otherwise.
    func_output = {}

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        # The marker predictions are only an input for the other functions.
        if func == "marker":
            continue

        if verbose:
            print("Running metagenome pipeline for " + func)

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(output_folder, func + "_metagenome_out")

        # Use the default mapfile for this function if there is one.
        func_map = None

        if func in default_map:
            func_map = default_map[func]

        func_strat_out, func_unstrat_out = metagenome_pipeline_steps(
            input_table=input_table,
            func_infile=func_infile,
            marker_infile=marker_infile,
            func_output_dir=func_output_dir,
            no_descrip=no_descrip,
            max_nsti=max_nsti,
            min_reads=min_reads,
            min_samples=min_samples,
            stratified=stratified,
            threads=threads,
            func_map=func_map,
            verbose=verbose)

        if stratified:
            func_output[func] = func_strat_out
        else:
            func_output[func] = func_unstrat_out

    pathway_outfiles = None

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not no_pathways:

        pathways_intermediate = path.join(intermediate_dir, "pathways")
        make_output_dir(pathways_intermediate)

        if verbose:
            print("Inferring pathways from predicted " + rxn_func)

        predicted_rxn = func_output[rxn_func]

        # Set regrouping mapfile to be empty if no_regroup set.
        if no_regroup:
            regroup_map = None

        unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
            inputfile=predicted_rxn,
            mapfile=pathway_map,
            regroup_mapfile=regroup_map,
            proc=threads,
            out_dir=pathways_intermediate,
            gap_fill=gap_fill_opt,
            per_sequence_contrib=per_sequence_contrib,
            print_cmds=verbose)

        pathways_out = path.join(output_folder, "pathways_out")

        # Turn the index into a regular "pathway" column so that the tables
        # can be written out with index=False below.
        unstrat_abun.index.name = 'pathway'
        unstrat_cov.index.name = 'pathway'
        unstrat_abun.reset_index(inplace=True)
        unstrat_cov.reset_index(inplace=True)

        pathway_outfiles = {}

        if not no_descrip:
            unstrat_abun = add_descrip_col(inputfile=unstrat_abun,
                                           mapfile=default_map["METACYC"],
                                           in_df=True)
            unstrat_cov = add_descrip_col(inputfile=unstrat_cov,
                                          mapfile=default_map["METACYC"],
                                          in_df=True)

        if verbose:
            print("Writing predicted pathway abundances and coverages to " +
                  pathways_out)

        make_output_dir(pathways_out)

        unstrat_abun_outfile = path.join(pathways_out, "path_abun_unstrat.tsv")
        unstrat_cov_outfile = path.join(pathways_out, "path_cov_unstrat.tsv")

        unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile,
                            sep="\t",
                            index=False)
        unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile,
                           sep="\t",
                           index=False)

        pathway_outfiles["unstrat_abun"] = unstrat_abun_outfile
        pathway_outfiles["unstrat_cov"] = unstrat_cov_outfile

        strat_abun_outfile = None
        strat_cov_outfile = None

        # Write stratified output only if something besides None was returned.
        if strat_abun is not None:

            if not no_descrip:
                strat_abun = add_descrip_col(inputfile=strat_abun,
                                             mapfile=default_map["METACYC"],
                                             in_df=True)

            strat_abun_outfile = path.join(pathways_out, "path_abun_strat.tsv")
            strat_abun.to_csv(path_or_buf=strat_abun_outfile,
                              sep="\t",
                              index=False)

        if strat_cov is not None:

            if not no_descrip:
                strat_cov = add_descrip_col(inputfile=strat_cov,
                                            mapfile=default_map["METACYC"],
                                            in_df=True)

            strat_cov_outfile = path.join(pathways_out, "path_cov_strat.tsv")
            strat_cov.to_csv(path_or_buf=strat_cov_outfile,
                             sep="\t",
                             index=False)

        pathway_outfiles["strat_abun"] = strat_abun_outfile
        pathway_outfiles["strat_cov"] = strat_cov_outfile

    return (func_output, pathway_outfiles)
コード例 #18
0
ファイル: run_minpath.py プロジェクト: misazaa/picrust2
def main():
    '''Command-line entry point: run the MinPath pipeline on the parsed
    arguments and write the resulting pathway tables to args.out_dir.'''

    args = parser.parse_args()

    # Both the input table and the pathway mapfile must exist.
    check_files_exist([args.input, args.map])

    # The --no_regroup flag overrides any regrouping mapfile.
    if args.no_regroup:
        args.regroup_map = None

    # Options shared by both ways of running the pipeline below; only the
    # working directory differs.
    pipeline_opts = dict(inputfile=args.input,
                         mapfile=args.map,
                         regroup_mapfile=args.regroup_map,
                         proc=args.proc,
                         gap_fill=not args.no_gap_fill,
                         per_sequence_contrib=args.per_sequence_contrib,
                         print_cmds=args.print_cmds)

    if args.intermediate:
        # Keep the intermediate files in the requested directory.
        make_output_dir(args.intermediate)

        unstrat_abun, unstrat_cov, strat_abun, strat_cov = \
            run_minpath_pipeline(out_dir=args.intermediate, **pipeline_opts)
    else:
        # Otherwise work in a temporary directory, which is removed once
        # the pipeline has returned.
        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = \
                run_minpath_pipeline(out_dir=temp_dir, **pipeline_opts)

    make_output_dir(args.out_dir)

    # Unstratified tables are always written, with the index labelled as
    # the "pathway" column.
    unstrat_tables = ((unstrat_abun, "path_abun_unstrat.tsv"),
                      (unstrat_cov, "path_cov_unstrat.tsv"))

    for table, filename in unstrat_tables:
        table.to_csv(path_or_buf=path.join(args.out_dir, filename),
                     sep="\t",
                     index_label="pathway")

    # Stratified tables are written (without the index) only if something
    # besides None was returned.
    strat_tables = ((strat_abun, "path_abun_strat.tsv"),
                    (strat_cov, "path_cov_strat.tsv"))

    for table, filename in strat_tables:
        if table is None:
            continue

        table.to_csv(path_or_buf=path.join(args.out_dir, filename),
                     sep="\t",
                     index=False)
コード例 #19
0
ファイル: picrust2_pipeline.py プロジェクト: misazaa/picrust2
def main():
    """Run the full PICRUSt2 pipeline from the command line.

    Steps, in order:
      1. Place the study sequences onto the reference tree; the resulting
         tree is written to ``<output>/out.tre``.
      2. Run hidden-state prediction for every requested trait table plus
         the marker table, writing one ``*_predicted.tsv`` file each (and
         a ``*_predicted_ci.tsv`` file when CIs are enabled).
      3. Run the metagenome pipeline for every trait table (not the
         marker), writing into ``<output>/<func>_metagenome_out``.
      4. Unless ``--no_pathways`` is set, infer pathway abundances and
         coverages and write them to ``<output>/pathways_out``.

    All options are read from ``parser``. The process exits through
    ``sys.exit`` when a requested category is not one of the default
    functional categories.
    """

    args = parser.parse_args()

    # Get start time.
    start_time = time.time()

    # Check that input files exist.
    check_files_exist([args.study_fasta, args.input])

    # Make output folder.
    make_output_dir(args.output)

    out_tree = path.join(args.output, "out.tre")

    # Output tables are only passed through add_descrip_col when the default
    # trait tables are in use.
    use_defaults = args.custom_trait_tables is None

    def _with_descrip(table, map_key):
        # Run table through add_descrip_col (default tables only).
        if use_defaults:
            return add_descrip_col(inputfile=table,
                                   mapfile=default_map[map_key],
                                   in_df=True)
        return table

    def _write_tsv(table, outfile):
        # Write table as tab-delimited text without the index column.
        table.to_csv(path_or_buf=outfile, sep="\t", index=False)

    if use_defaults:

        # Check that specified functional categories are allowed.
        FUNC_TRAIT_OPTIONS = ['COG', 'EC', 'KO', 'PFAM', 'TIGRFAM']
        funcs = args.in_traits.split(",")
        for func in funcs:
            if func not in FUNC_TRAIT_OPTIONS:
                sys.exit("Error - specified category " + func +
                         " is not one of "
                         "the default categories.")

        # Add EC to this set if pathways are to be predicted.
        if "EC" not in funcs and not args.no_pathways:
            funcs.append("EC")

        rxn_func = "EC"

        # Work on a copy: the marker entry added below must not leak into
        # the shared default_tables mapping defined outside this function.
        func_tables = dict(default_tables)

    else:
        funcs = []
        func_tables = {}

        for custom in args.custom_trait_tables.split(","):
            func_id = path.splitext(path.basename(custom))[0]
            funcs.append(func_id)
            func_tables[func_id] = custom

        # The first custom table is the one used for pathway inference.
        # str.split always yields at least one element, so this is safe.
        rxn_func = funcs[0]

    # Append marker as well, since this also needs to be run.
    funcs.append("marker")
    func_tables["marker"] = args.marker_gene_table

    # Methods for discrete trait prediction with CI enabled.
    discrete_set = {'emp_prob', 'mp'}
    ci_setting = bool(args.confidence and args.hsp_method in discrete_set)

    gap_fill_opt = not args.no_gap_fill

    with TemporaryDirectory() as temp_dir:

        print("Placing sequences onto reference tree.")

        place_seqs_pipeline(study_fasta=args.study_fasta,
                            ref_msa=args.ref_msa,
                            tree=args.tree,
                            out_tree=out_tree,
                            threads=args.threads,
                            papara_output=None,
                            out_dir=temp_dir,
                            chunk_size=5000,
                            print_cmds=args.print_cmds)

        print("Finished placing sequences on output tree: " + out_tree)

    # Get predictions for all specified functions and keep track of outfiles.
    predicted_funcs = {}

    for func in funcs:

        # Only output NSTI in 16S table.
        nsti_setting = bool(func == "marker" and args.calculate_NSTI)

        print("Running hidden-state prediction for " + func)

        hsp_table, ci_table = castor_hsp_workflow(
            tree_path=out_tree,
            trait_table_path=func_tables[func],
            hsp_method=args.hsp_method,
            calc_nsti=nsti_setting,
            calc_ci=ci_setting,
            check_input=False,
            num_proc=args.threads,
            ran_seed=args.seed)

        # Add "_nsti" to filename if NSTI values are output.
        if nsti_setting:
            suffix = "_nsti_predicted.tsv"
        else:
            suffix = "_predicted.tsv"

        count_outfile = path.join(args.output, func + suffix)

        # Keep track of output file name for next step of pipeline.
        predicted_funcs[func] = count_outfile

        print("Writing out predicted gene family abundances to " +
              count_outfile)

        hsp_table.to_csv(path_or_buf=count_outfile,
                         index_label="sequence",
                         sep="\t")

        # Output the CI file as well if option set.
        if ci_setting:
            ci_outfile = path.join(args.output, func + "_predicted_ci.tsv")
            print("Writing out predicted gene family CIs to " + ci_outfile)
            ci_table.to_csv(path_or_buf=ci_outfile,
                            index_label="sequence",
                            sep="\t")

    marker_infile = predicted_funcs["marker"]

    # Loop over each function again and run metagenome pipeline.
    for func in funcs:

        if func == "marker":
            continue

        func_infile = predicted_funcs[func]

        func_output_dir = path.join(args.output, func + "_metagenome_out")

        print("Running metagenome pipeline for " + func)

        # Infer metagenome abundances per-sample. Pass arguments to key
        # function and get predicted functions stratified and unstratified
        # by genomes.
        strat_pred, unstrat_pred = run_metagenome_pipeline(
            input_biom=args.input,
            function=func_infile,
            marker=marker_infile,
            out_dir=func_output_dir,
            max_nsti=args.max_nsti,
            min_reads=args.min_reads,
            min_samples=args.min_samples,
            strat_out=args.stratified,
            proc=args.threads,
            output_normfile=True)

        print("Writing metagenome output files for " + func + " to: " +
              func_output_dir)

        # Generate output table filepaths and write out pandas dataframe.
        unstrat_outfile = path.join(func_output_dir,
                                    "pred_metagenome_unstrat.tsv")

        # Turn the index into a regular "function" column before writing.
        unstrat_pred.index.name = "function"
        unstrat_pred.reset_index(inplace=True)

        _write_tsv(_with_descrip(unstrat_pred, func), unstrat_outfile)

        # Write out stratified table only if that option was specified.
        if args.stratified:
            strat_outfile = path.join(func_output_dir,
                                      "pred_metagenome_strat.tsv")
            strat_pred.reset_index(inplace=True)

            _write_tsv(_with_descrip(strat_pred, func), strat_outfile)

    # Infer pathway abundances and coverages unless --no_pathways set.
    if not args.no_pathways:

        if args.stratified:
            metagenome_name = "pred_metagenome_strat.tsv"
        else:
            metagenome_name = "pred_metagenome_unstrat.tsv"

        in_metagenome = path.join(args.output,
                                  rxn_func + "_metagenome_out",
                                  metagenome_name)

        print("Inferring MetaCyc pathways from predicted functions in this "
              "file: " + in_metagenome)

        with TemporaryDirectory() as temp_dir:
            unstrat_abun, unstrat_cov, strat_abun, strat_cov = run_minpath_pipeline(
                inputfile=in_metagenome,
                mapfile=default_pathway_map,
                regroup_mapfile=default_regroup_map,
                proc=args.threads,
                out_dir=temp_dir,
                gap_fill=gap_fill_opt,
                per_sequence_contrib=args.per_sequence_contrib,
                print_cmds=args.print_cmds)

        pathways_out = path.join(args.output, "pathways_out")

        make_output_dir(pathways_out)

        print("Writing predicted pathway abundances and coverages to " +
              pathways_out)

        # Write output files. The unstratified tables have their index moved
        # into a column first; the stratified tables are written as returned.
        unstrat_abun.reset_index(inplace=True)
        _write_tsv(_with_descrip(unstrat_abun, "METACYC"),
                   path.join(pathways_out, "path_abun_unstrat.tsv"))

        unstrat_cov.reset_index(inplace=True)
        _write_tsv(_with_descrip(unstrat_cov, "METACYC"),
                   path.join(pathways_out, "path_cov_unstrat.tsv"))

        # Write stratified output only if something besides None was returned.
        if strat_abun is not None:
            _write_tsv(_with_descrip(strat_abun, "METACYC"),
                       path.join(pathways_out, "path_abun_strat.tsv"))

        if strat_cov is not None:
            _write_tsv(_with_descrip(strat_cov, "METACYC"),
                       path.join(pathways_out, "path_cov_strat.tsv"))

    # Print out elapsed time.
    elapsed_time = time.time() - start_time
    print("Completed PICRUSt2 pipeline in " + "%.2f" % elapsed_time +
          " seconds.")