Ejemplo n.º 1
0
    def test_unstrat_default_pipeline(self):
        '''Run the default pipeline on an unstratified input table and
        compare the unstratified abundance and coverage tables that come
        back against the expected tables on disk.'''

        # Only the two unstratified tables are checked below; the two
        # stratified return values are unpacked but not inspected.
        with TemporaryDirectory() as tmp_out:
            obs_abun, obs_cov, _strat_abun, _strat_cov = pathway_pipeline(
                in_metagenome_unstrat,
                default_pathway_map,
                proc=1,
                out_dir=tmp_out,
                run_minpath=True,
                coverage=True,
                regroup_mapfile=default_regroup_map,
                gap_fill_on=True,
                per_sequence_contrib=False,
                print_cmds=False)

        # Load the expected tables (abundance first, then coverage), both
        # indexed by pathway.
        expected_tables = [
            pd.read_csv(tsv, sep="\t", index_col="pathway")
            for tsv in (exp_abun_unstrat_file, exp_cov_unstrat_file)]

        # NOTE(review): check_less_precise is deprecated in pandas >= 1.1 in
        # favour of rtol/atol -- confirm the supported pandas version before
        # upgrading.
        for expected, observed in zip(expected_tables, (obs_abun, obs_cov)):
            pd.testing.assert_frame_equal(expected,
                                          observed,
                                          check_like=True,
                                          check_less_precise=True)
    def test_strat_default_pipeline(self):
        '''Run the default pipeline on a stratified input table with
        wide_table=True. Checks the unstratified abundance and coverage
        tables as well as the stratified abundance table against the
        expected tables on disk.'''

        # Seven values are returned; only the first three are compared below.
        with TemporaryDirectory() as tmp_out:
            (obs_abun_unstrat,
             obs_cov_unstrat,
             obs_abun_strat,
             _obs_cov_strat,
             _abun_by_seq,
             _cov_by_seq,
             _abun_unstrat_per_seq) = pathway_pipeline(
                in_metagenome_strat,
                default_pathway_map,
                proc=1,
                out_dir=tmp_out,
                run_minpath=True,
                coverage=True,
                regroup_mapfile=default_regroup_map,
                gap_fill_on=True,
                per_sequence_contrib=False,
                verbose=True,
                wide_table=True)

        # Expected tables. The unstratified ones are indexed by pathway; the
        # stratified one keeps its default integer index.
        want_abun_unstrat = pd.read_csv(exp_abun_unstrat_file, sep="\t",
                                        index_col="pathway")
        want_cov_unstrat = pd.read_csv(exp_cov_unstrat_file, sep="\t",
                                       index_col="pathway")
        want_abun_strat = pd.read_csv(exp_abun_strat_file, sep="\t")

        # Row order of the stratified output can differ between versions, so
        # put both tables in a canonical order and renumber the rows.
        row_order = ['pathway', 'sequence']
        obs_abun_strat = obs_abun_strat.sort_values(
            row_order).reset_index(drop=True)
        want_abun_strat = want_abun_strat.sort_values(
            row_order).reset_index(drop=True)

        # NOTE(review): check_less_precise is deprecated in pandas >= 1.1 in
        # favour of rtol/atol -- confirm the supported pandas version before
        # upgrading.
        comparisons = ((want_abun_unstrat, obs_abun_unstrat),
                       (want_cov_unstrat, obs_cov_unstrat),
                       (want_abun_strat, obs_abun_strat))

        for expected, observed in comparisons:
            pd.testing.assert_frame_equal(expected,
                                          observed,
                                          check_like=True,
                                          check_less_precise=True)
    def test_strat_per_genome_pipeline_strat_input(self):
        '''Run the pipeline on a stratified input table with
        per_sequence_contrib=True (per-sequence abundance and function
        tables supplied). Checks the unstratified abundance, per-sequence
        unstratified abundance, unstratified coverage, and stratified
        abundance and coverage tables against the expected tables.'''

        # Seven values are returned; the two "by_seq" tables are unpacked
        # but not compared in this test.
        with TemporaryDirectory() as tmp_out:
            (obs_abun_unstrat,
             obs_cov_unstrat,
             obs_abun_strat,
             obs_cov_strat,
             _abun_by_seq,
             _cov_by_seq,
             obs_abun_unstrat_per_seq) = pathway_pipeline(
                in_metagenome_strat_per_seq,
                default_pathway_map,
                proc=1,
                out_dir=tmp_out,
                run_minpath=True,
                coverage=True,
                regroup_mapfile=default_regroup_map,
                gap_fill_on=True,
                per_sequence_contrib=True,
                per_sequence_abun=in_per_seq_abun,
                per_sequence_function=in_per_seq_func,
                verbose=True)

        # Expected tables. Unstratified ones are indexed by pathway; the
        # stratified ones keep their default integer index.
        want_abun_unstrat = pd.read_csv(exp_abun_unstrat_per_genome_file,
                                        sep="\t", index_col="pathway")
        want_abun_unstrat_per_seq = pd.read_csv(
            exp_abun_unstrat_per_genome_per_seq_file,
            sep="\t", index_col="pathway")
        want_cov_unstrat = pd.read_csv(exp_cov_unstrat_per_genome_file,
                                       sep="\t", index_col="pathway")
        want_abun_strat = pd.read_csv(exp_abun_strat_per_genome_file,
                                      sep="\t")
        want_cov_strat = pd.read_csv(exp_cov_strat_per_genome_file, sep="\t")

        # Row order of the stratified output can differ between versions, so
        # put every stratified table in a canonical order and renumber rows.
        row_order = ['function', 'taxon']
        obs_abun_strat = obs_abun_strat.sort_values(
            row_order).reset_index(drop=True)
        want_abun_strat = want_abun_strat.sort_values(
            row_order).reset_index(drop=True)
        obs_cov_strat = obs_cov_strat.sort_values(
            row_order).reset_index(drop=True)
        want_cov_strat = want_cov_strat.sort_values(
            row_order).reset_index(drop=True)

        # NOTE(review): check_less_precise is deprecated in pandas >= 1.1 in
        # favour of rtol/atol -- confirm the supported pandas version before
        # upgrading.
        comparisons = ((want_abun_unstrat, obs_abun_unstrat),
                       (want_abun_unstrat_per_seq, obs_abun_unstrat_per_seq),
                       (want_cov_unstrat, obs_cov_unstrat),
                       (want_abun_strat, obs_abun_strat),
                       (want_cov_strat, obs_cov_strat))

        for expected, observed in comparisons:
            pd.testing.assert_frame_equal(expected,
                                          observed,
                                          check_like=True,
                                          check_less_precise=True)
Ejemplo n.º 4
0
def main():
    '''Command-line entry point: parse arguments, run pathway_pipeline and
    write the returned tables as gzipped TSV files into args.out_dir.

    Intermediate files go to args.intermediate when it is set; otherwise a
    temporary directory is used for the duration of the pipeline call.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    # The CLI exposes these as "turn off" flags; the pipeline wants the
    # positive form.
    fill_gaps = not args.no_gap_fill
    use_minpath = not args.skip_minpath

    def run_pipeline(work_dir):
        '''Call pathway_pipeline with intermediate files in work_dir.'''
        return pathway_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=args.regroup_map,
            proc=args.proc,
            out_dir=work_dir,
            run_minpath=use_minpath,
            coverage=args.coverage,
            gap_fill_on=fill_gaps,
            no_regroup=args.no_regroup,
            per_sequence_contrib=args.per_sequence_contrib,
            per_sequence_abun=args.per_sequence_abun,
            per_sequence_function=args.per_sequence_function,
            print_cmds=args.print_cmds)

    # If intermediate output directory set then create and output there.
    # Otherwise make a temporary directory for the intermediate files.
    if args.intermediate:
        make_output_dir(args.intermediate)
        tables = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            tables = run_pipeline(temp_dir)

    unstrat_abun, unstrat_cov, strat_abun, strat_cov = tables

    make_output_dir(args.out_dir)

    def write_table(table, filename, **index_opts):
        '''Write table as a gzipped, tab-separated file in args.out_dir.'''
        destination = path.join(args.out_dir, filename)
        table.to_csv(path_or_buf=destination, sep="\t",
                     compression="gzip", **index_opts)

    # The unstratified abundance table is always written.
    write_table(unstrat_abun, "path_abun_unstrat.tsv.gz",
                index_label="pathway")

    if args.coverage:
        write_table(unstrat_cov, "path_cov_unstrat.tsv.gz",
                    index_label="pathway")

    # Write stratified output only if something besides None was returned.
    if strat_abun is not None:
        write_table(strat_abun, "path_abun_strat.tsv.gz", index=False)

    if args.coverage and strat_cov is not None:
        write_table(strat_cov, "path_cov_strat.tsv.gz", index=False)
Ejemplo n.º 5
0
def main():
    '''Command-line entry point: parse arguments, run pathway_pipeline and
    write whichever of the returned tables apply as gzipped TSV files into
    args.out_dir.

    Intermediate files go to args.intermediate when it is set; otherwise a
    temporary directory is used for the duration of the pipeline call.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    # The CLI exposes these as "turn off" flags; the pipeline wants the
    # positive form.
    fill_gaps = not args.no_gap_fill
    use_minpath = not args.skip_minpath

    def run_pipeline(work_dir):
        '''Call pathway_pipeline with intermediate files in work_dir.'''
        return pathway_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=args.regroup_map,
            proc=args.processes,
            out_dir=work_dir,
            run_minpath=use_minpath,
            coverage=args.coverage,
            gap_fill_on=fill_gaps,
            no_regroup=args.no_regroup,
            per_sequence_contrib=args.per_sequence_contrib,
            per_sequence_abun=args.per_sequence_abun,
            per_sequence_function=args.per_sequence_function,
            wide_table=args.wide_table,
            print_cmds=args.print_cmds)

    # If intermediate output directory set then create and output there.
    # Otherwise make a temporary directory for the intermediate files.
    if args.intermediate:
        make_output_dir(args.intermediate)
        tables = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            tables = run_pipeline(temp_dir)

    (unstrat_abun,
     unstrat_cov,
     strat_abun,
     strat_cov,
     path_abun_by_seq,
     path_cov_by_seq,
     unstrat_abun_per_seq) = tables

    make_output_dir(args.out_dir)

    def write_table(table, filename, **index_opts):
        '''Write table as a gzipped, tab-separated file in args.out_dir.'''
        destination = path.join(args.out_dir, filename)
        table.to_csv(path_or_buf=destination, sep="\t",
                     compression="gzip", **index_opts)

    # Write output files. The unstratified abundance table will always be
    # written, but the other files will only be written if applicable.
    write_table(unstrat_abun, "path_abun_unstrat.tsv.gz",
                index_label="pathway")

    if args.coverage:
        write_table(unstrat_cov, "path_cov_unstrat.tsv.gz",
                    index_label="pathway")

    # Stratified tables are named "strat" in wide format and "contrib"
    # otherwise.
    if strat_abun is not None:
        abun_name = ("path_abun_strat.tsv.gz" if args.wide_table
                     else "path_abun_contrib.tsv.gz")
        write_table(strat_abun, abun_name, index=False)

    if args.coverage and strat_cov is not None:
        cov_name = ("path_cov_strat.tsv.gz" if args.wide_table
                    else "path_cov_contrib.tsv.gz")
        write_table(strat_cov, cov_name, index=False)

    # Per-sequence prediction tables keep their index, labelled "sequence".
    if path_abun_by_seq is not None:
        write_table(path_abun_by_seq, "path_abun_predictions.tsv.gz",
                    index=True, index_label="sequence")

    if args.coverage and path_cov_by_seq is not None:
        write_table(path_cov_by_seq, "path_cov_predictions.tsv.gz",
                    index=True, index_label="sequence")

    if unstrat_abun_per_seq is not None:
        write_table(unstrat_abun_per_seq, "path_abun_unstrat_per_seq.tsv.gz",
                    index_label="pathway")
Ejemplo n.º 6
0
    def test_strat_default_pipeline(self):
        '''Run the default pipeline on a stratified input table. Checks the
        unstratified abundance and coverage tables and the stratified
        abundance table against the expected tables on disk.'''

        # Four values are returned; the stratified coverage table is
        # unpacked but not compared in this test.
        with TemporaryDirectory() as tmp_out:
            (obs_abun_unstrat,
             obs_cov_unstrat,
             obs_abun_strat,
             _obs_cov_strat) = pathway_pipeline(
                in_metagenome_strat,
                default_pathway_map,
                proc=1,
                out_dir=tmp_out,
                run_minpath=True,
                coverage=True,
                regroup_mapfile=default_regroup_map,
                gap_fill_on=True,
                per_sequence_contrib=False,
                print_cmds=False)

        # Expected tables. The unstratified ones are indexed by pathway; the
        # stratified one keeps its default integer index.
        want_abun_unstrat = pd.read_csv(exp_abun_unstrat_file, sep="\t",
                                        index_col="pathway")
        want_cov_unstrat = pd.read_csv(exp_cov_unstrat_file, sep="\t",
                                       index_col="pathway")
        want_abun_strat = pd.read_csv(exp_abun_strat_file, sep="\t")

        # Row order of the stratified output can differ between versions, so
        # put both tables in a canonical order and renumber the rows.
        row_order = ['pathway', 'sequence']
        obs_abun_strat = obs_abun_strat.sort_values(
            row_order).reset_index(drop=True)
        want_abun_strat = want_abun_strat.sort_values(
            row_order).reset_index(drop=True)

        # All comparisons use reduced precision; for the stratified table
        # this is needed because the HUMAnN2 output used as the expected
        # abundances is not rounded.
        # NOTE(review): check_less_precise is deprecated in pandas >= 1.1 in
        # favour of rtol/atol -- confirm the supported pandas version before
        # upgrading.
        comparisons = ((want_abun_unstrat, obs_abun_unstrat),
                       (want_cov_unstrat, obs_cov_unstrat),
                       (want_abun_strat, obs_abun_strat))

        for expected, observed in comparisons:
            pd.testing.assert_frame_equal(expected,
                                          observed,
                                          check_like=True,
                                          check_less_precise=True)