Ejemplo n.º 1
0
    def test_rare_2_samp(self):
        '''Verify the set of sequences reported as rare by id_rare_seqs when
        a cut-off of 2 samples is applied.'''

        # Load the table referenced by seqtab_biom as a pandas dataframe.
        abun_table = biom_to_pandas_df(biom.load_table(seqtab_biom))

        expected_rare = {"2558860574", "2571042244"}

        # NOTE(review): the positional arguments (1, 2) are presumably the
        # read and sample cut-offs, in that order - confirm against
        # id_rare_seqs.
        observed_rare = set(id_rare_seqs(abun_table, 1, 2))

        self.assertSetEqual(observed_rare, expected_rare)
Ejemplo n.º 2
0
    def test_rare_4_reads(self):
        '''Verify the set of sequences reported as rare by id_rare_seqs when
        a cut-off of 4 reads is applied.'''

        # Load the table referenced by seqtab_biom as a pandas dataframe.
        abun_table = biom_to_pandas_df(biom.load_table(seqtab_biom))

        expected_rare = {"2558860574", "extra"}

        # NOTE(review): the positional arguments (4, 1) are presumably the
        # read and sample cut-offs, in that order - confirm against
        # id_rare_seqs.
        observed_rare = set(id_rare_seqs(abun_table, 4, 1))

        self.assertSetEqual(observed_rare, expected_rare)
    def test_norm_by_marker_copies(self):
        '''Check that norm_by_marker_copies reproduces the expected
        normalized sequence abundance table.'''

        raw_counts = biom_to_pandas_df(biom.load_table(seqtab_biom))

        # Arrange the rows in the same order as the expected table so that
        # the frame comparison below is not thrown off by index ordering.
        ordered_counts = raw_counts.reindex(exp_norm_in.index)

        # norm_filename=None: no output filename is passed to the function.
        observed_norm = norm_by_marker_copies(
            input_seq_counts=ordered_counts,
            input_marker_num=marker_predict_in,
            norm_filename=None)

        # Fails with a detailed diff if the two tables differ.
        pd.testing.assert_frame_equal(observed_norm, exp_norm_in)
def main():
    '''Write one FASTA file per sample with each ASV repeated per count.

    Command-line arguments:
        -f/--fasta   Path to the full FASTA file (required).
        -b/--biom    Path to the BIOM table (required).
        -o/--outdir  Folder to create for the output files (required).

    For every column (sample) of the table a file "<outdir>/<sample>.fasta"
    is written. Each ASV with a count above zero in that sample is written
    int(count) times, with headers of the form ">" + asv + "_" + sample +
    "_" + copy index.

    Exits through sys.exit with a message if the FASTA input is empty.
    '''

    parser = argparse.ArgumentParser(
        description="Creates output FASTA for each sample with each ASV repeated for every count in that sample.",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f", "--fasta", metavar="FASTA", type=str,
                        help="Path to full FASTA file.", required=True)

    parser.add_argument("-b", "--biom", metavar="BIOM", type=str,
                        help="Path to BIOM table.", required=True)

    parser.add_argument("-o", "--outdir", metavar="PATH", type=str,
                        help="Name of folder to make for output files.", required=True)

    args = parser.parse_args()

    in_fasta = read_fasta(args.fasta)

    in_table = biom_to_pandas_df(biom.load_table(args.biom))

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences in file.")

    make_output_dir(args.outdir)

    for sample in in_table.columns:
        sample_outfile = args.outdir + "/" + sample + ".fasta"

        # The context manager guarantees the handle is flushed and closed
        # even if a lookup or write below raises (e.g. an ASV id that is
        # absent from the FASTA input).
        with open(sample_outfile, 'wt') as sample_outfh:
            for asv in in_table.index.values:
                asv_count = in_table.loc[asv, sample]
                if asv_count > 0:
                    # One record per count; the running index keeps the
                    # headers unique within the sample file.
                    for i in range(int(asv_count)):
                        print(">" + asv + "_" + sample + "_" + str(i),
                              file=sample_outfh)
                        print(in_fasta[asv], file=sample_outfh)
Ejemplo n.º 5
0
def run_metagenome_pipeline(input_biom,
                            function,
                            marker,
                            max_nsti,
                            out_dir='metagenome_out',
                            proc=1,
                            output_normfile=False):
    '''Main function to run the full metagenome pipeline.

    Reads the study sequence abundances and the predicted function and
    marker gene tables, filters the predictions by NSTI (when a
    "metadata_NSTI" column is present), restricts all three tables to their
    shared sequence ids, normalizes the study abundances by marker gene
    copies and finally computes the predicted function counts per sample.

    Parameters:
        input_biom: path to the BIOM table of study sequence abundances.
        function: path to a tab-delimited table of predicted function
            abundances with a "sequence" column.
        marker: path to a tab-delimited table of predicted marker gene
            abundances with a "sequence" column.
        max_nsti: rows whose "metadata_NSTI" value is above this are dropped.
        out_dir: output directory, created if needed.
        proc: passed through to funcs_by_sample.
        output_normfile: if true, "seqtab_norm.tsv" inside out_dir is passed
            to norm_by_marker_copies as the normalized table filename.

    Side effects:
        When NSTI values were found, calc_weighted_nsti is called with
        "weighted_nsti.tsv" inside out_dir as its outfile.

    Returns the result of funcs_by_sample, i.e. the predicted metagenomes
    stratified and unstratified by contributing genomes (i.e. taxa).
    '''

    # Read in input table of sequence abundances and convert to pandas df.
    study_seq_counts = biom_to_pandas_df(biom.load_table(input_biom))

    # Read in predicted function and marker gene abundances.
    pred_function = pd.read_table(function, sep="\t", index_col="sequence")
    pred_marker = pd.read_table(marker, sep="\t", index_col="sequence")

    # Sequence ids are compared as strings in the later steps.
    pred_function.index = pred_function.index.astype(str)
    pred_marker.index = pred_marker.index.astype(str)

    # Initialize empty pandas dataframe to contain NSTI values.
    nsti_val = pd.DataFrame()

    # If NSTI column present then remove all rows with value above specified
    # max value. Also, remove NSTI column (in both dataframes).
    # The column is dropped by rebinding rather than with inplace=True: the
    # filtered frames come from boolean-mask indexing, and modifying such a
    # frame in place can trigger pandas' SettingWithCopyWarning.
    if "metadata_NSTI" in pred_function.columns:
        pred_function = pred_function[pred_function['metadata_NSTI'] <= max_nsti]
        nsti_val = pred_function[['metadata_NSTI']]
        pred_function = pred_function.drop('metadata_NSTI', axis=1)

    # NSTI values taken from the marker table replace those taken from the
    # function table when both tables carry the column.
    if "metadata_NSTI" in pred_marker.columns:
        pred_marker = pred_marker[pred_marker['metadata_NSTI'] <= max_nsti]
        nsti_val = pred_marker[['metadata_NSTI']]
        pred_marker = pred_marker.drop('metadata_NSTI', axis=1)

    # Re-order predicted abundance tables to be in same order as study seqs.
    # Also, drop any sequence ids that don't overlap across all dataframes.
    study_seq_counts, pred_function, pred_marker = three_df_index_overlap_sort(
        study_seq_counts, pred_function, pred_marker)

    # Create output directory if it does not already exist.
    make_output_dir(out_dir)

    # Create normalized sequence abundance filename if outfile specified.
    norm_output = path.join(out_dir, "seqtab_norm.tsv") if output_normfile else None

    # Normalize input study sequence abundances by predicted abundance of
    # marker genes and output normalized table if specified.
    study_seq_counts = norm_by_marker_copies(input_seq_counts=study_seq_counts,
                                             input_marker_num=pred_marker,
                                             norm_filename=norm_output)

    # If NSTI column input then output weighted NSTI values. The return
    # value is not needed here; the call is made for the file it is given.
    if not nsti_val.empty:
        weighted_nsti_out = path.join(out_dir, "weighted_nsti.tsv")
        calc_weighted_nsti(seq_counts=study_seq_counts,
                           nsti_input=nsti_val,
                           outfile=weighted_nsti_out)

    # Get predicted function counts by sample, stratified by contributing
    # genomes and also separately unstratified.
    return funcs_by_sample(input_seq_counts=study_seq_counts,
                           input_function_num=pred_function,
                           proc=proc)