Example #1
import pandas as pd
from biom import Table

# shogun_parse_module_table, shogun_parse_pathway_table and
# shogun_parse_enzyme_table are parser helpers defined elsewhere in the
# same module.


def import_shogun_biom(f,
                       annotation_table=None,
                       annotation_type=None,
                       names_to_taxonomy=False):
    import_funcs = {
        'module': shogun_parse_module_table,
        'pathway': shogun_parse_pathway_table,
        'enzyme': shogun_parse_enzyme_table
    }

    table = pd.read_csv(f, sep='\t', index_col=0)

    bt = Table(table.values,
               observation_ids=list(map(str, table.index)),
               sample_ids=list(map(str, table.columns)))

    if names_to_taxonomy:
        metadata = {
            x: {
                'taxonomy': x.split(';')
            }
            for x in bt.ids(axis='observation')
        }
        bt.add_metadata(metadata, axis='observation')

    if annotation_table is not None:
        metadata = import_funcs[annotation_type](annotation_table)
        bt.add_metadata(metadata, axis='observation')

    return bt
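
A minimal usage sketch (the profile file name below is hypothetical):

# Load a SHOGUN taxatable and attach per-feature taxonomy parsed from the
# semicolon-delimited feature names.
bt = import_shogun_biom('shogun_taxatable.tsv', names_to_taxonomy=True)
print(bt.shape)  # (n_observations, n_samples)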
Example #2
    def rm_sparse_obs(self,
                      prevalence_thres: float = 0.05,
                      abundance_thres: float = 0.01) -> "Otu":
        """
        Remove observations with prevalence < `prevalence_thres` and abundance < `abundance_thres`

        Parameters
        ----------
        prevalence_thres : float
            Minimum fraction of samples the observation must be present in in order to be accepted
        abundance_thres : float
            Minimum observation count fraction in a sample needed in order to be accepted

        Returns
        -------
        Otu
            Otu instance with bad observations removed
        """
        filt_fun = (lambda val, *_:
                    (val.astype(int).astype(bool).mean()) >= prevalence_thres)
        otu_dense_obs = self.otu_data.filter(filt_fun,
                                             axis="observation",
                                             inplace=False)
        otu_df = otu_dense_obs.to_dataframe()
        if otu_df.apply(pd.api.types.is_sparse).any():
            otu_rel_abund = (otu_df / otu_df.sum(axis=0)).sparse.to_dense()
        else:
            otu_rel_abund = otu_df / otu_df.sum(axis=0)
        ind_above_thres = otu_rel_abund.index[(otu_rel_abund >
                                               abundance_thres).any(axis=1)]
        new_otu = self.otu_data.filter(ind_above_thres,
                                       axis="observation",
                                       inplace=False)
        ind_below_thres = set(
            self.otu_data.ids("observation")) - set(ind_above_thres)
        otu_sparse_obs = self.otu_data.filter(ind_below_thres,
                                              axis="observation",
                                              inplace=False)
        new_row = Table(
            otu_sparse_obs.sum(axis="sample"),
            ["otu_merged"],
            self.otu_data.ids(axis="sample"),
        )
        tax_level = self.tax_level
        random_row_metadata = dict(
            self.otu_data.metadata(axis="observation")[0])
        new_row.add_metadata(
            {
                "otu_merged": {
                    **random_row_metadata,
                    **Lineage("Unclassified").to_dict(tax_level),
                }
            },
            axis="observation",
        )
        final_otu = new_otu.concat([new_row], axis="observation")
        return Otu(final_otu)
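
A hedged usage sketch (assumes an existing `Otu` instance named `otu`; the
threshold values are illustrative):

# Keep observations present in at least 10% of samples that also reach 1%
# relative abundance in at least one sample; merge the rest into 'otu_merged'.
filtered_otu = otu.rm_sparse_obs(prevalence_thres=0.10, abundance_thres=0.01)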
Example #3
import biom

# round_biom is a helper (defined elsewhere in the module) that rounds a
# table's values to integers in place.


def collapse_biom(table: biom.Table, mapping: dict, normalize=False):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    normalize : bool, optional
        Whether to normalize per-target counts by the number of targets per
        source.

    Returns
    -------
    biom.Table
        Collapsed BIOM table.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # filter table features
    table = table.filter(lambda data, id_, md: id_ in mapping,
                         axis='observation',
                         inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata({k: dict(part=v)
                        for k, v in mapping.items()},
                       axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False,
                  one_to_many=True,
                  axis='observation',
                  one_to_many_mode=('divide' if normalize else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda id_, md: zip(md['part'], md['part']),
                           **kwargs)

    # round to integers
    if normalize:
        round_biom(table)

    # clean up
    table.del_metadata(keys=['Path'])
    return table
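
A toy sketch of the many-to-many behavior (the table and mapping are made-up
data; `round_biom` is only invoked when `normalize=True`):

import numpy as np
from biom import Table

# Two genes observed across two samples.
toy = Table(np.array([[4, 2], [6, 0]]), ['G1', 'G2'], ['S1', 'S2'])

# G1 maps to two pathways, G2 to one.
mapping = {'G1': ['P1', 'P2'], 'G2': ['P2']}

collapsed = collapse_biom(toy, mapping)
# In the default 'add' mode G1's full counts are credited to both P1 and P2;
# with normalize=True they would instead be split evenly between the two.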
Example #4
from biom import Table


def rename_deblur_biom(biom, name_stub='deblur', metadata_name='deblurred_seq'):
    seqs = biom.ids(axis='observation')

    seqnames = ['{0}{1}'.format(name_stub, x) for x in range(len(seqs))]

    seq_metadata = {seqname: {metadata_name: seq} for seq, seqname in zip(seqs, seqnames)}

    renamed_biom = Table(biom.matrix_data,
                         seqnames,
                         biom.ids(axis='sample'),
                         biom.metadata(axis='observation'),
                         biom.metadata(axis='sample'),
                         table_id=biom.table_id + ' renamed')

    renamed_biom.add_metadata(seq_metadata, axis='observation')

    return renamed_biom
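
A hedged usage sketch (`deblur_table` stands in for a deblur feature table
whose observation IDs are the sequences themselves):

renamed = rename_deblur_biom(deblur_table, name_stub='asv')
# Features are now named 'asv0', 'asv1', ... and each carries its original
# sequence under the 'deblurred_seq' metadata key.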
Example #5
import biom


def biom_add_metacol(table: biom.Table, dic, name, missing=''):
    """Add a metadata column to a table in place based on a dictionary.

    Parameters
    ----------
    table : biom.Table
        Table to add metadata column.
    dic : dict
        Metadata column (feature-to-value mapping).
    name : str
        Metadata column name.
    missing : any type, optional
        Default value if not found in dictionary.
    """
    metadata = {
        x: {
            name: dic.get(x, missing)
        }
        for x in table.ids('observation')
    }
    table.add_metadata(metadata, axis='observation')
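
A quick sketch with toy data (the function mutates the table in place):

import numpy as np
from biom import Table

t = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
biom_add_metacol(t, {'O1': 'Bacteria'}, 'kingdom', missing='Unknown')
# 'O2' is absent from the dictionary, so it receives the `missing` value.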
Example #6
import warnings

import biom
import pandas as pd

# _hash_seq is a helper (defined elsewhere in the module) that hashes a
# sequence string into a feature ID.


def trim_dada2_posthoc(
        table: biom.Table,
        representative_sequences: pd.Series,
        trim_length: int = 0,
        hashed_feature_ids: bool = True) -> (biom.Table, pd.Series):
    """
    Trims ASVs generated by DADA2 to a standard length

    Parameters
    ----------
    table : biom.Table
        The feature table
    representative_sequences : pd.Series
        The sequences which correspond to the ASV table
    trim_length : int
        The length to trim the ASVs. If the length is 0, the minimum sequence
        length will be used.
    hashed_feature_ids : bool
        Whether feature and sequence IDs should be hashed.
    """

    # Trims the sequences
    seq_length = representative_sequences.apply(lambda x: len(x))

    if trim_length == 0:
        trim_length = seq_length.min()

    if (seq_length < trim_length).any():
        warnings.warn(
            "There are ASVs shorter than the trim length. "
            "These sequences will be discarded.", UserWarning)
    rep_seqs = representative_sequences.astype(str)
    rep_seqs = rep_seqs.loc[seq_length >= trim_length].copy()
    rep_seqs = pd.DataFrame(data=[rep_seqs.apply(lambda x: x[:trim_length])],
                            index=['sequence']).T

    # Collapses the table based on the trimmed sequences
    table.filter(lambda v, id_, md: id_ in rep_seqs.index,
                 axis='observation',
                 inplace=True)
    table.add_metadata(
        rep_seqs.loc[table.ids(axis='observation')].to_dict(orient='index'),
        axis='observation')

    table2 = table.collapse(lambda id_, md: md['sequence'],
                            norm=False,
                            axis='observation')

    seqs2 = rep_seqs.drop_duplicates()['sequence'].copy()

    if hashed_feature_ids:
        table2.update_ids(
            {seq_: _hash_seq(seq_)
             for seq_ in table2.ids(axis='observation')},
            axis='observation',
            inplace=True)
        seqs2.rename({id_: _hash_seq(seq_)
                      for id_, seq_ in seqs2.items()},
                     inplace=True)
    else:
        seqs2.rename({id_: seq_ for id_, seq_ in seqs2.items()}, inplace=True)

    return table2, seqs2
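
A hedged usage sketch (assumes a DADA2 feature table `asv_table` and its
representative sequences `rep_seqs`, a `pd.Series` indexed by feature ID;
note that the input table is filtered in place):

# Trim every ASV to the shortest observed sequence length, hashing the
# resulting IDs.
trimmed_table, trimmed_seqs = trim_dada2_posthoc(asv_table, rep_seqs)

# Or trim to a fixed length and keep the trimmed sequences as the new IDs.
trimmed_table, trimmed_seqs = trim_dada2_posthoc(
    asv_table, rep_seqs, trim_length=150, hashed_feature_ids=False)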
Example #7
def main():
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis='observation')

    if opts.input_count_fp is None:
        # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            ['16S', opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust',
                                 'data', precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if ext == '.gz':
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    # Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iter(axis='observation'):
        ids.append(str(x[1]))

    ob_id = count_table.ids(axis='observation')[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.exists(x, axis='sample'):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis='observation'))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))
        except ValueError:
            raise ValueError, \
                "Invalid type passed as copy number for OTU ID %s. " \
                "Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation')

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])
    normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation')

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation')

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
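
The copy-number normalization at the heart of this script, distilled into a
standalone sketch (toy counts and hypothetical copy numbers; runs under
Python 3 with biom-format installed):

import numpy as np
from biom import Table

t = Table(np.array([[10, 20], [8, 4]]), ['OTU1', 'OTU2'], ['S1', 'S2'])
t.add_metadata({'OTU1': {'CopyNumber': 2}, 'OTU2': {'CopyNumber': 4}},
               axis='observation')

# Divide each observation's counts by its 16S copy number.
normalized = t.transform(
    lambda v, i, md: v / float(md['CopyNumber']),
    axis='observation', inplace=False)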
Example #8
    def test_import_shogun_biom(self):
        shogun_table = ('#OTU ID\t1450\t2563\n'
                        'k__Archaea\t26\t25\n'
                        'k__Archaea;p__Crenarchaeota\t3\t5\n'
                        'k__Archaea;p__Crenarchaeota;c__Thermoprotei\t1\t25\n')

        exp_biom = Table(np.array([[26, 25],
                                   [3, 5],
                                   [1, 25]]),
                         ['k__Archaea',
                          'k__Archaea;p__Crenarchaeota',
                          'k__Archaea;p__Crenarchaeota;c__Thermoprotei'],
                         ['1450',
                          '2563'])

        obs_biom = import_shogun_biom(StringIO(shogun_table))
        self.assertEqual(exp_biom, obs_biom)

        tax_metadata = {'k__Archaea': {
                            'taxonomy': ['k__Archaea']},
                        'k__Archaea;p__Crenarchaeota': {
                            'taxonomy': ['k__Archaea',
                                         'p__Crenarchaeota']},
                        'k__Archaea;p__Crenarchaeota;c__Thermoprotei': {
                            'taxonomy': ['k__Archaea',
                                         'p__Crenarchaeota',
                                         'c__Thermoprotei']}}
        exp_biom_tax = Table(np.array([[26, 25],
                                       [3, 5],
                                       [1, 25]]),
                             ['k__Archaea',
                              'k__Archaea;p__Crenarchaeota',
                              'k__Archaea;p__Crenarchaeota;c__Thermoprotei'],
                             ['1450',
                              '2563'])
        exp_biom_tax.add_metadata(tax_metadata, axis='observation')
        obs_biom_tax = import_shogun_biom(
            StringIO(shogun_table), names_to_taxonomy=True)

        self.assertEqual(exp_biom_tax, obs_biom_tax)

        # test modules
        module_table = ('#MODULE ID\t1450\t2563\n'
                        'M00017\t26\t25\n'
                        'M00018\t3\t5\n')

        exp_m_biom = Table(np.array([[26, 25],
                                     [3, 5]]),
                           ['M00017', 'M00018'],
                           ['1450', '2563'])
        exp_m_biom.add_metadata(self.mod_md, axis='observation')
        obs_m_biom = import_shogun_biom(
            StringIO(module_table), annotation_table=StringIO(self.modules),
            annotation_type='module')

        self.assertEqual(exp_m_biom, obs_m_biom)

        # test pathways
        path_table = ('#PATHWAY ID\t1450\t2563\n'
                      '1.4.1  With NAD+ or NADP+ as acceptor\t26\t25\n'
                      '1.4.3  With oxygen as acceptor\t3\t5\n')

        exp_p_biom = Table(np.array([[26, 25],
                                     [3, 5]]),
                           ['1.4.1  With NAD+ or NADP+ as acceptor',
                            '1.4.3  With oxygen as acceptor'],
                           ['1450', '2563'])

        exp_p_biom.add_metadata(self.path_md, axis='observation')
        obs_p_biom = import_shogun_biom(
            StringIO(path_table), annotation_table=StringIO(self.pathways),
            annotation_type='pathway')

        self.assertEqual(exp_p_biom, obs_p_biom)

        # test enzymes
        enzyme_table = ('#KEGG ID\t1450\t2563\n'
                        'K00001\t26\t25\n'
                        'K00002\t3\t5\n'
                        'K00003\t1\t25\n')
        exp_e_biom = Table(np.array([[26, 25],
                                     [3, 5],
                                     [1, 25]]),
                           ['K00001',
                            'K00002',
                            'K00003'],
                           ['1450', '2563'])
        exp_e_biom.add_metadata(self.enz_md, axis='observation')
        obs_e_biom = import_shogun_biom(
            StringIO(enzyme_table), annotation_table=StringIO(self.enzymes),
            annotation_type='enzyme')

        self.assertEqual(exp_e_biom, obs_e_biom)

        # test empty
        empty_table = ('#KEGG ID\t1450\t2563\n')
        exp_empty_biom = Table(np.zeros((0, 2)),
                               [],
                               ['1450', '2563'])
        obs_empty_biom = import_shogun_biom(
            StringIO(empty_table), annotation_table=StringIO(self.enzymes),
            annotation_type='enzyme')

        self.assertEqual(exp_empty_biom, obs_empty_biom)
Example #9
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)

    ids_to_load = otu_table.ids(axis="observation")

    if opts.input_count_fp is None:
        # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = "_".join(["16S", opts.gg_version, "precalculated.tab.gz"])
        input_count_table = join(get_picrust_project_dir(), "picrust", "data", precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        print "Loading trait table: ", input_count_table

    ext = path.splitext(input_count_table)[1]

    if ext == ".gz":
        count_table_fh = gzip.open(input_count_table, "rb")
    else:
        count_table_fh = open(input_count_table, "U")

    if opts.load_precalc_file_in_biom:
        count_table = load_table(count_table_fh)
    else:
        count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)

    # Need to only keep data relevant to our otu list
    ids = []
    for x in otu_table.iter(axis="observation"):
        ids.append(str(x[1]))

    ob_id = count_table.ids(axis="observation")[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.exists(x, axis="sample"):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis="observation"))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))

        except ValueError:
            raise ValueError, "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (value)
        if value < 1:
            raise ValueError, "Copy numbers must be greater than or equal to 1."

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis="observation")

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])

    normalized_table = filtered_otu_table.transform(metadata_norm, axis="observation")

    # move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, "observation")

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
Example #10
import biom

# round_biom is a helper (defined elsewhere in the module) that rounds a
# table's values to integers in place.


def collapse_biom(table: biom.Table, mapping: dict, divide=False, field=None):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    divide : bool, optional
        Whether to divide per-target counts by the number of targets per
        source.
    field : int, optional
        Index of field to be collapsed in a stratified table.

    Returns
    -------
    biom.Table
        Collapsed BIOM table.

    Raises
    ------
    ValueError
        If the field index is not present in a feature ID.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # generate metadata
    metadata = {}
    for id_ in table.ids('observation'):
        feature = id_
        if field:
            fields = feature.split('|')
            try:
                feature = fields[field]
            except IndexError:
                raise ValueError(
                    f'Feature "{feature}" has less than {field + 1} fields.')
        if feature not in mapping:
            continue
        targets = []
        for target in mapping[feature]:
            if field:
                fields[field] = target
                target = '|'.join(fields)
            targets.append(target)
        metadata[id_] = dict(part=targets)

    # filter table features
    table = table.filter(lambda data, id_, md: id_ in metadata,
                         axis='observation',
                         inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata(metadata, axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False,
                  one_to_many=True,
                  axis='observation',
                  one_to_many_mode=('divide' if divide else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda _, md: zip(md['part'], md['part']), **kwargs)

    # round to integers
    if divide:
        round_biom(table)

    # clean up
    table.del_metadata(keys=['Path'])
    return table
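
A toy sketch of the stratified (`field`) mode, with made-up feature IDs of
the form 'host|gene':

import numpy as np
from biom import Table

toy = Table(np.array([[5, 0], [3, 7]]),
            ['HostA|G1', 'HostB|G1'], ['S1', 'S2'])

# Collapse the second field (index 1) of each 'host|gene' feature ID.
mapping = {'G1': ['P1', 'P2']}
collapsed = collapse_biom(toy, mapping, field=1)
# Yields features 'HostA|P1', 'HostA|P2', 'HostB|P1', 'HostB|P2', keeping
# the host stratum intact while the gene field is mapped to pathways.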