Ejemplo n.º 1
0
def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table,
                             perc_identity: float, threads: int = 1
                             ) -> (biom.Table, DNAFASTAFormat):
    """Cluster features de novo with ``vsearch --cluster_size``.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Sequences to cluster; its path is handed to ``_fasta_with_sizes``.
    table : biom.Table
        Feature table whose observations are collapsed into clusters.
    perc_identity : float
        Value passed to vsearch as ``--id``.
    threads : int, optional
        Value passed to vsearch as ``--threads``.

    Returns
    -------
    (biom.Table, DNAFASTAFormat)
        The collapsed table and the centroid sequences written by vsearch.
    """
    centroids = DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as sized_fasta, \
            tempfile.NamedTemporaryFile() as uc_file:
        # presumably writes `sequences` annotated with abundances taken from
        # `table` -- confirm against _fasta_with_sizes
        _fasta_with_sizes(str(sequences), sized_fasta.name, table)

        vsearch_args = ['vsearch']
        vsearch_args += ['--cluster_size', sized_fasta.name]
        vsearch_args += ['--id', str(perc_identity)]
        vsearch_args += ['--centroids', str(centroids)]
        vsearch_args += ['--uc', uc_file.name]
        # ensures no lowercase DNA chars
        vsearch_args += ['--qmask', 'none']
        vsearch_args += ['--xsize']
        vsearch_args += ['--threads', str(threads)]
        run_command(vsearch_args)

        # Rewind our handle before the .uc output is parsed.
        uc_file.seek(0)
        collapse_f = _collapse_f_from_sqlite(_uc_to_sqlite(uc_file))

    clustered_table = table.collapse(collapse_f,
                                     norm=False,
                                     min_group_size=1,
                                     axis='observation',
                                     include_collapsed_metadata=False)
    return clustered_table, centroids
Ejemplo n.º 2
0
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Collapse one axis of a table by the values of a metadata column.

    Parameters
    ----------
    table : biom.Table
        Table to group; must not be empty.
    axis : str
        Axis to group. ``'feature'`` is translated to biom's
        ``'observation'``; any other value is passed to biom unchanged.
    metadata : qiime2.CategoricalMetadataColumn
        Column supplying the group value for every ID on the axis.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        Grouped table, ordered by first appearance of each group value.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    axis_ids = table.ids(axis=biom_axis)
    metadata = _munge_metadata_column(metadata, axis_ids, axis)

    def _group_of(axis_id, _):
        return metadata.get_value(axis_id)

    collapsed = table.collapse(_group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    group_order = metadata.to_series().unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
Ejemplo n.º 3
0
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Group a table's samples or features by a categorical metadata column.

    The IDs on the chosen axis are mapped to their value in ``metadata`` and
    the table is collapsed with the function registered under ``mode`` in
    ``_mode_lookup``. A ``ValueError`` is raised for an empty table. The axis
    name ``'feature'`` is passed to biom as ``'observation'``; other axis
    names are forwarded as given.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = axis
    if axis == 'feature':
        biom_axis = 'observation'

    metadata = _munge_metadata_column(
        metadata, table.ids(axis=biom_axis), axis)

    collapse_options = dict(collapse_f=_mode_lookup[mode],
                            axis=biom_axis,
                            norm=False,
                            include_collapsed_metadata=False)
    result = table.collapse(
        lambda axis_id, _: metadata.get_value(axis_id), **collapse_options)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    first_seen = metadata.to_series().unique()
    return result.sort_order(first_seen, axis=biom_axis)
Ejemplo n.º 4
0
def collapse_biom(table: biom.Table, mapping: dict, normalize=False):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    normalize : bool, optional
        If True, collapse in 'divide' mode and round the result with
        ``round_biom``; otherwise collapse in 'add' mode.

    Returns
    -------
    biom.Table
        Collapsed BIOM table, or the filtered (empty) table when no feature
        ID is found in ``mapping``.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    def _is_mapped(data, id_, md):
        return id_ in mapping

    def _targets(id_, md):
        parts = md['part']
        return zip(parts, parts)

    # keep only the features that have an entry in the mapping
    table = table.filter(_is_mapped, axis='observation', inplace=False)

    # nothing left to collapse
    if table.is_empty():
        return table

    # attach each source's target list as observation metadata
    part_metadata = {source: {'part': targets}
                     for source, targets in mapping.items()}
    table.add_metadata(part_metadata, axis='observation')

    # pick the one-to-many strategy
    if normalize:
        mode = 'divide'
    else:
        mode = 'add'

    # collapse table in many-to-many mode
    table = table.collapse(_targets,
                           norm=False,
                           one_to_many=True,
                           axis='observation',
                           one_to_many_mode=mode)

    # divided counts are rounded back to integers
    if normalize:
        round_biom(table)

    # drop the 'Path' metadata entry (presumably left by the one-to-many
    # collapse -- confirm against biom)
    table.del_metadata(keys=['Path'])
    return table
Ejemplo n.º 5
0
def cluster_features(represenative_seqs: DNAFASTAFormat, table: biom.Table,
                     id: float) -> (biom.Table, DNAFASTAFormat):
    """Cluster sequences with ``vsearch --cluster_fast`` and collapse a table.

    Parameters
    ----------
    represenative_seqs : DNAFASTAFormat
        Sequences to cluster (parameter name kept as spelled for callers).
    table : biom.Table
        Feature table whose observations are collapsed into clusters.
    id : float
        Value passed to vsearch as ``--id``.

    Returns
    -------
    (biom.Table, DNAFASTAFormat)
        The collapsed table and the centroid sequences written by vsearch.
    """
    input_path = str(represenative_seqs)
    centroids = DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as uc_file:
        vsearch_args = ['vsearch']
        vsearch_args += ['--cluster_fast', input_path]
        vsearch_args += ['--id', str(id)]
        vsearch_args += ['--centroids', str(centroids)]
        vsearch_args += ['--uc', uc_file.name]
        run_command(vsearch_args)

        # Rewind our handle before the .uc output is parsed.
        uc_file.seek(0)
        collapse_f = _collapse_f_from_uc(uc_file)

    collapsed = table.collapse(
        collapse_f, norm=False, min_group_size=1, axis='observation')
    return collapsed, centroids
Ejemplo n.º 6
0
def group(table: biom.Table, axis: str, metadata: qiime2.MetadataCategory,
          mode: str) -> biom.Table:
    """Collapse one axis of a table by the values of a metadata category.

    Parameters
    ----------
    table : biom.Table
        Table to group; must not be empty.
    axis : str
        Axis to group. ``'feature'`` is translated to biom's
        ``'observation'``; any other value is passed to biom unchanged.
    metadata : qiime2.MetadataCategory
        Category supplying the group value for every ID on the axis.
    mode : str
        Key into ``_mode_lookup`` selecting the collapse function.

    Returns
    -------
    biom.Table
        Grouped table, ordered by first appearance of each group value.

    Raises
    ------
    ValueError
        If ``table`` is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    biom_axis = 'observation' if axis == 'feature' else axis

    # presumably a pandas Series indexed by axis ID -- confirm against
    # _munge_metadata_category
    axis_ids = table.ids(axis=biom_axis)
    series = _munge_metadata_category(metadata, axis_ids, axis)

    def _group_of(axis_id, _):
        return series.loc[axis_id]

    collapsed = table.collapse(_group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    group_order = series.unique()
    return collapsed.sort_order(group_order, axis=biom_axis)
Ejemplo n.º 7
0
def trim_dada2_posthoc(
        table: biom.Table,
        representative_sequences: pd.Series,
        trim_length: int = 0,
        hashed_feature_ids: bool = True) -> (biom.Table, pd.Series):
    """
    Trims ASVs generated by DADA2 to a standard length

    Parameters
    ----------
    table : biom.Table
        The feature table. It is not modified; filtering and annotation are
        performed on a copy.
    representative_sequences : pd.Series
        The sequences which correspond to the ASV table, indexed by feature
        ID.
    trim_length : int
        The length to trim the ASVs. If the length is 0, the minimum sequence
        length will be used.
    hashed_feature_ids : bool
        Whether feature and sequence IDs should be hashed (with
        ``_hash_seq``). When False, the trimmed sequence itself is used as
        the ID.

    Returns
    -------
    biom.Table
        The table collapsed over features sharing a trimmed sequence.
    pd.Series
        The unique trimmed sequences, indexed by the new feature IDs.

    Warns
    -----
    UserWarning
        If any sequence is shorter than the trim length; such sequences (and
        their features) are discarded.
    """

    # Trims the sequences
    seq_length = representative_sequences.apply(len)

    if trim_length == 0:
        trim_length = seq_length.min()

    if (seq_length < trim_length).any():
        warnings.warn(
            "There are ASVs shorter than the trim length. "
            "These sequences will be discarded.", UserWarning)
    rep_seqs = representative_sequences.astype(str)
    rep_seqs = rep_seqs.loc[seq_length >= trim_length].copy()
    rep_seqs = pd.DataFrame(data=[rep_seqs.apply(lambda x: x[:trim_length])],
                            index=['sequence']).T

    # Collapses the table based on the trimmed sequences. Work on a filtered
    # copy so the caller's table is neither filtered nor annotated as a side
    # effect of this call.
    table = table.filter(lambda v, id_, md: id_ in rep_seqs.index,
                         axis='observation',
                         inplace=False)
    table.add_metadata(
        rep_seqs.loc[table.ids(axis='observation')].to_dict(orient='index'),
        axis='observation')

    # After this collapse the observation IDs are the trimmed sequences.
    table2 = table.collapse(lambda id_, md: md['sequence'],
                            norm=False,
                            axis='observation')

    seqs2 = rep_seqs.drop_duplicates()['sequence'].copy()

    if hashed_feature_ids:
        table2.update_ids(
            {seq_: _hash_seq(seq_)
             for seq_ in table2.ids(axis='observation')},
            axis='observation',
            inplace=True)
        seqs2.rename({id_: _hash_seq(seq_)
                      for id_, seq_ in seqs2.items()},
                     inplace=True)
    else:
        # Index the sequences by themselves so they match table2's IDs.
        seqs2.rename({id_: seq_ for id_, seq_ in seqs2.items()}, inplace=True)

    return table2, seqs2
Ejemplo n.º 8
0
def cluster_features_closed_reference(
        sequences: DNAFASTAFormat,
        table: biom.Table,
        reference_sequences: DNAFASTAFormat,
        perc_identity: float,
        strand: str = 'plus',
        threads: int = 1) -> (biom.Table, DNAFASTAFormat, DNAFASTAFormat):
    """Cluster features against a reference with ``vsearch --usearch_global``.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Query sequences; their IDs are checked against the table's
        observation IDs with ``_error_on_nonoverlapping_ids``.
    table : biom.Table
        Feature table. Observations whose sequences did not match are
        removed from it in place before it is collapsed.
    reference_sequences : DNAFASTAFormat
        Reference database passed to vsearch as ``--db``.
    perc_identity : float
        Value passed to vsearch as ``--id``.
    strand : str, optional
        Value passed to vsearch as ``--strand``.
    threads : int, optional
        Value passed to vsearch as ``--threads``.

    Returns
    -------
    (biom.Table, DNAFASTAFormat, DNAFASTAFormat)
        The collapsed table, the matched sequences and the unmatched
        sequences.

    Raises
    ------
    VSearchError
        If processing the vsearch output raises ``ValueError``, which is
        reported as no sequence matching the reference.
    """

    table_ids = set(table.ids(axis='observation'))
    sequence_ids = {
        e.metadata['id']
        for e in skbio.io.read(
            str(sequences), constructor=skbio.DNA, format='fasta')
    }
    _error_on_nonoverlapping_ids(table_ids, sequence_ids)
    matched_seqs, unmatched_seqs = DNAFASTAFormat(), DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc, \
            tempfile.NamedTemporaryFile() as tmp_unmatched_seqs:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = [
            'vsearch',
            '--usearch_global',
            fasta_with_sizes.name,
            '--id',
            str(perc_identity),
            '--db',
            str(reference_sequences),
            '--uc',
            out_uc.name,
            '--strand',
            str(strand),
            '--qmask',
            'none',  # ensures no lowercase DNA chars
            '--notmatched',
            tmp_unmatched_seqs.name,
            '--threads',
            str(threads)
        ]
        run_command(cmd)
        out_uc.seek(0)

        # It is possible for there to be no unmatched sequences --- if that
        # is the case, skip the following clean-up.
        if os.path.getsize(tmp_unmatched_seqs.name) > 0:
            # We don't really need to sort the matched sequences, this
            # is just to let us use --xsize, which strips the counts from
            # the Feature ID. It would be more ideal if --usearch_global,
            # above let us pass in --xsize, but unfortunately it isn't
            # supported.
            cmd = [
                'vsearch', '--sortbysize', tmp_unmatched_seqs.name, '--xsize',
                '--output',
                str(unmatched_seqs)
            ]
            run_command(cmd)

        try:
            conn = _uc_to_sqlite(out_uc)
            collapse_f = _collapse_f_from_sqlite(conn)
            _fasta_from_sqlite(conn, str(sequences), str(matched_seqs))
        except ValueError:
            raise VSearchError('No matches were identified to '
                               'reference_sequences. This can happen if '
                               'sequences are not homologous to '
                               'reference_sequences, or if sequences are '
                               'not in the same orientation as reference_'
                               'sequences (i.e., if sequences are reverse '
                               'complemented with respect to reference '
                               'sequences). Sequence orientation can be '
                               'adjusted with the strand parameter.')

        # Hand skbio the path rather than an open() handle: a handle opened
        # here would never be closed, whereas skbio manages a file it opens
        # itself from a path.
        unmatched_ids = [
            e.metadata['id'] for e in skbio.io.read(str(unmatched_seqs),
                                                    constructor=skbio.DNA,
                                                    format='fasta')
        ]
    table.filter(ids_to_keep=unmatched_ids,
                 invert=True,
                 axis='observation',
                 inplace=True)
    table = table.collapse(collapse_f,
                           norm=False,
                           min_group_size=1,
                           axis='observation',
                           include_collapsed_metadata=False)

    return table, matched_seqs, unmatched_seqs
Ejemplo n.º 9
0
def collapse_biom(table: biom.Table, mapping: dict, divide=False, field=None):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    divide : bool, optional
        Whether divide per-target counts by number of targets per source.
    field : int, optional
        Zero-based index of the "|"-delimited field to be collapsed in a
        stratified table. If None (default), the entire feature ID is
        collapsed.

    Returns
    -------
    biom.Table
        Collapsed BIOM table, or the filtered (empty) table when no feature
        can be mapped.

    Raises
    ------
    ValueError
        Field index is not present in a feature ID.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # Compare against None explicitly: 0 is a valid field index (the first
    # field) and must not be mistaken for "no field specified".
    stratified = field is not None

    # generate metadata
    metadata = {}
    for id_ in table.ids('observation'):
        feature = id_
        if stratified:
            fields = feature.split('|')
            try:
                feature = fields[field]
            except IndexError:
                raise ValueError(
                    f'Feature "{feature}" has less than {field + 1} fields.')
        if feature not in mapping:
            continue
        targets = []
        for target in mapping[feature]:
            if stratified:
                # substitute the target into the chosen field and rebuild
                # the full stratified ID
                fields[field] = target
                target = '|'.join(fields)
            targets.append(target)
        metadata[id_] = dict(part=targets)

    # filter table features
    table = table.filter(lambda data, id_, md: id_ in metadata,
                         axis='observation',
                         inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata(metadata, axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False,
                  one_to_many=True,
                  axis='observation',
                  one_to_many_mode=('divide' if divide else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda _, md: zip(md['part'], md['part']), **kwargs)

    # round to integers
    if divide:
        round_biom(table)

    # clean up
    table.del_metadata(keys=['Path'])
    return table