Ejemplo n.º 1
0
    def translate(self, nucleotide_sequence, start=0):
        """Translate a nucleotide sequence into a protein sequence.

        Codons are read as consecutive, non-overlapping triplets beginning at
        ``start``. Each triplet is looked up in this genetic code with
        ``self[codon]``. Trailing positions that do not fill a whole codon
        are ignored.

        Parameters
        ----------
        nucleotide_sequence : NucleotideSequence
            sequence to be translated
        start : int, optional
            position to begin translation

        Returns
        -------
        ProteinSequence
            translation of nucleotide_sequence

        Raises
        ------
        ValueError
            If ``nucleotide_sequence`` is not empty and ``start`` lies at or
            beyond its end.

        Notes
        -----
        ``translate`` returns the translation of the entire sequence, (i.e., of
        ``nucleotide_sequence[start:]``). It is the user's responsibility to
        trim to an open reading frame, either from the input or using the
        output, if that is desired.

        An empty ``nucleotide_sequence`` yields an empty protein regardless of
        ``start``.

        See Also
        --------
        translate_six_frames

        Examples
        --------
        >>> from skbio.sequence import GeneticCode
        >>> sgc = GeneticCode('FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSS'
        ...                   'RRVVVVAAAADDEEGGGG')
        >>> sgc.translate('AUGCAUGACUUUUGA', 1)
        Protein
        -----------------------------
        Stats:
            length: 4
            has gaps: False
            has degenerates: False
            has non-degenerates: True
        -----------------------------
        0 CMTF

        """
        sequence_length = len(nucleotide_sequence)
        if sequence_length == 0:
            return Protein('')
        if start + 1 > sequence_length:
            # The trailing space matters: the two literals are concatenated.
            raise ValueError("Translation starts after end of "
                             "NucleotideSequence")

        # Stop two positions short of the end so every slice is a full codon.
        codon_starts = range(start, sequence_length - 2, 3)
        return Protein(''.join(self[nucleotide_sequence[i:i + 3]]
                               for i in codon_starts))
Ejemplo n.º 2
0
def identity_coverage(dna_query, protein_query, dna_target, protein_target):
    """Grade how closely a query matches a target sequence.

    The DNA sequences are aligned first (this step is skipped when
    ``dna_query`` is the empty string). If that alignment does not qualify
    as ``'EXACT'``, the protein sequences are aligned and graded instead.

    For each alignment, identity and aligned length come from ``extract_sw``
    (defined elsewhere; it is expected to return ``(identity,
    aligned_length)``, as inferred from how its result is unpacked here).
    Coverage is the aligned length divided by the length of the shorter of
    the two input sequences.

    Parameters
    ----------
    dna_query, dna_target : str
        DNA sequences, passed to ``DNA(...)``.
    protein_query, protein_target : str
        Protein sequences, passed to ``Protein(...)``.

    Returns
    -------
    str
        * ``'EXACT'``: DNA identity and coverage are both >= 0.95
        * ``'SIMILAR'``: protein identity and coverage are both >= 0.8
        * ``'MATCH'``: protein identity and coverage are both >= 0.5
        * ``'NO MATCH'``: none of the above
    """
    if dna_query != '':
        try:
            sw_dna = skbio.alignment.local_pairwise_align_ssw(
                DNA(dna_query), DNA(dna_target))
        except Exception:
            # Best-effort fallback to the other aligner. `Exception` rather
            # than a bare `except` so KeyboardInterrupt/SystemExit still
            # propagate instead of silently starting a second alignment.
            sw_dna = skbio.alignment.local_pairwise_align_nucleotide(
                DNA(dna_query), DNA(dna_target))
        dna_identity, align_length = extract_sw(sw_dna)
        dna_coverage = align_length / min(len(dna_query), len(dna_target))
        if dna_identity >= 0.95 and dna_coverage >= 0.95:
            return 'EXACT'

    try:
        sw_protein = skbio.alignment.local_pairwise_align_ssw(
            Protein(protein_query),
            Protein(protein_target),
            substitution_matrix=blosum62,
            gap_open_penalty=11,
            gap_extend_penalty=1)
    except Exception:
        # Same best-effort fallback as for DNA, with identical scoring.
        sw_protein = skbio.alignment.local_pairwise_align_protein(
            Protein(protein_query),
            Protein(protein_target),
            substitution_matrix=blosum62,
            gap_open_penalty=11,
            gap_extend_penalty=1)
    protein_identity, align_length = extract_sw(sw_protein)
    protein_coverage = align_length / min(len(protein_query),
                                          len(protein_target))

    if protein_identity >= 0.8 and protein_coverage >= 0.8:
        return 'SIMILAR'
    if protein_identity >= 0.5 and protein_coverage >= 0.5:
        return 'MATCH'
    return 'NO MATCH'
Ejemplo n.º 3
0
    def _set_amino_acids(self, amino_acids):
        """Validate ``amino_acids`` and store it on the instance.

        The input is converted with ``Protein(...)``. It must have exactly
        ``self._num_codons`` characters and contain at least one ``M``. On
        success ``self._amino_acids`` holds the converted sequence and
        ``self._m_character_codon`` holds the codon returned by
        ``self._index_to_codon`` for the position of the first ``M``.

        Raises
        ------
        ValueError
            If the length is wrong or no ``M`` character is present.
        """
        protein = Protein(amino_acids)

        expected_length = self._num_codons
        actual_length = len(protein)
        if actual_length != expected_length:
            raise ValueError("`amino_acids` must be length %d, not %d" %
                             (expected_length, actual_length))

        # Positions of every M character; only the first one is used below.
        m_positions = (protein.values == b'M').nonzero()[0]
        if m_positions.size < 1:
            raise ValueError("`amino_acids` must contain at least one M "
                             "(methionine) character")

        self._amino_acids = protein
        first_m = m_positions[0]
        self._m_character_codon = self._index_to_codon(first_m)
Ejemplo n.º 4
0
def _translate(seq: str, table="Standard"):
    """Translate a DNA string into a protein using a named codon table.

    The string is wrapped in ``Seq``, translated with ``Seq.translate`` and
    the result is converted to text before being wrapped in ``Protein``.

    Args:
        seq (str): DNA sequence. The coding strand is expected, beginning
                    with an ATG start codon.
        table (str, optional): NCBI table name as used in Bio.Data.CodonTable

    Returns:
        skbio.sequence.Protein: Protein object with the translated sequence.
    """
    dna = Seq(seq)
    translated = dna.translate(table=table)
    return Protein(str(translated))
Ejemplo n.º 5
0
def _assembleTwo(seq1, seq2):
    """This only works if two sequences share a significant identical overlap"""
    if len(seq2) <= len(seq1) and re.search(seq2, seq1):
        return seq1
    elif len(seq1) <= len(seq2) and re.search(seq1, seq2):
        return seq2
    else:
        msa = local_pairwise_align_ssw(Protein(seq1),
                                       Protein(seq2),
                                       substitution_matrix=ident)
        if msa[1] >= 8:
            try:
                (s1, e1), (s2, e2) = msa[-1]
            except:
                print(msa)

            if s1 >= s2:
                return seq1 + seq2[e2 + 1:]
            else:
                return seq2 + seq1[e1 + 1:]
            return out
        else:
            print('No significant overlap')
            raise
Ejemplo n.º 6
0
    def _set_starts(self, starts):
        """Validate ``starts`` and derive the table of start codons.

        The input is converted with ``Protein(...)``. It must have exactly
        ``self._num_codons`` characters, each of which is ``M`` or ``-``. On
        success ``self._starts`` holds the converted sequence and
        ``self._start_codons`` holds one row of three ``uint8`` values per
        ``M`` position, filled from ``self._index_to_codon``.

        Raises
        ------
        ValueError
            If the length is wrong or a character other than ``M``/``-`` is
            present.
        """
        start_markers = Protein(starts)
        marker_count = len(start_markers)

        if marker_count != self._num_codons:
            raise ValueError("`starts` must be length %d, not %d" %
                             (self._num_codons, marker_count))

        is_m = start_markers.values == b'M'
        is_gap = start_markers.values == b'-'
        # Rejecting anything else stops a caller who swapped `starts` and
        # `amino_acids` from silently getting a translation back.
        if is_m.sum() + is_gap.sum() != marker_count:
            raise ValueError("`starts` may only contain M and - characters")

        self._starts = start_markers

        m_positions = is_m.nonzero()[0]
        start_codons = np.empty((m_positions.size, 3), dtype=np.uint8)
        for row, position in enumerate(m_positions):
            start_codons[row] = self._index_to_codon(position)
        self._start_codons = start_codons
Ejemplo n.º 7
0
    def translate(self,
                  sequence,
                  reading_frame=1,
                  start='ignore',
                  stop='ignore'):
        """Translate an RNA sequence into a protein sequence.

        Parameters
        ----------
        sequence : RNA
            The RNA sequence that will be translated.
        reading_frame : {1, 2, 3, -1, -2, -3}
            Which reading frame to translate. Positive values select the
            forward frames. Negative values select the reverse frames, in
            which case the sequence is reverse complemented before it is
            translated.
        start : {'ignore', 'require', 'optional'}
            Policy for start codons:

            * "ignore": begin translating at the first position of the
              reading frame whether or not a start codon is present.

            * "require": begin translating at the first start codon found in
              the reading frame and discard everything before it. The first
              amino acid of the result is *always* methionine (M character),
              including when an alternative start codon was matched. This is
              the closest match to the underlying biology because fMet has
              no IUPAC character of its own. A ``ValueError`` is raised when
              the reading frame contains no start codon.

            * "optional": behave like "require" when the reading frame
              contains a start codon, and like "ignore" when it does not.

        stop : {'ignore', 'require', 'optional'}
            Policy for stop codons:

            * "ignore": pay no attention to stop codons and translate through
              to the end of the reading frame.

            * "require": end the translation at the first stop codon. The
              stop codon itself is left out of the result. A ``ValueError``
              is raised when the reading frame contains no stop codon.

            * "optional": behave like "require" when the reading frame
              contains a stop codon, and like "ignore" when it does not.

        Returns
        -------
        Protein
            The translated sequence.

        See Also
        --------
        translate_six_frames

        Notes
        -----
        The metadata of the input RNA sequence are carried over to the
        translated protein sequence. Positional metadata are not carried
        over.

        Examples
        --------
        Translate RNA into protein using NCBI's standard genetic code (table ID
        1, the default genetic code in scikit-bio):

        >>> from skbio import RNA, GeneticCode
        >>> rna = RNA('AGUAUUCUGCCACUGUAAGAA')
        >>> sgc = GeneticCode.from_ncbi()
        >>> sgc.translate(rna)
        Protein
        -----------------------------
        Stats:
            length: 7
            has gaps: False
            has degenerates: False
            has non-degenerates: True
            has stops: True
        -----------------------------
        0 SILPL*E

        The call above relied on the default ``start`` policy, so translation
        began at the first position of the reading frame. With "require",
        translation instead begins at the first start codon in the reading
        frame (CUG in this example) and everything before it is dropped:

        >>> sgc.translate(rna, start='require')
        Protein
        -----------------------------
        Stats:
            length: 5
            has gaps: False
            has degenerates: False
            has non-degenerates: True
            has stops: True
        -----------------------------
        0 MPL*E

        CUG normally codes for L but is an alternative start codon in this
        genetic code. Because "require" mode was requested, methionine (M)
        appears in place of L at the start of the result.

        Translate the same RNA sequence again, this time also ending the
        translation at the first stop codon in the reading frame:

        >>> sgc.translate(rna, start='require', stop='require')
        Protein
        -----------------------------
        Stats:
            length: 3
            has gaps: False
            has degenerates: False
            has non-degenerates: True
            has stops: False
        -----------------------------
        0 MPL

        Using "require" for both ``start`` and ``stop`` trims the result to
        the CDS, and demands that the reading frame contains one. Reading
        frame 2 has no start codon, so the same call raises an exception
        there:

        >>> sgc.translate(rna, start='require', stop='require',
        ...               reading_frame=2) # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError: ...

        """
        self._validate_translate_inputs(sequence, reading_frame, start, stop)

        if reading_frame < 0:
            sequence = sequence.reverse_complement()
        frame_offset = abs(reading_frame) - 1

        # How the translation is carried out:
        #
        #   1. View the sequence bytes from the first position of the reading
        #      frame onwards.
        #   2. Map each byte to a base-4 offset (0-3; the only characters
        #      allowed are UCAG).
        #   3. Drop any incomplete trailing codon and reshape to (N, 3), one
        #      row per codon in the reading frame.
        #   4. (Optional) Locate the earliest start codon, trim to it and
        #      overwrite it with the M codon.
        #   5. Collapse every codon row into a single index (0-63).
        #   6. Use those indices to pick characters out of the amino acids
        #      vector.
        #   7. (Optional) Trim at the first stop character.
        frame_bytes = sequence.values[frame_offset:].view(np.uint8)
        # Indexing with an integer ndarray always yields a copy, which is
        # what makes the in-place start codon replacement below safe.
        base_offsets = self._offset_table[frame_bytes]
        whole_codon_size = base_offsets.size // 3 * 3
        codons = base_offsets[:whole_codon_size].reshape((-1, 3))

        if start in ('require', 'optional'):
            codon_count = codons.shape[0]
            # `codon_count` doubles as the "nothing found" sentinel.
            earliest_start = codon_count
            for candidate in self._start_codons:
                hits = np.all(codons == candidate, axis=1).nonzero()[0]
                if hits.size > 0 and hits[0] < earliest_start:
                    earliest_start = hits[0]

            if earliest_start != codon_count:
                codons = codons[earliest_start:]
                codons[0] = self._m_character_codon
            elif start == 'require':
                self._raise_require_error('start', reading_frame)

        amino_acid_indices = (codons * self._radix_multiplier).sum(axis=1)
        translated = self._amino_acids.values[amino_acid_indices]

        if stop in ('require', 'optional'):
            stop_positions = (translated == b'*').nonzero()[0]
            if stop_positions.size > 0:
                translated = translated[:stop_positions[0]]
            elif stop == 'require':
                self._raise_require_error('stop', reading_frame)

        metadata = sequence.metadata if sequence.has_metadata() else None

        # `translated` only holds characters taken from the amino acids
        # vector, so validation is switched off.
        return Protein(translated, metadata=metadata, validate=False)
Ejemplo n.º 8
0
def local_pairwise_align_ssw(sequence1,
                             sequence2,
                             constructor=Sequence,
                             **kwargs):
    """Locally align two sequences using Striped Smith-Waterman.

    ``sequence1`` is used as the query and ``sequence2`` as the target.

    Parameters
    ----------
    sequence1 : str or Sequence
        First unaligned sequence (the query).
    sequence2 : str or Sequence
        Second unaligned sequence (the target).
    constructor : Sequence subclass
        Class used to build the aligned sequences unless protein mode is
        active, in which case ``Protein`` is used. Protein mode is active
        when ``sequence1`` is a ``Protein`` or ``protein=True`` is passed.

    Returns
    -------
    ``skbio.alignment.Alignment``
        Alignment holding the aligned query and target, the optimal
        alignment score and the start/end positions. ``None`` is returned
        instead when the alignment fails a provided filter.

    Notes
    -----
    This function wraps the SSW package [1]_.

    Any further keyword arguments are forwarded to
    ``skbio.alignment.StripedSmithWaterman``; see that class for the full
    list.

    `suppress_sequences` and `zero_index` are always overridden, so passing
    them has no effect.

    References
    ----------
    .. [1] Zhao, Mengyao, Wan-Ping Lee, Erik P. Garrison, & Gabor T.
       Marth. "SSW Library: An SIMD Smith-Waterman C/C++ Library for
       Applications". PLOS ONE (2013). Web. 11 July 2014.
       http://www.plosone.org/article/info:doi/10.1371/journal.pone.0082138

    See Also
    --------
    skbio.alignment.StripedSmithWaterman

    """
    # `Alignment` is meaningless without the aligned sequences, so these two
    # options are forced regardless of what the caller asked for.
    kwargs.update(suppress_sequences=False, zero_index=True)

    if isinstance(sequence1, Protein):
        kwargs['protein'] = True

    aligner = StripedSmithWaterman(str(sequence1), **kwargs)
    result = aligner(str(sequence2))

    # A missing cigar means the alignment was rejected by a filter.
    if not result.cigar:
        return None

    if result.query_begin != -1:
        positions = [(result.query_begin, result.query_end),
                     (result.target_begin, result.target_end_optimal)]
    else:
        positions = None

    sequence_class = Protein if kwargs.get('protein', False) else constructor
    aligned = [
        sequence_class(result.aligned_query_sequence,
                       metadata={'id': 'query'}),
        sequence_class(result.aligned_target_sequence,
                       metadata={'id': 'target'}),
    ]

    return Alignment(aligned,
                     score=result.optimal_alignment_score,
                     start_end_positions=positions)
Ejemplo n.º 9
0
def _alignment_column_markup(query_char, target_char):
    """Return ``(mid_char, html_query, html_mid, html_target)`` for a column.

    A column is a gap when either character is ``-``, a match when the
    characters are equal, "close" when they differ but their ``blosum62``
    score is non-negative, and a mismatch otherwise.
    """
    if query_char == '-' or target_char == '-':
        css_class, mid_char, html_mid_char = 'gap', ' ', '&nbsp;'
    elif query_char != target_char:
        mid_char = '.'
        if blosum62[query_char][target_char] >= 0:
            css_class, html_mid_char = 'close', '&#58;'
        else:
            css_class, html_mid_char = 'mismatch', '.'
    else:
        css_class, mid_char, html_mid_char = 'match', '|', '|'

    span = '<span class="{}">{}</span>'
    return (mid_char,
            span.format(css_class, query_char),
            span.format(css_class, html_mid_char),
            span.format(css_class, target_char))


def alignment(protein_query,
              protein_target,
              line_length,
              output_format='text/plain',
              html_header=False):
    """Print a local protein alignment as plain text or HTML.

    The two proteins are aligned with
    ``skbio.alignment.local_pairwise_align_ssw`` (``blosum62``, gap open 11,
    gap extend 1). The output starts with three summary lines (identity,
    query coverage, target coverage) followed by the alignment wrapped into
    rows of ``line_length`` columns. Each row shows the query, a marker
    line and the target.

    Identity is ``num_alignment(query, target)`` divided by the aligned
    length. Coverage is ``end - start + 1`` divided by the length of the
    corresponding unaligned input, using the positions reported by the
    aligner.

    Parameters
    ----------
    protein_query, protein_target : str
        Protein sequences, passed to ``Protein(...)``.
    line_length : int
        Number of alignment columns per output row.
    output_format : str, optional
        ``'text/plain'`` prints the text rendering. Any other value prints
        the HTML rendering.
    html_header : bool, optional
        When printing HTML, also print ``HTML_HEADER`` before and
        ``HTML_FOOTER`` after the alignment.

    Returns
    -------
    None
        The rendering is written to standard output.
    """
    sw_protein = skbio.alignment.local_pairwise_align_ssw(
        Protein(protein_query),
        Protein(protein_target),
        substitution_matrix=blosum62,
        gap_open_penalty=11,
        gap_extend_penalty=1)
    query_align = str(sw_protein[0][0])
    target_align = str(sw_protein[0][1])
    identity = num_alignment(query_align, target_align) / len(target_align)
    query_start, query_end = sw_protein[2][0]
    target_start, target_end = sw_protein[2][1]
    query_coverage = (query_end - query_start + 1) / len(protein_query)
    target_coverage = (target_end - target_start + 1) / len(protein_target)

    coverage_template = '{} coverage: {:.1%} (positions {}-{}; total length {})'
    out_text = [
        'identity: {:.1%}'.format(identity),
        coverage_template.format('Query', query_coverage, query_start,
                                 query_end, len(protein_query)),
        coverage_template.format('Unigene', target_coverage, target_start,
                                 target_end, len(protein_target)) + '\n',
    ]

    out_html = [
        '<div class="alignment">',
        '<p class="summary">{}</p>'.format(out_text[0]),
        '<p class="summary">{}</p>'.format(out_text[1]),
        '<p class="summary">{}</p><br/>'.format(out_text[2].strip('\n')),
        '<p class="alignment">',
    ]

    # Every full row is followed by one final row holding the remainder. The
    # final row is emitted even when it is empty, because it carries the
    # closing HTML tags.
    full_rows = len(query_align) // line_length
    for row_index in range(full_rows + 1):
        is_last_row = row_index == full_rows
        row_start = row_index * line_length
        row_end = None if is_last_row else row_start + line_length
        query_row = query_align[row_start:row_end]
        target_row = target_align[row_start:row_end]

        columns = [_alignment_column_markup(query_char, target_char)
                   for query_char, target_char in zip(query_row, target_row)]
        row_mid = ''.join(column[0] for column in columns)
        html_query = ''.join(column[1] for column in columns)
        html_mid = ''.join(column[2] for column in columns)
        html_target = ''.join(column[3] for column in columns)

        out_text.append(query_row)
        out_text.append(row_mid)
        # Rows are separated by a blank line; the last row has none after it.
        out_text.append(target_row if is_last_row else target_row + '\n')

        out_html.append(html_query + '<br/>')
        out_html.append(html_mid + '<br/>')
        out_html.append(
            html_target + ('</p></div>' if is_last_row else '<br/><br/>'))

    if output_format == 'text/plain':
        for line in out_text:
            print(line)
    else:
        if html_header:
            print(HTML_HEADER)
        for line in out_html:
            print(line)
        if html_header:
            print(HTML_FOOTER)