def translate(self, nucleotide_sequence, start=0):
    """Translate nucleotide to protein sequence

    Parameters
    ----------
    nucleotide_sequence : NucleotideSequence
        sequence to be translated
    start : int, optional
        position to begin translation

    Returns
    -------
    ProteinSequence
        translation of nucleotide_sequence

    Raises
    ------
    ValueError
        If ``nucleotide_sequence`` is not empty and ``start`` lies at or
        beyond its end.

    Notes
    -----
    ``translate`` returns the translation of the entire sequence, (i.e., of
    ``nucleotide_sequence[start:]``). It is the user's responsibility to
    trim to an open reading frame, either from the input or using the
    output, if that is desired.

    An empty input yields an empty protein regardless of ``start``. Trailing
    positions that do not form a complete codon are not translated.

    See Also
    --------
    translate_six_frames

    Examples
    --------
    >>> from skbio.sequence import GeneticCode
    >>> sgc = GeneticCode('FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSS'
    ...                   'RRVVVVAAAADDEEGGGG')
    >>> sgc.translate('AUGCAUGACUUUUGA', 1)
    Protein
    -----------------------------
    Stats:
        length: 4
        has gaps: False
        has degenerates: False
        has non-degenerates: True
    -----------------------------
    0 CMTF

    """
    sequence_length = len(nucleotide_sequence)
    if sequence_length == 0:
        return Protein('')
    if start + 1 > sequence_length:
        # The two literals are concatenated by the parser, so the separating
        # space has to live inside one of them.
        raise ValueError("Translation starts after end of "
                         "NucleotideSequence")

    # Stop two short of the end so that only complete codons are looked up.
    amino_acids = [self[nucleotide_sequence[i:i + 3]]
                   for i in range(start, sequence_length - 2, 3)]
    return Protein(''.join(amino_acids))
def identity_coverage(dna_query, protein_query, dna_target, protein_target):
    """Classify how closely a query matches a target.

    The DNA sequences are compared first (skipped when ``dna_query`` is the
    empty string); if they do not qualify as ``'EXACT'`` the protein
    sequences are compared with BLOSUM62 (gap open 11, gap extend 1).

    Parameters
    ----------
    dna_query, dna_target : str
        Nucleotide sequences of the query and the target.
    protein_query, protein_target : str
        Amino acid sequences of the query and the target.

    Returns
    -------
    str
        * ``'EXACT'``: DNA identity and coverage are both >= 0.95
        * ``'SIMILAR'``: protein identity and coverage are both >= 0.8
        * ``'MATCH'``: protein identity and coverage are both >= 0.5
        * ``'NO MATCH'``: none of the above

    Notes
    -----
    Coverage is the aligned length divided by the length of the shorter of
    the two input sequences. Identity and aligned length come from
    ``extract_sw`` (defined elsewhere; presumably it returns
    ``(identity, aligned_length)`` -- confirm against its definition).
    """
    if dna_query != '':
        try:
            sw_dna = skbio.alignment.local_pairwise_align_ssw(
                DNA(dna_query), DNA(dna_target))
        except Exception:
            # Fall back to the pure-Python aligner when SSW fails, but let
            # KeyboardInterrupt / SystemExit propagate instead of silently
            # starting a second (slower) alignment.
            sw_dna = skbio.alignment.local_pairwise_align_nucleotide(
                DNA(dna_query), DNA(dna_target))
        dna_identity, align_length = extract_sw(sw_dna)
        dna_coverage = align_length / min(len(dna_query), len(dna_target))
        if dna_identity >= 0.95 and dna_coverage >= 0.95:
            return 'EXACT'

    try:
        sw_protein = skbio.alignment.local_pairwise_align_ssw(
            Protein(protein_query), Protein(protein_target),
            substitution_matrix=blosum62,
            gap_open_penalty=11, gap_extend_penalty=1)
    except Exception:
        # Same fallback as for DNA; see the comment above.
        sw_protein = skbio.alignment.local_pairwise_align_protein(
            Protein(protein_query), Protein(protein_target),
            substitution_matrix=blosum62,
            gap_open_penalty=11, gap_extend_penalty=1)
    protein_identity, align_length = extract_sw(sw_protein)
    protein_coverage = align_length / min(len(protein_query),
                                          len(protein_target))

    if protein_identity >= 0.8 and protein_coverage >= 0.8:
        return 'SIMILAR'
    if protein_identity >= 0.5 and protein_coverage >= 0.5:
        return 'MATCH'
    return 'NO MATCH'
def _set_amino_acids(self, amino_acids):
    """Validate and store the amino acid assignment of every codon.

    The input is converted to a ``Protein``. It must have exactly
    ``self._num_codons`` characters and contain at least one ``M``; the
    codon at the position of the first ``M`` is stored as
    ``self._m_character_codon``.

    Raises
    ------
    ValueError
        If the length is wrong or no ``M`` character is present.
    """
    protein = Protein(amino_acids)

    observed_length = len(protein)
    if observed_length != self._num_codons:
        raise ValueError("`amino_acids` must be length %d, not %d" %
                         (self._num_codons, observed_length))

    methionine_positions = (protein.values == b'M').nonzero()[0]
    if methionine_positions.size < 1:
        raise ValueError("`amino_acids` must contain at least one M "
                         "(methionine) character")

    # Only mutate state once every check has passed.
    self._amino_acids = protein
    first_methionine = methionine_positions[0]
    self._m_character_codon = self._index_to_codon(first_methionine)
def _translate(seq: str, table="Standard"):
    """Translate a DNA sequence into protein with the given codon table.

    Args:
        seq (str): DNA sequence. Expects the coding strand, beginning with
            a leading ATG codon.
        table (str, optional): NCBI table name as used in
            Bio.Data.CodonTable.

    Returns:
        skbio.sequence.Protein: Protein object holding the translated
            sequence.
    """
    coding_strand = Seq(seq)
    peptide = coding_strand.translate(table=table)
    return Protein(str(peptide))
def _assembleTwo(seq1, seq2): """This only works if two sequences share a significant identical overlap""" if len(seq2) <= len(seq1) and re.search(seq2, seq1): return seq1 elif len(seq1) <= len(seq2) and re.search(seq1, seq2): return seq2 else: msa = local_pairwise_align_ssw(Protein(seq1), Protein(seq2), substitution_matrix=ident) if msa[1] >= 8: try: (s1, e1), (s2, e2) = msa[-1] except: print(msa) if s1 >= s2: return seq1 + seq2[e2 + 1:] else: return seq2 + seq1[e1 + 1:] return out else: print('No significant overlap') raise
def _set_starts(self, starts):
    """Validate and store which codons act as start codons.

    The input is converted to a ``Protein``. It must have exactly
    ``self._num_codons`` characters, each either ``M`` (start codon) or
    ``-`` (not a start codon). The codons at the ``M`` positions are
    collected into ``self._start_codons``, an ``(n, 3)`` uint8 array.

    Raises
    ------
    ValueError
        If the length is wrong or a character other than ``M`` / ``-`` is
        present.
    """
    starts = Protein(starts)

    total = len(starts)
    if total != self._num_codons:
        raise ValueError("`starts` must be length %d, not %d" %
                         (self._num_codons, total))

    # Rejecting anything but M and - keeps a caller from accidentally
    # swapping `starts` and `amino_acids` and still getting a translation.
    m_count = (starts.values == b'M').sum()
    gap_count = (starts.values == b'-').sum()
    if m_count + gap_count != len(starts):
        raise ValueError("`starts` may only contain M and - characters")

    self._starts = starts

    m_positions = (self._starts.values == b'M').nonzero()[0]
    start_codons = np.empty((m_positions.size, 3), dtype=np.uint8)
    for row, position in enumerate(m_positions):
        start_codons[row] = self._index_to_codon(position)
    self._start_codons = start_codons
def translate(self, sequence, reading_frame=1, start='ignore',
              stop='ignore'):
    """Translate RNA sequence into protein sequence.

    Parameters
    ----------
    sequence : RNA
        RNA sequence to translate.
    reading_frame : {1, 2, 3, -1, -2, -3}
        Reading frame to use in translation. 1, 2, and 3 are forward frames
        and -1, -2, and -3 are reverse frames. If reverse (negative), will
        reverse complement the sequence before translation.
    start : {'ignore', 'require', 'optional'}
        How to handle start codons:

        * "ignore": translation will start from the beginning of the
          reading frame, regardless of the presence of a start codon.

        * "require": translation will start at the first start codon in
          the reading frame, ignoring all prior positions. The first amino
          acid in the translated sequence will *always* be methionine
          (M character), even if an alternative start codon was used in
          translation. This behavior most closely matches the underlying
          biology since fMet doesn't have a corresponding IUPAC character.
          If a start codon does not exist, a ``ValueError`` is raised.

        * "optional": if a start codon exists in the reading frame, matches
          the behavior of "require". If a start codon does not exist,
          matches the behavior of "ignore".

    stop : {'ignore', 'require', 'optional'}
        How to handle stop codons:

        * "ignore": translation will ignore the presence of stop codons and
          translate to the end of the reading frame.

        * "require": translation will terminate at the first stop codon.
          The stop codon will not be included in the translated sequence.
          If a stop codon does not exist, a ``ValueError`` is raised.

        * "optional": if a stop codon exists in the reading frame, matches
          the behavior of "require". If a stop codon does not exist,
          matches the behavior of "ignore".

    Returns
    -------
    Protein
        Translated sequence.

    See Also
    --------
    translate_six_frames

    Notes
    -----
    Input RNA sequence metadata are included in the translated protein
    sequence. Positional metadata are not included.

    Examples
    --------
    Translate RNA into protein using NCBI's standard genetic code (table ID
    1, the default genetic code in scikit-bio):

    >>> from skbio import RNA, GeneticCode
    >>> rna = RNA('AGUAUUCUGCCACUGUAAGAA')
    >>> sgc = GeneticCode.from_ncbi()
    >>> sgc.translate(rna)
    Protein
    -----------------------------
    Stats:
        length: 7
        has gaps: False
        has degenerates: False
        has non-degenerates: True
        has stops: True
    -----------------------------
    0 SILPL*E

    In this command, we used the default ``start`` behavior, which starts
    translation at the beginning of the reading frame, regardless of the
    presence of a start codon. If we specify "require", translation will
    start at the first start codon in the reading frame (in this example,
    CUG), ignoring all prior positions:

    >>> sgc.translate(rna, start='require')
    Protein
    -----------------------------
    Stats:
        length: 5
        has gaps: False
        has degenerates: False
        has non-degenerates: True
        has stops: True
    -----------------------------
    0 MPL*E

    Note that the codon coding for L (CUG) is an alternative start codon in
    this genetic code. Since we specified "require" mode, methionine (M)
    was used in place of the alternative start codon (L). This behavior
    most closely matches the underlying biology since fMet doesn't have a
    corresponding IUPAC character.

    Translate the same RNA sequence, also specifying that translation
    terminate at the first stop codon in the reading frame:

    >>> sgc.translate(rna, start='require', stop='require')
    Protein
    -----------------------------
    Stats:
        length: 3
        has gaps: False
        has degenerates: False
        has non-degenerates: True
        has stops: False
    -----------------------------
    0 MPL

    Passing "require" to both ``start`` and ``stop`` trims the translation
    to the CDS (and in fact requires that one is present in the reading
    frame). Changing the reading frame to 2 causes an exception to be
    raised because a start codon doesn't exist in the reading frame:

    >>> sgc.translate(rna, start='require', stop='require',
    ...               reading_frame=2) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    ValueError: ...

    """
    self._validate_translate_inputs(sequence, reading_frame, start, stop)

    # Reading frames are numbered from 1 and their sign selects the strand;
    # the magnitude minus one is the 0-based position where the frame begins.
    offset = abs(reading_frame) - 1
    if reading_frame < 0:
        sequence = sequence.reverse_complement()

    # Translation strategy:
    #
    #   1. Obtain view of underlying sequence bytes from the beginning of
    #      the reading frame.
    #   2. Convert bytes to offsets (0-3, base 4 since there are only 4
    #      characters allowed: UCAG).
    #   3. Reshape byte vector into (N, 3), where N is the number of codons
    #      in the reading frame. Each row represents a codon in the
    #      sequence.
    #   4. (Optional) Find start codon in the reading frame and trim to
    #      this position. Replace start codon with M codon.
    #   5. Convert each codon (encoded as offsets) into an index
    #      corresponding to an amino acid (0-63).
    #   6. Obtain translated sequence by indexing into the amino acids
    #      vector (`amino_acids`) using the indices defined in step 5.
    #   7. (Optional) Find first stop codon and trim to this position.
    data = sequence.values[offset:].view(np.uint8)
    # since advanced indexing is used with an integer ndarray, a copy is
    # always returned. thus, the in-place modification made below
    # (replacing the start codon) is safe.
    data = self._offset_table[data]
    # Drop trailing positions that do not fill a whole codon, then lay the
    # remainder out as one row per codon.
    data = data[:data.size // 3 * 3].reshape((-1, 3))

    if start in {'require', 'optional'}:
        # `data.shape[0]` (one past the last codon) doubles as the
        # "no start codon found" sentinel.
        start_codon_index = data.shape[0]
        for start_codon in self._start_codons:
            # Rows in which all three positions equal this start codon.
            indices = np.all(data == start_codon, axis=1).nonzero()[0]

            if indices.size > 0:
                # Keep the earliest hit across all start codons.
                first_index = indices[0]
                if first_index < start_codon_index:
                    start_codon_index = first_index

        if start_codon_index != data.shape[0]:
            # Slicing yields a view on the copy made above, so overwriting
            # the first codon does not touch the input sequence.
            data = data[start_codon_index:]
            data[0] = self._m_character_codon
        elif start == 'require':
            self._raise_require_error('start', reading_frame)

    # Collapse each codon row into a single index into the amino acids
    # vector. NOTE(review): presumably `_radix_multiplier` holds the base-4
    # place values -- confirm where it is defined.
    indices = (data * self._radix_multiplier).sum(axis=1)
    translated = self._amino_acids.values[indices]

    if stop in {'require', 'optional'}:
        stop_codon_indices = (translated == b'*').nonzero()[0]
        if stop_codon_indices.size > 0:
            # Exclusive slice: the stop character itself is dropped.
            translated = translated[:stop_codon_indices[0]]
        elif stop == 'require':
            self._raise_require_error('stop', reading_frame)

    # Only sequence-level metadata is passed on to the result.
    metadata = None
    if sequence.has_metadata():
        metadata = sequence.metadata

    # turn off validation because `translated` is guaranteed to be valid
    return Protein(translated, metadata=metadata, validate=False)
def local_pairwise_align_ssw(sequence1, sequence2, constructor=Sequence,
                             **kwargs):
    """Locally align two sequences using Striped Smith-Waterman.

    Parameters
    ----------
    sequence1 : str or Sequence
        The first unaligned sequence (used as the query).
    sequence2 : str or Sequence
        The second unaligned sequence (used as the target).
    constructor : Sequence subclass
        Class used to build the aligned sequences when the alignment is
        not a protein alignment.

    Returns
    -------
    ``skbio.alignment.Alignment``
        The resulting alignment, or ``None`` when the alignment did not
        pass a provided filter.

    Notes
    -----
    This wraps the SSW package [1]_. Any further keyword arguments are
    forwarded to ``skbio.alignment.StripedSmithWaterman``; see that class
    for the complete list. `suppress_sequences` and `zero_index` are always
    overridden here, so passing them has no effect. When `sequence1` is a
    ``Protein``, ``protein=True`` is set automatically.

    References
    ----------
    .. [1] Zhao, Mengyao, Wan-Ping Lee, Erik P. Garrison, & Gabor T.
       Marth. "SSW Library: An SIMD Smith-Waterman C/C++ Library for
       Applications". PLOS ONE (2013). Web. 11 July 2014.
       http://www.plosone.org/article/info:doi/10.1371/journal.pone.0082138

    See Also
    --------
    skbio.alignment.StripedSmithWaterman

    """
    # `Alignment` is meaningless without the aligned sequences, so the
    # caller is not allowed to suppress them.
    kwargs['suppress_sequences'] = False
    kwargs['zero_index'] = True

    if isinstance(sequence1, Protein):
        kwargs['protein'] = True

    aligner = StripedSmithWaterman(str(sequence1), **kwargs)
    result = aligner(str(sequence2))

    # A missing cigar means the alignment failed a filter.
    if not result.cigar:
        return None

    if result.query_begin != -1:
        positions = [
            (result.query_begin, result.query_end),
            (result.target_begin, result.target_end_optimal),
        ]
    else:
        positions = None

    # Protein alignments always come back as `Protein`; everything else
    # uses the caller-supplied constructor.
    if kwargs.get('protein', False):
        build = Protein
    else:
        build = constructor

    aligned = [
        build(result.aligned_query_sequence, metadata={'id': 'query'}),
        build(result.aligned_target_sequence, metadata={'id': 'target'}),
    ]

    return Alignment(aligned, score=result.optimal_alignment_score,
                     start_end_positions=positions)
def _render_alignment_row(query_row, target_row):
    """Build the text midline and the three HTML lines for one row.

    Returns ``(text_mid, html_query, html_mid, html_target)`` as strings.
    """
    span = '<span class="{}">{}</span>'
    text_mid = []
    html_query = []
    html_mid = []
    html_target = []
    for query_char, target_char in zip(query_row, target_row):
        if query_char == '-' or target_char == '-':
            css_class, text_mark, html_mark = 'gap', ' ', ' '
        elif query_char != target_char:
            # The plain-text midline uses '.' for every substitution; only
            # the HTML output separates "close" (BLOSUM62 score >= 0) from
            # "mismatch".
            text_mark = '.'
            if blosum62[query_char][target_char] >= 0:
                css_class, html_mark = 'close', ':'
            else:
                css_class, html_mark = 'mismatch', '.'
        else:
            css_class, text_mark, html_mark = 'match', '|', '|'

        text_mid.append(text_mark)
        html_query.append(span.format(css_class, query_char))
        html_mid.append(span.format(css_class, html_mark))
        html_target.append(span.format(css_class, target_char))

    return (''.join(text_mid), ''.join(html_query), ''.join(html_mid),
            ''.join(html_target))


def alignment(protein_query, protein_target, line_length,
              output_format='text/plain', html_header=False):
    """Align two protein sequences and print the alignment.

    The sequences are locally aligned with BLOSUM62 (gap open 11, gap
    extend 1). A three-line summary (identity, query coverage, target
    coverage) is printed, followed by the alignment wrapped at
    `line_length` columns. Each wrapped row consists of the query line, a
    midline and the target line.

    Parameters
    ----------
    protein_query : str
        Query amino acid sequence.
    protein_target : str
        Target ("unigene") amino acid sequence.
    line_length : int
        Number of alignment columns per printed row; must be positive.
    output_format : str, optional
        ``'text/plain'`` prints plain text; any other value prints HTML.
    html_header : bool, optional
        For HTML output, also print ``HTML_HEADER`` before and
        ``HTML_FOOTER`` after the alignment.

    Returns
    -------
    None
        The alignment is written to standard output.

    Notes
    -----
    Midline symbols: ``|`` identical, a space for a gap; in plain text
    ``.`` for any substitution; in HTML ``:`` for a substitution with a
    non-negative BLOSUM62 score and ``.`` otherwise.

    The final row is always emitted, even when it is empty (alignment
    length a multiple of `line_length`), because it carries the closing
    HTML tags.
    """
    sw_protein = skbio.alignment.local_pairwise_align_ssw(
        Protein(protein_query), Protein(protein_target),
        substitution_matrix=blosum62,
        gap_open_penalty=11, gap_extend_penalty=1)

    query_align = str(sw_protein[0][0])
    target_align = str(sw_protein[0][1])
    identity = num_alignment(query_align, target_align) / len(target_align)

    query_start, query_end = sw_protein[2][0]
    target_start, target_end = sw_protein[2][1]
    query_coverage = (query_end - query_start + 1) / len(protein_query)
    target_coverage = (target_end - target_start + 1) / len(protein_target)

    identity_line = 'identity: ' + '{:.1%}'.format(identity)
    query_line = \
        'Query coverage: {:.1%} (positions {}-{}; total length {})'.format(
            query_coverage, query_start, query_end, len(protein_query))
    target_line = \
        'Unigene coverage: {:.1%} (positions {}-{}; total length {})\n' \
        .format(target_coverage, target_start, target_end,
                len(protein_target))

    out_text = [identity_line, query_line, target_line]
    out_html = [
        '<div class="alignment">',
        '<p class="summary">{}</p>'.format(identity_line),
        '<p class="summary">{}</p>'.format(query_line),
        '<p class="summary">{}</p><br/>'.format(target_line.strip('\n')),
        '<p class="alignment">',
    ]

    full_rows = len(query_align) // line_length
    for row_index in range(full_rows + 1):
        is_last_row = row_index == full_rows
        row_start = row_index * line_length
        if is_last_row:
            row_end = len(target_align)
        else:
            row_end = row_start + line_length

        query_row = query_align[row_start:row_end]
        target_row = target_align[row_start:row_end]
        row_mid, html_query, html_mid, html_target = _render_alignment_row(
            query_row, target_row)

        out_text.append(query_row)
        out_text.append(row_mid)
        out_html.append(html_query + '<br/>')
        out_html.append(html_mid + '<br/>')
        if is_last_row:
            # The last row closes the paragraph and the wrapping div.
            out_text.append(target_row)
            out_html.append(html_target + '</p></div>')
        else:
            # A blank line separates consecutive rows.
            out_text.append(target_row + '\n')
            out_html.append(html_target + '<br/><br/>')

    if output_format == 'text/plain':
        for line in out_text:
            print(line)
    else:
        if html_header:
            print(HTML_HEADER)
        for line in out_html:
            print(line)
        if html_header:
            print(HTML_FOOTER)