def back_transcribe(self):
    """Return the DNA version of this RNA sequence as a new Seq object.

    Every "U" is replaced by "T" (and "u" by "t"); all other letters are
    copied unchanged.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
    ...                     IUPAC.unambiguous_rna)
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())

    Trying to back-transcribe a protein or DNA sequence raises an exception.

    >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
    >>> my_protein.back_transcribe()
    Traceback (most recent call last):
       ...
    ValueError: Proteins cannot be back transcribed!

    The alphabet of the result is IUPAC.unambiguous_dna or
    IUPAC.ambiguous_dna when this sequence uses the matching IUPAC RNA
    alphabet, and Alphabet.generic_dna otherwise.

    Raises ValueError if the base alphabet is a protein or DNA alphabet.
    """
    base_alphabet = Alphabet._get_base_alphabet(self.alphabet)
    if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
        raise ValueError("Proteins cannot be back transcribed!")
    if isinstance(base_alphabet, Alphabet.DNAAlphabet):
        raise ValueError("DNA cannot be back transcribed!")

    # Map the two specific IUPAC RNA alphabets onto their DNA twins;
    # anything else falls back to the generic DNA alphabet.
    rna_to_dna = (
        (IUPAC.unambiguous_rna, IUPAC.unambiguous_dna),
        (IUPAC.ambiguous_rna, IUPAC.ambiguous_dna),
    )
    for rna_alphabet, dna_alphabet in rna_to_dna:
        if self.alphabet == rna_alphabet:
            result_alphabet = dna_alphabet
            break
    else:
        result_alphabet = Alphabet.generic_dna

    rna_text = str(self)
    dna_text = rna_text.replace("U", "T").replace("u", "t")
    return Seq(dna_text, result_alphabet)
def back_transcribe(self):
    """Return the DNA version of this RNA sequence as a new Seq object.

    Every "U" is replaced by "T" (and "u" by "t"); all other letters are
    copied unchanged.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
    ...                     IUPAC.unambiguous_rna)
    >>> messenger_rna
    Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
    >>> messenger_rna.back_transcribe()
    Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())

    Trying to back-transcribe a protein or DNA sequence raises an exception.

    >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
    >>> my_protein.back_transcribe()
    Traceback (most recent call last):
       ...
    ValueError: Proteins cannot be back transcribed!

    The alphabet of the result is IUPAC.unambiguous_dna or
    IUPAC.ambiguous_dna when this sequence uses the matching IUPAC RNA
    alphabet, and Alphabet.generic_dna otherwise.

    Raises ValueError if the base alphabet is a protein or DNA alphabet.
    """
    base_alphabet = Alphabet._get_base_alphabet(self.alphabet)
    if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
        raise ValueError("Proteins cannot be back transcribed!")
    if isinstance(base_alphabet, Alphabet.DNAAlphabet):
        raise ValueError("DNA cannot be back transcribed!")

    # Map the two specific IUPAC RNA alphabets onto their DNA twins;
    # anything else falls back to the generic DNA alphabet.
    rna_to_dna = (
        (IUPAC.unambiguous_rna, IUPAC.unambiguous_dna),
        (IUPAC.ambiguous_rna, IUPAC.ambiguous_dna),
    )
    for rna_alphabet, dna_alphabet in rna_to_dna:
        if self.alphabet == rna_alphabet:
            result_alphabet = dna_alphabet
            break
    else:
        result_alphabet = Alphabet.generic_dna

    rna_text = str(self)
    dna_text = rna_text.replace("U", "T").replace("u", "t")
    return Seq(dna_text, result_alphabet)
def complement(self):
    """Return the complement of this sequence as a new Seq object.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
    >>> my_dna
    Seq('CCCCCGATAG', IUPACUnambiguousDNA())
    >>> my_dna.complement()
    Seq('GGGGGCTATC', IUPACUnambiguousDNA())

    Trying to complement a protein sequence raises an exception.

    >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
    >>> my_protein.complement()
    Traceback (most recent call last):
       ...
    ValueError: Proteins do not have complements!

    The complement table follows the base alphabet when it is a DNA or
    RNA alphabet.  Otherwise the letters decide: a sequence containing
    "U" is treated as RNA, anything else as DNA.  The returned Seq keeps
    this sequence's alphabet.

    Raises ValueError for protein alphabets, and for sequences without a
    DNA/RNA alphabet that contain both "U" and "T".
    """
    base_alphabet = Alphabet._get_base_alphabet(self.alphabet)
    if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
        raise ValueError("Proteins do not have complements!")

    if isinstance(base_alphabet, Alphabet.DNAAlphabet):
        pairing = ambiguous_dna_complement
    elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
        pairing = ambiguous_rna_complement
    else:
        # No nucleotide alphabet to go by, so inspect the letters.
        contains_u = 'U' in self._data
        if contains_u and 'T' in self._data:
            #TODO - Handle this cleanly?
            raise ValueError("Mixed RNA/DNA found")
        if contains_u:
            pairing = ambiguous_rna_complement
        else:
            pairing = ambiguous_dna_complement

    # A translation table is much faster on really long sequences than
    # the previous loop based approach.
    # thx to Michael Palmer, University of Waterloo
    ttable = self.__maketrans(pairing)
    complemented = str(self).translate(ttable)
    return Seq(complemented, self.alphabet)
def complement(self):
    """Return the complement of this sequence as a new Seq object.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
    >>> my_dna
    Seq('CCCCCGATAG', IUPACUnambiguousDNA())
    >>> my_dna.complement()
    Seq('GGGGGCTATC', IUPACUnambiguousDNA())

    Trying to complement a protein sequence raises an exception.

    >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
    >>> my_protein.complement()
    Traceback (most recent call last):
       ...
    ValueError: Proteins do not have complements!

    The complement table follows the base alphabet when it is a DNA or
    RNA alphabet.  Otherwise the letters decide: a sequence containing
    "U" is treated as RNA, anything else as DNA.  The returned Seq keeps
    this sequence's alphabet.

    Raises ValueError for protein alphabets, and for sequences without a
    DNA/RNA alphabet that contain both "U" and "T".
    """
    base_alphabet = Alphabet._get_base_alphabet(self.alphabet)
    if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
        raise ValueError("Proteins do not have complements!")

    if isinstance(base_alphabet, Alphabet.DNAAlphabet):
        pairing = ambiguous_dna_complement
    elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
        pairing = ambiguous_rna_complement
    else:
        # No nucleotide alphabet to go by, so inspect the letters.
        contains_u = 'U' in self._data
        if contains_u and 'T' in self._data:
            #TODO - Handle this cleanly?
            raise ValueError("Mixed RNA/DNA found")
        if contains_u:
            pairing = ambiguous_rna_complement
        else:
            pairing = ambiguous_dna_complement

    # A translation table is much faster on really long sequences than
    # the previous loop based approach.
    # thx to Michael Palmer, University of Waterloo
    ttable = self.__maketrans(pairing)
    complemented = str(self).translate(ttable)
    return Seq(complemented, self.alphabet)
def complement(self):
    """Modify the mutable sequence in place to take on its complement.

    The complement table is chosen from the alphabet when it is one of
    the IUPAC DNA or RNA alphabets.  Otherwise the letters decide: data
    containing "U" is treated as RNA, anything else as DNA.  Lower case
    letters are complemented to lower case letters.

    Trying to complement a protein sequence raises an exception.

    Raises ValueError for protein alphabets, and for sequences without
    an IUPAC DNA/RNA alphabet that contain both "U" and "T".  A letter
    missing from the chosen complement table raises KeyError, in which
    case the sequence data is left untouched.

    No return value.
    """
    if isinstance(Alphabet._get_base_alphabet(self.alphabet),
                  Alphabet.ProteinAlphabet):
        raise ValueError("Proteins do not have complements!")
    if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna):
        table = ambiguous_dna_complement
    elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna):
        table = ambiguous_rna_complement
    elif 'U' in self.data and 'T' in self.data:
        #TODO - Handle this cleanly?
        raise ValueError("Mixed RNA/DNA found")
    elif 'U' in self.data:
        table = ambiguous_rna_complement
    else:
        table = ambiguous_dna_complement

    # Extend a private copy with the lower case pairs.  Updating `table`
    # itself would permanently add lower case entries to the shared
    # module-level complement dictionary on every call.
    mixed_case = dict(table)
    mixed_case.update([(x.lower(), y.lower()) for x, y in table.items()])

    # Build the complemented array before assigning, so a failed lookup
    # cannot leave self.data as a plain list.
    self.data = array.array('c', [mixed_case[letter] for letter in self.data])
def complement(self):
    """Modify the mutable sequence in place to take on its complement.

    The complement table is chosen from the alphabet when it is one of
    the IUPAC DNA or RNA alphabets.  Otherwise the letters decide: data
    containing "U" is treated as RNA, anything else as DNA.  Lower case
    letters are complemented to lower case letters.

    Trying to complement a protein sequence raises an exception.

    Raises ValueError for protein alphabets, and for sequences without
    an IUPAC DNA/RNA alphabet that contain both "U" and "T".  A letter
    missing from the chosen complement table raises KeyError, in which
    case the sequence data is left untouched.

    No return value.
    """
    if isinstance(Alphabet._get_base_alphabet(self.alphabet),
                  Alphabet.ProteinAlphabet):
        raise ValueError("Proteins do not have complements!")
    if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna):
        table = ambiguous_dna_complement
    elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna):
        table = ambiguous_rna_complement
    elif 'U' in self.data and 'T' in self.data:
        #TODO - Handle this cleanly?
        raise ValueError("Mixed RNA/DNA found")
    elif 'U' in self.data:
        table = ambiguous_rna_complement
    else:
        table = ambiguous_dna_complement

    # Extend a private copy with the lower case pairs.  Updating `table`
    # itself would permanently add lower case entries to the shared
    # module-level complement dictionary on every call.
    mixed_case = dict(table)
    mixed_case.update([(x.lower(), y.lower()) for x, y in table.items()])

    # Build the complemented array before assigning, so a failed lookup
    # cannot leave self.data as a plain list.
    self.data = array.array('c', [mixed_case[letter] for letter in self.data])
def translate(self, table="Standard", stop_symbol="*", to_stop=False):
    """Turn a nucleotide sequence into a protein sequence. New Seq object.

    This method will translate DNA or RNA sequences.  Trying to translate
    a protein sequence raises an exception.

    table - Which codon table to use?  This can be either a name
            (string) or an NCBI identifier (integer).  This defaults
            to the "Standard" table.
    stop_symbol - Single character string, what to use for terminators.
                  This defaults to the asterisk, "*".
    to_stop - Boolean, defaults to False meaning do a full translation
              continuing on past any stop codons (translated as the
              specified stop_symbol).  If True, translation is
              terminated at the first in frame stop codon (and the
              stop_symbol is not appended to the returned protein
              sequence).

    e.g. Using the standard table,

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR', ExtendedIUPACProtein())

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR', ExtendedIUPACProtein())
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR', ExtendedIUPACProtein())

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - Does NOT support gapped sequences.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead.

    Raises ValueError if table is a 256 character string (a python
    string translation table) or if this is a protein sequence.
    """
    # A table that converts to an integer is an NCBI identifier;
    # otherwise it is looked up by name.
    try:
        table_id = int(table)
    except ValueError:
        table_id = None
    if isinstance(table, str) and len(table) == 256:
        raise ValueError(
            "The Seq object translate method DOES NOT take "
            "a 256 character string mapping table like "
            "the python string object's translate method. "
            "Use str(my_seq).translate(...) instead.")

    base_alphabet = Alphabet._get_base_alphabet(self.alphabet)
    if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
        raise ValueError("Proteins cannot be translated!")

    # Pick the codon table family matching this sequence's alphabet,
    # falling back to the ambiguous generic tables.
    by_name = table_id is None
    seq_alphabet = self.alphabet
    if seq_alphabet == IUPAC.unambiguous_dna:
        codon_table = (CodonTable.unambiguous_dna_by_name[table] if by_name
                       else CodonTable.unambiguous_dna_by_id[table_id])
    elif seq_alphabet == IUPAC.ambiguous_dna:
        codon_table = (CodonTable.ambiguous_dna_by_name[table] if by_name
                       else CodonTable.ambiguous_dna_by_id[table_id])
    elif seq_alphabet == IUPAC.unambiguous_rna:
        codon_table = (CodonTable.unambiguous_rna_by_name[table] if by_name
                       else CodonTable.unambiguous_rna_by_id[table_id])
    elif seq_alphabet == IUPAC.ambiguous_rna:
        codon_table = (CodonTable.ambiguous_rna_by_name[table] if by_name
                       else CodonTable.ambiguous_rna_by_id[table_id])
    else:
        codon_table = (CodonTable.ambiguous_generic_by_name[table] if by_name
                       else CodonTable.ambiguous_generic_by_id[table_id])

    protein = _translate_str(str(self), codon_table, stop_symbol, to_stop)

    # Only flag the alphabet as having stop codons when one is present.
    if stop_symbol not in protein:
        return Seq(protein, codon_table.protein_alphabet)
    stop_alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
                                          stop_symbol=stop_symbol)
    return Seq(protein, stop_alphabet)
def translate(self, table="Standard", stop_symbol="*", to_stop=False):
    """Turn a nucleotide sequence into a protein sequence. New Seq object.

    This method will translate DNA or RNA sequences.  Trying to translate
    a protein sequence raises an exception.

    table - Which codon table to use?  This can be either a name
            (string) or an NCBI identifier (integer).  This defaults
            to the "Standard" table.
    stop_symbol - Single character string, what to use for terminators.
                  This defaults to the asterisk, "*".
    to_stop - Boolean, defaults to False meaning do a full translation
              continuing on past any stop codons (translated as the
              specified stop_symbol).  If True, translation is
              terminated at the first in frame stop codon (and the
              stop_symbol is not appended to the returned protein
              sequence).

    e.g. Using the standard table,

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR', ExtendedIUPACProtein())

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR', ExtendedIUPACProtein())
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR', ExtendedIUPACProtein())

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - Does NOT support gapped sequences.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead.

    Raises ValueError if table is a 256 character string (a python
    string translation table) or if this is a protein sequence.
    """
    # A table that converts to an integer is an NCBI identifier;
    # otherwise it is looked up by name.
    try:
        table_id = int(table)
    except ValueError:
        table_id = None
    if isinstance(table, str) and len(table) == 256:
        raise ValueError(
            "The Seq object translate method DOES NOT take "
            "a 256 character string mapping table like "
            "the python string object's translate method. "
            "Use str(my_seq).translate(...) instead.")

    base_alphabet = Alphabet._get_base_alphabet(self.alphabet)
    if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
        raise ValueError("Proteins cannot be translated!")

    # Pick the codon table family matching this sequence's alphabet,
    # falling back to the ambiguous generic tables.
    by_name = table_id is None
    seq_alphabet = self.alphabet
    if seq_alphabet == IUPAC.unambiguous_dna:
        codon_table = (CodonTable.unambiguous_dna_by_name[table] if by_name
                       else CodonTable.unambiguous_dna_by_id[table_id])
    elif seq_alphabet == IUPAC.ambiguous_dna:
        codon_table = (CodonTable.ambiguous_dna_by_name[table] if by_name
                       else CodonTable.ambiguous_dna_by_id[table_id])
    elif seq_alphabet == IUPAC.unambiguous_rna:
        codon_table = (CodonTable.unambiguous_rna_by_name[table] if by_name
                       else CodonTable.unambiguous_rna_by_id[table_id])
    elif seq_alphabet == IUPAC.ambiguous_rna:
        codon_table = (CodonTable.ambiguous_rna_by_name[table] if by_name
                       else CodonTable.ambiguous_rna_by_id[table_id])
    else:
        codon_table = (CodonTable.ambiguous_generic_by_name[table] if by_name
                       else CodonTable.ambiguous_generic_by_id[table_id])

    protein = _translate_str(str(self), codon_table, stop_symbol, to_stop)

    # Only flag the alphabet as having stop codons when one is present.
    if stop_symbol not in protein:
        return Seq(protein, codon_table.protein_alphabet)
    stop_alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
                                          stop_symbol=stop_symbol)
    return Seq(protein, stop_alphabet)