def test_exception_when_added_protein_has_more_than_one_stop_codon_type(self):
    """Check that adding proteins with different stop symbols fails.

    The left operand declares '@' as its stop symbol while the right
    operand declares '*'; the addition is expected to raise ValueError.
    """
    # Stop symbol '@' wrapped around a gapped extended protein alphabet.
    at_stop_alphabet = Alphabet.HasStopCodon(
        Alphabet.Gapped(IUPAC.extended_protein, "-"), "@")
    left = Seq.Seq("MEDG-KRXR@", at_stop_alphabet)

    # Stop symbol '*' declared inside the gapped wrapper instead.
    star_stop_alphabet = Alphabet.Gapped(
        Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")
    right = Seq.Seq("MEDG-KRXR*", star_stop_alphabet)

    with self.assertRaises(ValueError):
        left + right
def setUp(self):
    """Create the sequence fixtures used by the tests.

    Sets ``self.s`` (a single unambiguous DNA Seq), the lists
    ``self.dna``, ``self.rna``, ``self.nuc`` and ``self.protein`` holding
    Seq/MutableSeq objects with assorted alphabets, and
    ``self.test_chars``.
    """
    # Short local names for the alphabet wrappers used repeatedly below.
    gapped = Alphabet.Gapped
    has_stop = Alphabet.HasStopCodon
    extended = IUPAC.extended_protein

    self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)

    self.dna = [
        Seq.Seq("ATCG", IUPAC.ambiguous_dna),
        Seq.Seq("gtca", Alphabet.generic_dna),
        Seq.MutableSeq("GGTCA", Alphabet.generic_dna),
        Seq.Seq("CTG-CA", gapped(IUPAC.unambiguous_dna, "-")),
    ]

    self.rna = [
        Seq.Seq("AUUUCG", IUPAC.ambiguous_rna),
        Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna),
        Seq.Seq("uCAg", Alphabet.generic_rna),
        Seq.MutableSeq("UC-AG", gapped(Alphabet.generic_rna, "-")),
        Seq.Seq("U.CAG", gapped(Alphabet.generic_rna, ".")),
    ]

    self.nuc = [Seq.Seq("ATCG", Alphabet.generic_nucleotide)]

    self.protein = [
        Seq.Seq("ATCGPK", IUPAC.protein),
        Seq.Seq("atcGPK", Alphabet.generic_protein),
        Seq.Seq("T.CGPK", gapped(IUPAC.protein, ".")),
        Seq.Seq("T-CGPK", gapped(IUPAC.protein, "-")),
        # Gapped alphabets wrapping a '*' stop symbol.
        Seq.Seq("MEDG-KRXR*", gapped(has_stop(extended, "*"), "-")),
        Seq.MutableSeq("ME-K-DRXR*XU", gapped(has_stop(extended, "*"), "-")),
        # '@' stop symbol wrapped around gapped alphabets.
        Seq.Seq("MEDG-KRXR@", has_stop(gapped(extended, "-"), "@")),
        Seq.Seq("ME-KR@", has_stop(gapped(IUPAC.protein, "-"), "@")),
        # '.' gap wrapped around an '@' stop symbol.
        Seq.Seq("MEDG.KRXR@", gapped(has_stop(extended, "@"), ".")),
    ]

    self.test_chars = ["-", Seq.Seq("-"), Seq.Seq("*"), "-X@"]
def setUp(self):
    """Create the DNA, RNA, nucleotide and protein fixtures.

    Each of ``self.dna``, ``self.rna``, ``self.nuc`` and ``self.protein``
    is a list of Seq/MutableSeq objects with assorted alphabets, ending
    with one plain Python string.
    """
    # Short local names for the alphabet wrappers used repeatedly below.
    gapped = Alphabet.Gapped
    has_stop = Alphabet.HasStopCodon
    extended = IUPAC.extended_protein

    self.dna = [
        Seq.Seq("ATCG", IUPAC.ambiguous_dna),
        Seq.Seq("gtca", Alphabet.generic_dna),
        Seq.MutableSeq("GGTCA", Alphabet.generic_dna),
        Seq.Seq("CTG-CA", gapped(IUPAC.unambiguous_dna, "-")),
        "TGGTCA",
    ]

    self.rna = [
        Seq.Seq("AUUUCG", IUPAC.ambiguous_rna),
        Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna),
        Seq.Seq("uCAg", Alphabet.generic_rna),
        Seq.MutableSeq("UC-AG", gapped(Alphabet.generic_rna, "-")),
        Seq.Seq("U.CAG", gapped(Alphabet.generic_rna, ".")),
        "UGCAU",
    ]

    self.nuc = [
        Seq.Seq("ATCG", Alphabet.generic_nucleotide),
        "UUUTTTACG",
    ]

    self.protein = [
        Seq.Seq("ATCGPK", IUPAC.protein),
        Seq.Seq("atcGPK", Alphabet.generic_protein),
        Seq.Seq("T.CGPK", gapped(IUPAC.protein, ".")),
        Seq.Seq("T-CGPK", gapped(IUPAC.protein, "-")),
        # Gapped alphabets wrapping a '*' stop symbol.
        Seq.Seq("MEDG-KRXR*", gapped(has_stop(extended, "*"), "-")),
        Seq.MutableSeq("ME-K-DRXR*XU", gapped(has_stop(extended, "*"), "-")),
        "TEDDF",
    ]
def translate(self, seq, stop_symbol="*"):
    """Translate a nucleotide sequence into a protein Seq.

    The sequence is read in consecutive three-letter slices; a trailing
    one or two letters that do not complete a slice are ignored. Each
    slice is looked up in ``self.table.forward_table``, and any slice
    missing from that table yields ``stop_symbol``.

    Arguments:
     - seq - Sequence object exposing ``alphabet`` and ``data``; its
       alphabet must be of the same class as the table's nucleotide
       alphabet.
     - stop_symbol - Letter emitted for slices absent from the forward
       table. Defaults to "*".

    Returns a ``Seq.Seq`` whose alphabet is the table's protein alphabet
    wrapped in ``Alphabet.HasStopCodon`` for the given stop symbol.

    Raises AssertionError (when assertions are enabled) if the alphabet
    classes do not match.
    """
    # Allow different instances of the same class to be used:
    assert seq.alphabet.__class__ == self.table.nucleotide_alphabet.__class__, (
        "cannot translate from given alphabet (have %s, need %s)"
        % (seq.alphabet, self.table.nucleotide_alphabet))

    codon_table = self.table
    bases = seq.data
    lookup = codon_table.forward_table.get
    total = len(seq)
    usable = total - total % 3  # drop any incomplete trailing slice
    residues = [lookup(bases[start:start + 3], stop_symbol)
                for start in range(0, usable, 3)]

    # Return with the correct alphabet encoding (cache the encoding).
    try:
        protein_alphabet = self._encoded[stop_symbol]
    except KeyError:
        protein_alphabet = Alphabet.HasStopCodon(
            codon_table.protein_alphabet, stop_symbol)
        self._encoded[stop_symbol] = protein_alphabet

    return Seq.Seq("".join(residues), protein_alphabet)
rna = [ Seq.Seq("AUUUCG", IUPAC.ambiguous_rna), Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna), Seq.Seq("uCAg", Alphabet.generic_rna), Seq.MutableSeq("UC-AG", Alphabet.Gapped(Alphabet.generic_rna, "-")), Seq.Seq("U.CAG", Alphabet.Gapped(Alphabet.generic_rna, ".")), "UGCAU" ] nuc = [Seq.Seq("ATCG", Alphabet.generic_nucleotide), "UUUTTTACG"] protein = [ Seq.Seq("ATCGPK", IUPAC.protein), Seq.Seq("atcGPK", Alphabet.generic_protein), Seq.Seq("T.CGPK", Alphabet.Gapped(IUPAC.protein, ".")), Seq.Seq("T-CGPK", Alphabet.Gapped(IUPAC.protein, "-")), Seq.Seq( "MEDG-KRXR*", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")), Seq.MutableSeq( "ME-K-DRXR*XU", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")), Seq.Seq( "MEDG-KRXR@", Alphabet.HasStopCodon(Alphabet.Gapped(IUPAC.extended_protein, "-"), "@")), Seq.Seq("ME-KR@", Alphabet.HasStopCodon(Alphabet.Gapped(IUPAC.protein, "-"), "@")), Seq.Seq( "MEDG.KRXR@", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "@"), ".")), "TEDDF"
# Print both consensus flavours for the existing summary, followed by
# its position specific score matrix and information content.
# print(...) with a single argument behaves the same on Python 2 and 3;
# print("") stands in for a bare Python 2 ``print`` (one empty line).
consensus = summary.dumb_consensus(ambiguous="N")
print(consensus)
consensus = summary.gap_consensus(ambiguous="N")
print(consensus)
print("")
print(summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                        axis_seq=consensus))
print("")
# Have a generic alphabet, without a declared gap char, so must
# provide the frequencies and chars to ignore explicitly.
print(summary.information_content(e_freq_table=expected,
                                  chars_to_ignore=['-']))
print("")
print("Trying a protein sequence with gaps and stops")

# Protein alphabet declaring both a gap character and a stop symbol.
alpha = Alphabet.HasStopCodon(
    Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
a = Alignment(alpha)
a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
print(a)
print("=" * a.get_alignment_length())

s = SummaryInfo(a)
c = s.dumb_consensus(ambiguous="X")
print(c)
c = s.gap_consensus(ambiguous="X")
print(c)
print("")
# Gap and stop characters are both excluded from the score matrix.
print(s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c))
def testseq(size=30, alphabet=IUPAC.unambiguous_dna, table=1, gc_target=None,
            persistent=True, from_start=True, to_stop=True, stop_symbol="*",
            truncate=True, messenger=False, rand_seed=0):
    """Generate and return a Seq object.

    This function will generate and return a custom Seq object
    using any IUPAC alphabet. These sequences are a faux representation
    of biological data and can be used for testing/demonstration purposes.

    Arguments:
     - size - The number of letters in the generated sequence.
       This preferably accepts an integer value and will attempt to
       convert any input to an integer.
     - alphabet - Any IUPAC alphabet can be used to generate the sequence.
       Defaults to unambiguous DNA.
     - table - An integer value that denotes the NCBI identifier
       of the codon table which will be used in generating the sequence.
       This argument prefers an integer and will attempt to convert any
       input into an integer value.
     - gc_target - The function will attempt to generate a sequence
       with a GC-content equal to the 'gc_target' argument.
       The argument will accept any integer between 0 and 100.
       Alternatively, if 'gc_target' is set to 'None', the 'gc_target'
       argument will be ignored when generating the sequence.
     - persistent - A boolean that, if set to True, will remove any
       stop codons that are generated by chance within the sequence.
     - from_start - A boolean that, if set to True, will ensure
       that the first codon in the generated sequence is a start codon.
     - to_stop - A boolean that, if set to True, will ensure
       that the last codon in the generated sequence is a stop codon.
     - stop_symbol - Single character string that denotes the presence
       of a translated stop codon. This defaults to the asterisk, "*".
     - truncate - A boolean that, if set to True, will ensure that
       the size of any generated (non-protein) sequence is a multiple
       of three (3).
     - messenger - A boolean that, if set to True, will ensure that
       any RNA sequence generated will additionally have a 5'-UTR,
       3'-UTR, and a poly-A tail.
     - rand_seed - The seed used to generate the randomized sequence.
       This argument accepts any hashable data value as the seed.
       If the argument is set to None, the sequence will be re-seeded
       with every function call.

    Hey there! We can use the 'testseq' function to quickly generate
    sequences.

    >>> from Scripts.testseq import testseq
    >>> my_seq = testseq()
    >>> type(my_seq)
    <class 'Bio.Seq.Seq'>

    The default size of the generated sequence is 30 letters,
    but you can change that at any time, like so:

    >>> my_seq = testseq(1500)
    >>> len(my_seq)
    1500

    You can generate your sequence using any IUPAC alphabet, like so:

    >>> from Bio.Alphabet import IUPAC
    >>> my_seq = testseq(alphabet=IUPAC.extended_protein)
    >>> my_seq.alphabet
    HasStopCodon(ExtendedIUPACProtein(), '*')

    Please notice below that this sequence starts with Methionine(M)
    and ends in an asterisk(*). That's because of the two arguments
    'from_start' and 'to_stop' respectively. Curiously, there are no
    asterisks (or terminators) within the sequence either; this is due
    to the 'persistent' argument.

    >>> my_seq[0]
    'M'
    >>> my_seq[-1]
    '*'
    >>> "*" in my_seq[1:-1]
    False

    The 'from_start', 'to_stop', and 'persistent' arguments are all set
    to True by default. You can read more about what they do in the
    "Arguments" section above. It's useful to note that all three of
    those arguments involve the use of codon tables! When generating
    your sequence, you can set which codon table you'd like to use:

    >>> my_seq1 = testseq(table=5)

    Now we can translate our sequence with ease!

    >>> my_seq2 = my_seq1.translate(table=6)
    >>> my_seq2.alphabet
    IUPACProtein()

    Oops! We're missing a stop codon. We've generated a sequence using
    Table 5, but translated it using Table 6. Those tables don't share
    a common stop codon! Let's fix that...

    >>> my_seq2 = my_seq1.translate(table=5)
    >>> my_seq2.alphabet
    HasStopCodon(IUPACProtein(), '*')

    That's better!

    The 'testseq' function can also attempt to generate sequences with
    a custom GC-content. You can alter your desired GC-content by
    declaring it in the 'gc_target' argument, like so:

    >>> my_seq = testseq(gc_target=60)
    >>> from Bio.SeqUtils import GC
    >>> error = 60 - GC(my_seq)
    >>> -5 < error < 5
    False

    What happened? Note that in the above example, the sequence is at
    the default size of 30 letters. Since the sequence is generated
    letter by letter, larger sequences will have a tendency to be closer
    to the desired GC-content than smaller sequences. Let's try that
    again with a much larger sequence:

    >>> my_seq = testseq(10000, gc_target=60)
    >>> error = 60 - GC(my_seq)
    >>> -5 < error < 5
    True

    Much better! It's also worth noting that the 'gc_target' argument
    is ignored when generating protein sequences.

    Let's revisit the 'size' argument for a moment. You will notice that
    when generating the sequence above, I declared a 'size' of 10000.
    Let's confirm whether that was generated as requested:

    >>> len(my_seq)
    9999

    That may seem like a bug, but it isn't! All non-protein sequences
    are truncated to a multiple of three (3). This is to allow for
    smooth translation from nucleotide to amino-acid alphabets. This
    behavior is controlled by the 'truncate' argument, which is set to
    True by default. Let's see what happens when we set it to False:

    >>> my_seq = testseq(10000, truncate=False)
    >>> len(my_seq)
    10000

    The result above seems cleaner, but would result in a
    'BiopythonWarning' if you translated that sequence. So please be
    careful when altering the 'truncate' argument.

    Let us briefly discuss the 'messenger' argument. You can use it to
    add messenger RNA components to a generated RNA sequence. Though,
    it has two caveats. First, the messenger argument is ignored unless
    an RNA alphabet is declared. More importantly though, all mRNA
    components are added to the sequence additionally. Let's look at an
    example:

    >>> my_seq = testseq(300, alphabet=IUPAC.unambiguous_rna, messenger=True)
    >>> len(my_seq) == 300
    False
    >>> len(my_seq) > 400
    True

    Notice that the sequence requested was 300 letters, however the
    final length of the sequence is much larger. Those extra letters
    are the mRNA components. The generated sequence is buried in there
    and it is exactly 300 letters in size!

    Lastly, let's discuss the sequence generator itself. The sequence is
    created using a pseudo-random number generator which relies on a
    seed to process and spit out random numbers. Let's look at an
    example:

    >>> a = testseq()
    >>> b = testseq()
    >>> a == b
    True

    Since sequence "a" and sequence "b" were both generated using the
    same seed, they ended up being the exact same sequence. We can
    change that behavior to shuffle the seed every time the function is
    called by setting the 'rand_seed' argument to 'None' (without
    quotation marks), like so:

    >>> a = testseq(rand_seed=None)
    >>> b = testseq(rand_seed=None)
    >>> a == b
    False

    If you'd like to generate a specific sequence, you can set the
    'rand_seed' argument to any desired hashable data value:

    >>> a = testseq(rand_seed=0.7334)
    >>> b = testseq(rand_seed="Hello World!")
    >>> a == b
    False

    This concludes our discussion. Thanks again for using Biopython!
    Contribution by Adil Iqbal (2017).
    """
    # Set seed, gather data, clean-up logic, validate arguments.
    if rand_seed is None:
        # No seed given: draw one so that successive calls differ.
        rand_seed = anchor_instance.random()
    random_instance.seed(rand_seed)
    typeof = _SeqType(alphabet)
    if not typeof.rna and messenger:
        warnings.warn("Only RNA sequences can be messengers.",
                      BiopythonWarning)
        messenger = False
    if typeof.rna and messenger:
        # Messenger output always gets a start codon, a stop codon and
        # no stop codons in between.
        from_start = True
        to_stop = True
        persistent = True
    if persistent or from_start or to_stop:
        # Only the first character of the stop symbol is used.
        stop_symbol = str(stop_symbol)[0]
        codon_set = _CodonSet(alphabet, table, stop_symbol)
    if gc_target is not None and typeof.protein:
        warnings.warn("Proteins do not have a GC-content.", BiopythonWarning)
    if gc_target is not None and not typeof.protein:
        # Clamp the requested GC-content into the 0-100 range.
        gc_target = int(gc_target)
        if gc_target < 0:
            warnings.warn(
                "Argument 'gc_target' must be an integer between 0 and 100. "
                "It has been set to 0.", BiopythonWarning)
            gc_target = 0
        if gc_target > 100:
            warnings.warn(
                "Argument 'gc_target' must be an integer between 0 and 100. "
                "It has been set to 100.", BiopythonWarning)
            gc_target = 100
        probability_table = _construct_probability_table(alphabet, gc_target)
    size = int(size)
    if not typeof.protein and truncate:
        # Keep nucleotide sequences a whole number of codons long.
        size -= size % 3

    # Begin generating sequence.
    seq = ""
    for _ in range(size):
        if gc_target is not None and not typeof.protein:
            seq += _pick_one(probability_table)
        else:
            roll = random_instance.randint(0, len(alphabet.letters) - 1)
            seq += alphabet.letters[roll]
        if len(seq) >= 3 and len(
                seq) % 3 == 0 and not typeof.protein and persistent:
            # Replace stop codons with non-stop codons.
            this_codon = seq[-3:]
            if this_codon in codon_set.stop:
                roll = random_instance.randint(0, len(codon_set.nonstop) - 1)
                new_codon = codon_set.nonstop[roll]
                seq = seq[:-3] + new_codon

    # Additional processing of generated sequence.
    # x is the number of letters overwritten by a start/stop element:
    # three for nucleotide sequences, one for proteins.
    x = 3
    if typeof.protein:
        x = 1
    if from_start:
        aug = None
        if typeof.dna:
            aug = "ATG"
        elif typeof.rna:
            aug = "AUG"
        elif typeof.protein:
            aug = "M"
        if aug is not None and aug in codon_set.start:
            start = aug
        else:
            # The usual start is not in this codon set; pick one at random.
            roll = random_instance.randint(0, len(codon_set.start) - 1)
            start = codon_set.start[roll]
        seq = start + seq[x:]
    if to_stop:
        roll = random_instance.randint(0, len(codon_set.stop) - 1)
        stop = codon_set.stop[roll]
        seq = seq[:-x] + stop
    if messenger:
        seq = _add_messenger_parts(seq, size, alphabet, codon_set)
    if typeof.protein and stop_symbol in seq:
        # Declare the stop symbol in the alphabet when the protein has one.
        alphabet = Alphabet.HasStopCodon(alphabet, stop_symbol)
    return Seq(seq, alphabet)
Seq.Seq("gtca", Alphabet.generic_dna), Seq.MutableSeq("GGTCA", Alphabet.generic_dna), Seq.Seq("CTG-CA", Alphabet.Gapped(IUPAC.unambiguous_dna, "-")), "TGGTCA"] rna = [Seq.Seq("AUUUCG", IUPAC.ambiguous_rna), Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna), Seq.Seq("uCAg", Alphabet.generic_rna), Seq.MutableSeq("UC-AG", Alphabet.Gapped(Alphabet.generic_rna, "-")), Seq.Seq("U.CAG", Alphabet.Gapped(Alphabet.generic_rna, ".")), "UGCAU"] nuc = [Seq.Seq("ATCG", Alphabet.generic_nucleotide),"UUUTTTACG"] protein = [Seq.Seq("ATCGPK", IUPAC.protein), Seq.Seq("atcGPK", Alphabet.generic_protein), Seq.Seq("T.CGPK", Alphabet.Gapped(IUPAC.protein, ".")), Seq.Seq("T-CGPK", Alphabet.Gapped(IUPAC.protein, "-")), Seq.Seq("MEDG-KRXR*", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")), Seq.MutableSeq("ME-K-DRXR*XU", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")), Seq.Seq("MEDG-KRXR@", Alphabet.HasStopCodon(Alphabet.Gapped(IUPAC.extended_protein, "-"), "@")), Seq.Seq("ME-KR@", Alphabet.HasStopCodon(Alphabet.Gapped(IUPAC.protein, "-"), "@")), Seq.Seq("MEDG.KRXR@", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "@"), ".")), "TEDDF"] for a in dna+rna: for b in nuc: c=a+b assert str(c) == str(a) + str(b) for a in rna: for b in rna: try: c=a+b assert str(c) == str(a) + str(b) except ValueError as e: