def duplicate_sequence(chrSeq, dupStart, dupStop, insertLoc, numDup=1, invert=False):
    """Return ``chrSeq`` with copies of one of its segments inserted.

    The segment is taken from the 1-based, inclusive range
    ``dupStart``..``dupStop``, repeated ``numDup`` times, optionally
    reversed, and spliced in after the first ``insertLoc`` letters.

    Arguments:

    o chrSeq -- The sequence to edit. Any sliceable sequence that supports
    ``+`` with a string works (a str, or presumably a Seq/MutableSeq --
    TODO confirm against the callers).

    o dupStart, dupStop -- 1-based inclusive bounds of the duplicated
    segment; converted with int().

    o insertLoc -- Number of leading letters kept before the insertion;
    converted with int().

    o numDup -- How many copies of the segment to insert (default 1).

    o invert -- When equal to True, the inserted text is reversed.

    Returns a new sequence; ``chrSeq`` itself is not modified.
    """
    insert_at = int(insertLoc)
    duplication = str(chrSeq[int(dupStart) - 1:int(dupStop)]) * int(numDup)

    # Deliberately an equality test rather than plain truthiness: the other
    # arguments are int()-converted, so values presumably may arrive as
    # strings, and a non-empty string such as "False" must not invert.
    if invert == True:  # noqa: E712
        # `duplication` is an immutable str, so it is reversed by slicing
        # (an in-place MutableSeq.reverse cannot be applied to it).
        duplication = duplication[::-1]

    return chrSeq[:insert_at] + duplication + chrSeq[insert_at:]
def get_optimal_alignment(self):
    """Rebuild the optimal alignment by walking the traceback pointers.

    Starts at the bottom corner cell of ``self.dpmatrix`` (indexed by the
    lengths of ``self.seq1`` and ``self.seq2``) and follows ``get_parent()``
    until a cell without a parent is reached.

    Returns a tuple ``(aligned_seq1, aligned_seq2)`` produced by ``toseq()``.
    """
    # the two growing (reversed) alignment rows
    row1 = MutableSeq(array.array("c"),
                      Alphabet.Gapped(IUPAC.protein, GAP_CHAR))
    row2 = MutableSeq(array.array("c"),
                      Alphabet.Gapped(IUPAC.protein, GAP_CHAR))

    # the bottom corner cell always contributes both of its items
    cell = self.dpmatrix[(len(self.seq1), len(self.seq2))]
    row1.append(cell.seq1item)
    row2.append(cell.seq2item)

    # step onto the corner's parent before entering the main loop
    cell = cell.get_parent()
    parent = cell.get_parent()

    # follow the pointers for as long as the current cell has a parent
    while parent:
        col_back = parent.col_pos == cell.col_pos - 1
        col_same = parent.col_pos == cell.col_pos
        row_back = parent.row_pos == cell.row_pos - 1
        row_same = parent.row_pos == cell.row_pos

        if col_back and row_back:
            # diagonal move: a letter from both sequences
            row1.append(cell.seq1item)
            row2.append(cell.seq2item)
        elif col_same and row_back:
            # vertical move: letter from seq2 against a gap in seq1
            row1.append(GAP_CHAR)
            row2.append(cell.seq2item)
        elif col_back and row_same:
            # horizontal move: letter from seq1 against a gap in seq2
            row1.append(cell.seq1item)
            row2.append(GAP_CHAR)

        cell = parent
        parent = cell.get_parent()

    # letters were collected back-to-front, so flip both rows
    row1.reverse()
    row2.reverse()

    return row1.toseq(), row2.toseq()
class TestMutableSeq(unittest.TestCase):
    """Exercise construction, comparison, editing and conversion of MutableSeq."""

    def setUp(self):
        """Create a fresh Seq / MutableSeq pair holding the same letters."""
        self.s = Seq.Seq("TCAAAAGGATGCATCATG")
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG")

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways."""
        mutable_s = MutableSeq("TCAAAAGGATGCATCATG")
        self.assertIsInstance(mutable_s, MutableSeq, "Creating MutableSeq")

        mutable_s = self.s.tomutable()
        self.assertIsInstance(mutable_s, MutableSeq, "Converting Seq to mutable")

        array_seq = MutableSeq(array.array("u", "TCAAAAGGATGCATCATG"))
        self.assertIsInstance(array_seq, MutableSeq, "Creating MutableSeq using array")

    def test_repr(self):
        """Test repr of a short sequence."""
        self.assertEqual("MutableSeq('TCAAAAGGATGCATCATG')", repr(self.mutable_s))

    def test_truncated_repr(self):
        """Test repr of a long sequence, which is shortened with an ellipsis."""
        seq = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        expected = (
            "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA')"
        )
        self.assertEqual(expected, repr(MutableSeq(seq)))

    def test_equal_comparison(self):
        """Test __eq__ comparison method."""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method."""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """Test __lt__ comparison method."""
        self.assertLess(self.mutable_s[:-1], self.mutable_s)

    def test_less_than_comparison_of_incompatible_types(self):
        """Test __lt__ against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s < 1

    def test_less_than_comparison_without_alphabet(self):
        """Test __lt__ against a plain string."""
        # Strict comparison: the <= case has its own test further down.
        self.assertLess(self.mutable_s[:-1], "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method."""
        self.assertLessEqual(self.mutable_s[:-1], self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_types(self):
        """Test __le__ against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s <= 1

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """Test __le__ against a plain string."""
        self.assertLessEqual(self.mutable_s[:-1], "TCAAAAGGATGCATCATG")

    def test_greater_than_comparison(self):
        """Test __gt__ comparison method."""
        self.assertGreater(self.mutable_s, self.mutable_s[:-1])

    def test_greater_than_comparison_of_incompatible_types(self):
        """Test __gt__ against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s > 1

    def test_greater_than_comparison_without_alphabet(self):
        """Test __gt__ against a plain string."""
        self.assertGreater(self.mutable_s, "TCAAAAGGATGCATCAT")

    def test_greater_than_or_equal_comparison(self):
        """Test __ge__ comparison method."""
        self.assertGreaterEqual(self.mutable_s, self.mutable_s)

    def test_greater_than_or_equal_comparison_of_incompatible_types(self):
        """Test __ge__ against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s >= 1

    def test_greater_than_or_equal_comparison_without_alphabet(self):
        """Test __ge__ against a plain string."""
        self.assertGreaterEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq."""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """Test __radd__ with another MutableSeq."""
        self.assertEqual(
            "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
            self.mutable_s.__radd__(self.mutable_s),
        )

    def test_radd_method_incompatible_alphabets(self):
        """Test __radd__ with RNA-looking letters on the left-hand side."""
        self.assertEqual(
            "UCAAAAGGATCAAAAGGATGCATCATG",
            self.mutable_s.__radd__(MutableSeq("UCAAAAGGA")),
        )

    def test_radd_method_using_seq_object(self):
        """Test __radd__ with a Seq on the left-hand side."""
        self.assertEqual(
            "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
            self.mutable_s.__radd__(self.s),
        )

    def test_radd_method_wrong_type(self):
        """Test __radd__ with an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(1234)

    def test_as_string(self):
        """Test conversion with str()."""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """Test len()."""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """Test toseq() returns a Seq."""
        self.assertIsInstance(self.mutable_s.toseq(), Seq.Seq)

    def test_first_nucleotide(self):
        """Test indexing a single letter."""
        self.assertEqual("T", self.mutable_s[0])

    def test_setting_slices(self):
        """Test slice assignment from a string, a MutableSeq and an array."""
        self.assertEqual(
            MutableSeq("CAAA"),
            self.mutable_s[1:5],
            "Slice mutable seq",
        )

        self.mutable_s[1:3] = "GAT"
        self.assertEqual(
            MutableSeq("TGATAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with string and adding extra nucleotide",
        )

        self.mutable_s[1:3] = self.mutable_s[5:7]
        self.assertEqual(
            MutableSeq("TAATAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with MutableSeq",
        )

        self.mutable_s[1:3] = array.array("u", "GAT")
        self.assertEqual(
            MutableSeq("TGATTAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with array",
        )

    def test_setting_item(self):
        """Test assigning a single letter by index."""
        self.mutable_s[3] = "G"
        self.assertEqual(MutableSeq("TCAGAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_slice(self):
        """Test deleting a one-letter slice."""
        del self.mutable_s[4:5]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_item(self):
        """Test deleting a single letter by index."""
        del self.mutable_s[3]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_appending(self):
        """Test append()."""
        self.mutable_s.append("C")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGC"), self.mutable_s)

    def test_inserting(self):
        """Test insert()."""
        self.mutable_s.insert(4, "G")
        self.assertEqual(MutableSeq("TCAAGAAGGATGCATCATG"), self.mutable_s)

    def test_popping_last_item(self):
        """Test pop() returns the last letter and removes it."""
        self.assertEqual("G", self.mutable_s.pop())
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCAT"), self.mutable_s)

    def test_remove_items(self):
        """Test remove() drops the first match and rejects a missing letter."""
        self.mutable_s.remove("G")
        self.assertEqual(
            MutableSeq("TCAAAAGATGCATCATG"), self.mutable_s, "Remove first G"
        )
        self.assertRaises(ValueError, self.mutable_s.remove, "Z")

    def test_count(self):
        """Test count() with one letter and with a two-letter pattern."""
        self.assertEqual(7, self.mutable_s.count("A"))
        self.assertEqual(2, self.mutable_s.count("AA"))

    def test_index(self):
        """Test index() for a present letter and a missing pattern."""
        self.assertEqual(2, self.mutable_s.index("A"))
        self.assertRaises(ValueError, self.mutable_s.index, "8888")

    def test_reverse(self):
        """Test using reverse method."""
        self.mutable_s.reverse()
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT"), self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride."""
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT"), self.mutable_s[::-1])

    def test_complement(self):
        """Test in-place complement of DNA."""
        self.mutable_s.complement()
        self.assertEqual("AGTTTTCCTACGTAGTAC", str(self.mutable_s))

    def test_complement_rna(self):
        """Test in-place complement of mixed-case RNA."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual("UACuuuGAC", str(seq))

    def test_complement_mixed_aphabets(self):
        """Test complement rejects a sequence containing both U and T."""
        seq = Seq.MutableSeq("AUGaaaCTG")
        with self.assertRaises(ValueError):
            seq.complement()

    def test_complement_rna_string(self):
        """Test in-place complement of an RNA string."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual("UACuuuGAC", str(seq))

    def test_complement_dna_string(self):
        """Test in-place complement of a mixed-case DNA string."""
        seq = Seq.MutableSeq("ATGaaaCTG")
        seq.complement()
        self.assertEqual("TACtttGAC", str(seq))

    def test_reverse_complement(self):
        """Test in-place reverse complement."""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_extend_method(self):
        """Test extend() with a string."""
        self.mutable_s.extend("GAT")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGGAT"), self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """Test extend() with another MutableSeq."""
        self.mutable_s.extend(MutableSeq("TTT"))
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGTTT"), self.mutable_s)

    def test_delete_stride_slice(self):
        """Test deleting a slice whose stop is written as an expression."""
        del self.mutable_s[4:6 - 1]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)."""
        self.assertEqual(MutableSeq("TAGTAA"), self.mutable_s[0::3])
        self.assertEqual(MutableSeq("CAGGTT"), self.mutable_s[1::3])
        self.assertEqual(MutableSeq("AAACCG"), self.mutable_s[2::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)."""
        self.mutable_s[2::3] = "N" * len(self.mutable_s[2::3])
        self.assertEqual(MutableSeq("TCNAANGGNTGNATNATN"), self.mutable_s)
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
    al for a full explanation -- this is where I took my implementation
    ideas from), to allow decoding of the state path, given a sequence
    of emissions.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we
    want to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns a tuple of the decoded state path (the traceback converted
    with toseq()) and the score of that path, which is a sum of the
    log-transformed probabilities.

    An empty sequence is not supported: the termination step looks up
    the scores at position -1 and fails with KeyError.
    """
    # calculate logarithms of the initial, transition, and emission probs
    log_initial = self._log_transform(self.initial_prob)
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters

    # --- recursion
    # loop over the training sequence (i = 1 .. L)
    # NOTE: My index numbers are one less than what is given in Durbin
    # et al, since we are indexing the sequence going from 0 to
    # (Length - 1) not 1 to Length, like in Durbin et al.
    for i in range(0, len(sequence)):
        # loop over all of the possible i-th states in the state path
        for cur_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(cur_state, sequence[i])]

            if i == 0:
                # for the first state, use the initial probability rather
                # than looking back to previous states
                max_prob = log_initial[cur_state]
            else:
                # loop over all possible (i-1)-th previous states
                possible_state_probs = {}
                for prev_state in self.transitions_to(cur_state):
                    # a_{kl}
                    trans_part = log_trans[(prev_state, cur_state)]

                    # v_{k}(i - 1)
                    viterbi_part = viterbi_probs[(prev_state, i - 1)]
                    cur_prob = viterbi_part + trans_part

                    possible_state_probs[prev_state] = cur_prob

                # calculate the viterbi probability using the max
                max_prob = max(possible_state_probs.values())

            # v_{k}(i)
            viterbi_probs[(cur_state, i)] = (emission_part + max_prob)

            if i > 0:
                # get the most likely prev_state leading to cur_state
                # (the first one found wins when several tie)
                for state in possible_state_probs:
                    if possible_state_probs[state] == max_prob:
                        pred_state_seq[(i - 1, cur_state)] = state
                        break

    # --- termination
    # calculate the probability of the state path
    # loop over all states
    all_probs = {}
    for state in state_letters:
        # v_{k}(L)
        all_probs[state] = viterbi_probs[(state, len(sequence) - 1)]

    state_path_prob = max(all_probs.values())

    # find the last pointer we need to trace back from
    # (no break: the last matching state wins when several tie)
    last_state = ''
    for state in all_probs:
        if all_probs[state] == state_path_prob:
            last_state = state

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    traceback_seq = MutableSeq('', state_alphabet)

    # last_state is the last state in the most probable state sequence.
    # Compute that sequence by walking backwards in time. From the i-th
    # state in the sequence, find the (i-1)-th state as the most
    # probable state preceding the i-th state.
    state = last_state
    traceback_seq.append(state)
    # reversed() works on both a list and a range object, whereas calling
    # .reverse() on the result of range() only works when it is a list.
    for i in reversed(range(1, len(sequence))):
        state = pred_state_seq[(i - 1, state)]
        traceback_seq.append(state)

    # put the traceback sequence in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
def viterbi(self, sequence, state_alphabet):
    """Decode the most probable state path with the Viterbi algorithm.

    Follows the description in Durbin et al (pgs 55-57), except that
    positions are counted from 0 to (Length - 1) instead of 1 to Length.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we
    want to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns a tuple of the decoded state path (converted with toseq())
    and the score of that path in log-transformed space.
    """
    # work with the log-transformed model parameters throughout
    initial_logs = self._log_transform(self.initial_prob)
    transition_logs = self._log_transform(self.transition_prob)
    emission_logs = self._log_transform(self.emission_prob)

    scores = {}          # (state, position) -> best score ending there
    back_pointers = {}   # (position - 1, state) -> best previous state
    states = state_alphabet.letters

    # --- recursion over every position and every candidate state
    for pos in range(0, len(sequence)):
        for state in states:
            # e_{l}(x_{i})
            emit = emission_logs[(state, sequence[pos])]

            if pos == 0:
                # the first position is seeded from the initial
                # probabilities; there is nothing to look back to
                candidates = None
                best = initial_logs[state]
            else:
                # score of arriving from each permitted previous state
                candidates = {}
                for prev in self.transitions_to(state):
                    # a_{kl}
                    step = transition_logs[(prev, state)]
                    # v_{k}(i - 1) + a_{kl}
                    candidates[prev] = scores[(prev, pos - 1)] + step
                best = max(candidates.values())

            # v_{k}(i)
            scores[(state, pos)] = (emit + best)

            if candidates is not None:
                # remember the first previous state that reaches the max
                for prev, value in candidates.items():
                    if value == best:
                        back_pointers[(pos - 1, state)] = prev
                        break

    # --- termination: v_{k}(L) for every state
    final_index = len(sequence) - 1
    final_scores = {state: scores[(state, final_index)] for state in states}
    state_path_prob = max(final_scores.values())

    # the last state holding the best final score starts the traceback
    winners = [state for state in final_scores
               if final_scores[state] == state_path_prob]
    last_state = winners[-1] if winners else ''

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback: walk the pointers from the end towards the start
    path = MutableSeq('', state_alphabet)
    current = last_state
    path.append(current)
    for pos in range(len(sequence) - 1, 0, -1):
        current = back_pointers[(pos - 1, current)]
        path.append(current)

    # states were collected last-to-first
    path.reverse()

    return path.toseq(), state_path_prob
# Tutorial-style walkthrough of MutableSeq and UnknownSeq.
# NOTE: `my_seq` is not defined in this excerpt; it has to exist already.
# The bare names on their own line (e.g. `mutable_seq`) are expression
# statements with no effect in a script -- presumably they echo the value
# in an interactive session or notebook.

# Round trip: immutable sequence -> editable copy -> immutable again.
mutable_seq = my_seq.tomutable()
mutable_seq
new_seq = mutable_seq.toseq()
new_seq

# Alternatively build a MutableSeq directly from a string.
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
mutable_seq
# Replace the letter at index 5.
mutable_seq[5] = "C"
mutable_seq
# Remove a "T" from the sequence.
mutable_seq.remove("T")
mutable_seq
# Reverse the sequence; the object itself is modified.
mutable_seq.reverse()
mutable_seq

# UnknownSeq objects
from Bio.Seq import UnknownSeq
unk = UnknownSeq(20)
unk
print(unk)
len(unk)

# Same again, this time passing an explicit alphabet.
from Bio.Seq import UnknownSeq
from Bio.Alphabet import IUPAC
unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)
unk_dna
print(unk_dna)
"AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT" + "TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT" + "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA", generic_dna)
# Translate the gene with the bacterial table, first as-is and then as a
# complete coding sequence (cds=True).
print(gene.translate(table="Bacterial"))
print(gene.translate(table="Bacterial", cds=True))

## View the codon tables
from Bio.Data import CodonTable
# Tables can be looked up by name or by numeric id.
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_id[2]
print(standard_table)
print(mito_table.start_codons)
print(mito_table.stop_codons)
print(mito_table.forward_table["ACG"])

## Mutable objects
from Bio.Seq import MutableSeq
# NOTE: IUPAC is not imported in this excerpt; presumably it is imported
# earlier in the file -- confirm.
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
print(mutable_seq)
# Replace the letter at index 5.
mutable_seq[5] = "C"
print(mutable_seq)
# Remove a "T" from the sequence.
mutable_seq.remove("T")
print(mutable_seq)
# Reverse the sequence; the object itself is modified.
mutable_seq.reverse()
print(mutable_seq)
# Convert back to an immutable sequence.
new_seq = mutable_seq.toseq()
print(new_seq)
# Small demonstration of editing a MutableSeq and freezing the result.
from Bio.Seq import Seq
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC

# Build the editable sequence directly from a string.
my_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
# my_seq[6] = "C"
# Remove a "T", then reverse; both calls modify my_seq itself.
my_seq.remove("T")
my_seq.reverse()
print(repr(my_seq))
# Convert the edited sequence back to an immutable one.
non_mutable_seq = my_seq.toseq()
#print gene
#YAAX = yaaX.translate(table='Bacterial', cds=True, to_stop=True)
#print YAAX

#playing with codon usage tables
#from Bio.Data import CodonTable
#standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
#mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
#print standard_table

#mutable seq objects
from Bio.Seq import Seq
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC

#my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
#mutable_seq = my_seq.tomutable()
#Or just create a mutable seq!
my_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
# print(x) with a single argument behaves the same on Python 2 and 3.
print(my_seq)

# Earlier experiments, kept for reference.  `my_seq_div = my_seq` only adds
# a second name for the SAME object, which is why editing my_seq_div also
# changed what `print my_seq` showed.
#my_seq_div = my_seq
#my_seq_div[5:8] = 'tag'
#print my_seq
#print my_seq_div
#my_seq_del = my_seq_div.remove("T")
#print my_seq_del

# reverse() modifies the sequence in place and returns None, so its result
# must not be assigned.  Reverse first, then bind the name to the (now
# reversed) object.  my_seq.reverse_complement() works the same way.
my_seq.reverse()
my_seq_rev = my_seq
print(my_seq_rev)

# Convert back to an immutable Seq object.  This used to read from
# my_seq_div, which is undefined now that its assignment above is commented
# out; it was only another name for my_seq.
fin_seq = my_seq.toseq()
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
    al for a full explanation -- this is where I took my implementation
    ideas from), to allow decoding of the state path, given a sequence
    of emissions.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we
    want to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns a tuple of the decoded state path (the traceback converted
    with toseq()) and the score of that path, computed as a sum of
    log-transformed probabilities.
    """
    # calculate logarithms of the transition and emission probs
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters

    # --- initialization
    #
    # NOTE: My index numbers are one less than what is given in Durbin
    # et al, since we are indexing the sequence going from 0 to
    # (Length - 1) not 1 to Length, like in Durbin et al.
    #
    # NOTE(review): these seed values (1 and 0) are written as plain
    # probabilities but are later added to log-transformed values; in log
    # space they would be log(1) = 0 and log(0) = -inf.  Left unchanged
    # because the intended start-state handling cannot be confirmed from
    # this method alone.
    # v_{0}(0) = 1
    viterbi_probs[(state_letters[0], -1)] = 1
    # v_{k}(0) = 0 for k > 0
    for state_letter in state_letters[1:]:
        viterbi_probs[(state_letter, -1)] = 0

    # --- recursion
    # loop over the training sequence (i = 1 .. L)
    for i in range(0, len(sequence)):
        # now loop over all of the letters in the state path
        for main_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(main_state, sequence[i])]

            # loop over all possible states
            # NOTE(review): the candidates come from
            # transitions_from(main_state) while the looked-up transition
            # is (cur_state -> main_state); verify this is the intended
            # direction.
            possible_state_probs = {}
            for cur_state in self.transitions_from(main_state):
                # a_{kl}
                trans_part = log_trans[(cur_state, main_state)]

                # v_{k}(i - 1)
                viterbi_part = viterbi_probs[(cur_state, i - 1)]
                cur_prob = viterbi_part + trans_part

                possible_state_probs[cur_state] = cur_prob

            # finally calculate the viterbi probability using the max
            max_prob = max(possible_state_probs.values())
            viterbi_probs[(main_state, i)] = (emission_part + max_prob)

            # now get the most likely state
            # (the first one found wins when several tie)
            for state in possible_state_probs:
                if possible_state_probs[state] == max_prob:
                    pred_state_seq[(i - 1, main_state)] = state
                    break

    # --- termination
    # calculate the probability of the state path
    # loop over all letters
    all_probs = {}
    for state in state_letters:
        # v_{k}(L)
        viterbi_part = viterbi_probs[(state, len(sequence) - 1)]
        # a_{k0}
        transition_part = log_trans[(state, state_letters[0])]

        # both terms are logarithms, so the product of the underlying
        # probabilities is their SUM
        all_probs[state] = viterbi_part + transition_part

    state_path_prob = max(all_probs.values())

    # find the last pointer we need to trace back from
    # (no break: the last matching state wins when several tie)
    last_state = ''
    for state in all_probs:
        if all_probs[state] == state_path_prob:
            last_state = state

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    traceback_seq = MutableSeq('', state_alphabet)

    cur_state = last_state
    # reversed() works on both a list and a range object, whereas calling
    # .reverse() on the result of range() only works when it is a list.
    for i in reversed(range(0, len(sequence))):
        traceback_seq.append(cur_state)

        cur_state = pred_state_seq[(i - 1, cur_state)]

    # put the traceback sequence in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
# Print the second codon position seqs[2::3] # Sequence Length Comparison seq1 = Seq("TTGTGGCCGCTCAGATCAGGCAGTTTAGGCTTA") seq2 = Seq("ATTTATAGAAATGTGGTTATTTCTTAAGCATGGC") seq1 == seq2 # Mutable sequence mut_seq = MutableSeq("TTGTGGCCGCTCAGATCAGGCAGTTTAGGCTTA") print(f'MutSeq: {mut_seq}') mut_seq[5] == "C" print(mut_seq) mut_seq.remove("T") print(mut_seq) mut_seq.reverse() print(mut_seq) !wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/SRR835775_1.first1000.fastq # Working with Fastq files for record in SeqIO.parse("SRR835775_1.first1000.fastq", "fastq"): print(record) print(record.seq) print(record.letter_annotations['phred_quality']) quals = [record.letter_annotations['phred_quality'] for record in SeqIO.parse("SRR835775_1.first1000.fastq", "fastq")]
class TestMutableSeq(unittest.TestCase):
    """Checks for MutableSeq objects that carry an explicit alphabet."""

    @staticmethod
    def _dna(letters):
        """Build a MutableSeq over the ambiguous DNA alphabet."""
        return MutableSeq(letters, IUPAC.ambiguous_dna)

    def setUp(self):
        """Create a Seq and a MutableSeq holding the same letters."""
        self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)
        self.mutable_s = self._dna("TCAAAAGGATGCATCATG")

    def test_mutableseq_creation(self):
        """A MutableSeq can be built from a string, a Seq or an array."""
        from_string = self._dna("TCAAAAGGATGCATCATG")
        self.assertIsInstance(from_string, MutableSeq, "Creating MutableSeq")

        from_seq = self.s.tomutable()
        self.assertIsInstance(from_seq, MutableSeq,
                              "Converting Seq to mutable")

        letters = array.array(array_indicator, "TCAAAAGGATGCATCATG")
        from_array = MutableSeq(letters, IUPAC.ambiguous_dna)
        self.assertIsInstance(from_array, MutableSeq,
                              "Creating MutableSeq using array")

    def test_repr(self):
        """repr() shows the letters followed by the alphabet."""
        wanted = "MutableSeq('TCAAAAGGATGCATCATG', IUPACAmbiguousDNA())"
        self.assertEqual(wanted, repr(self.mutable_s))

    def test_truncated_repr(self):
        """repr() of a long sequence is shortened with an ellipsis."""
        letters = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        wanted = "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA', IUPACAmbiguousDNA())"
        self.assertEqual(wanted, repr(self._dna(letters)))

    def test_equal_comparison(self):
        """__eq__ against a plain string."""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_equal_comparison_of_incompatible_alphabets(self):
        """__eq__ across DNA/RNA alphabets runs with warnings captured."""
        rna = MutableSeq('UCAAAAGGA', IUPAC.ambiguous_rna)
        with warnings.catch_warnings(record=True):
            self.mutable_s == rna

    def test_not_equal_comparison(self):
        """__ne__ against an unrelated string."""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """__lt__ between a prefix and the full sequence."""
        self.assertLess(self.mutable_s[:-1], self.mutable_s)

    def test_less_than_comparison_of_incompatible_alphabets(self):
        """__lt__ across DNA/RNA alphabets runs with warnings captured."""
        rna = MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] < rna

    def test_less_than_comparison_without_alphabet(self):
        """__lt__ between a prefix and a plain string."""
        self.assertLess(self.mutable_s[:-1], "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """__le__ between a prefix and the full sequence."""
        self.assertLessEqual(self.mutable_s[:-1], self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_alphabets(self):
        """__le__ across DNA/RNA alphabets runs with warnings captured."""
        rna = MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] <= rna

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """__le__ between a prefix and a plain string."""
        self.assertLessEqual(self.mutable_s[:-1], "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Adding an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """__radd__ with another MutableSeq concatenates the letters."""
        combined = self.mutable_s.__radd__(self.mutable_s)
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", combined)

    def test_radd_method_incompatible_alphabets(self):
        """__radd__ with an RNA sequence raises TypeError."""
        rna = MutableSeq("UCAAAAGGA", IUPAC.ambiguous_rna)
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(rna)

    def test_radd_method_using_seq_object(self):
        """__radd__ with a Seq concatenates the letters."""
        combined = self.mutable_s.__radd__(self.s)
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", combined)

    def test_radd_method_wrong_type(self):
        """__radd__ with an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(1234)

    def test_as_string(self):
        """str() gives back the letters."""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """len() counts the letters."""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """toseq() returns a Seq."""
        self.assertIsInstance(self.mutable_s.toseq(), Seq.Seq)

    def test_first_nucleotide(self):
        """Indexing returns a single letter."""
        self.assertEqual('T', self.mutable_s[0])

    def test_setting_slices(self):
        """Slices can be read and assigned from str, MutableSeq and array."""
        self.assertEqual(self._dna('CAAA'), self.mutable_s[1:5],
                         "Slice mutable seq")

        self.mutable_s[1:3] = "GAT"
        self.assertEqual(self._dna("TGATAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with string and adding extra nucleotide")

        self.mutable_s[1:3] = self.mutable_s[5:7]
        self.assertEqual(self._dna("TAATAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with MutableSeq")

        self.mutable_s[1:3] = array.array(array_indicator, "GAT")
        self.assertEqual(self._dna("TGATTAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with array")

    def test_setting_item(self):
        """A single letter can be replaced by index."""
        self.mutable_s[3] = "G"
        self.assertEqual(self._dna("TCAGAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_slice(self):
        """A one-letter slice can be deleted."""
        del self.mutable_s[4:5]
        self.assertEqual(self._dna("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_item(self):
        """A single letter can be deleted by index."""
        del self.mutable_s[3]
        self.assertEqual(self._dna("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_appending(self):
        """append() adds a letter at the end."""
        self.mutable_s.append("C")
        self.assertEqual(self._dna("TCAAAAGGATGCATCATGC"), self.mutable_s)

    def test_inserting(self):
        """insert() adds a letter at the given index."""
        self.mutable_s.insert(4, "G")
        self.assertEqual(self._dna("TCAAGAAGGATGCATCATG"), self.mutable_s)

    def test_popping_last_item(self):
        """pop() returns the last letter."""
        self.assertEqual("G", self.mutable_s.pop())

    def test_remove_items(self):
        """remove() drops the first match and rejects a missing letter."""
        self.mutable_s.remove("G")
        self.assertEqual(self._dna("TCAAAAGATGCATCATG"), self.mutable_s,
                         "Remove first G")
        self.assertRaises(ValueError, self.mutable_s.remove, 'Z')

    def test_count(self):
        """count() works for one letter and for a two-letter pattern."""
        self.assertEqual(7, self.mutable_s.count("A"))
        self.assertEqual(2, self.mutable_s.count("AA"))

    def test_index(self):
        """index() finds a letter and rejects a missing pattern."""
        self.assertEqual(2, self.mutable_s.index("A"))
        self.assertRaises(ValueError, self.mutable_s.index, "8888")

    def test_reverse(self):
        """reverse() flips the sequence in place."""
        self.mutable_s.reverse()
        self.assertEqual(self._dna("GTACTACGTAGGAAAACT"), self.mutable_s)

    def test_reverse_with_stride(self):
        """Slicing with a -1 stride gives the reversed sequence."""
        self.assertEqual(self._dna("GTACTACGTAGGAAAACT"),
                         self.mutable_s[::-1])

    def test_complement(self):
        """complement() rewrites DNA in place."""
        self.mutable_s.complement()
        self.assertEqual("AGTTTTCCTACGTAGTAC", str(self.mutable_s))

    def test_complement_rna(self):
        """complement() rewrites mixed-case RNA in place."""
        rna = Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna)
        rna.complement()
        self.assertEqual("UACuuuGAC", str(rna))

    def test_complement_mixed_aphabets(self):
        """complement() rejects letters mixing U and T."""
        mixed = Seq.MutableSeq("AUGaaaCTG")
        with self.assertRaises(ValueError):
            mixed.complement()

    def test_complement_rna_string(self):
        """complement() handles RNA letters without an explicit alphabet."""
        rna = Seq.MutableSeq("AUGaaaCUG")
        rna.complement()
        self.assertEqual('UACuuuGAC', str(rna))

    def test_complement_dna_string(self):
        """complement() handles DNA letters without an explicit alphabet."""
        dna = Seq.MutableSeq("ATGaaaCTG")
        dna.complement()
        self.assertEqual('TACtttGAC', str(dna))

    def test_reverse_complement(self):
        """reverse_complement() rewrites the sequence in place."""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_reverse_complement_of_protein(self):
        """reverse_complement() of a protein sequence raises ValueError."""
        protein = Seq.MutableSeq("ACTGTCGTCT", Alphabet.generic_protein)
        with self.assertRaises(ValueError):
            protein.reverse_complement()

    def test_to_string_method(self):
        """tostring() is deprecated; this test may need removing soon."""
        with warnings.catch_warnings(record=True):
            self.mutable_s.tostring()

    def test_extend_method(self):
        """extend() accepts a string."""
        self.mutable_s.extend("GAT")
        self.assertEqual(self._dna("TCAAAAGGATGCATCATGGAT"), self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """extend() accepts another MutableSeq."""
        self.mutable_s.extend(self._dna("TTT"))
        self.assertEqual(self._dna("TCAAAAGGATGCATCATGTTT"), self.mutable_s)

    def test_delete_stride_slice(self):
        """A slice whose stop is written as an expression can be deleted."""
        del self.mutable_s[4:6 - 1]
        self.assertEqual(self._dna("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Slicing with stride 3 extracts every third nucleotide."""
        self.assertEqual(self._dna("TAGTAA"), self.mutable_s[0::3])
        self.assertEqual(self._dna("CAGGTT"), self.mutable_s[1::3])
        self.assertEqual(self._dna("AAACCG"), self.mutable_s[2::3])

    def test_set_wobble_codon_to_n(self):
        """Assigning to a stride-3 slice sets every wobble position to N."""
        self.mutable_s[2::3] = "N" * len(self.mutable_s[2::3])
        self.assertEqual(self._dna("TCNAANGGNTGNATNATN"), self.mutable_s)
class TestMutableSeq(unittest.TestCase):
    """Unit tests for MutableSeq: creation, comparison, editing, complement."""

    @staticmethod
    def _dna(letters):
        """Return a MutableSeq of *letters* with the ambiguous DNA alphabet."""
        return MutableSeq(letters, IUPAC.ambiguous_dna)

    def setUp(self):
        """Create a Seq and a MutableSeq holding the same 18 letters."""
        letters = "TCAAAAGGATGCATCATG"
        self.s = Seq.Seq(letters, IUPAC.unambiguous_dna)
        self.mutable_s = MutableSeq(letters, IUPAC.ambiguous_dna)

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways."""
        letters = "TCAAAAGGATGCATCATG"

        from_string = MutableSeq(letters, IUPAC.ambiguous_dna)
        self.assertIsInstance(from_string, MutableSeq, "Creating MutableSeq")

        from_seq = self.s.tomutable()
        self.assertIsInstance(from_seq, MutableSeq,
                              "Converting Seq to mutable")

        char_array = array.array(array_indicator, letters)
        from_array = MutableSeq(char_array, IUPAC.ambiguous_dna)
        self.assertIsInstance(from_array, MutableSeq,
                              "Creating MutableSeq using array")

    def test_repr(self):
        """repr() shows the letters and the alphabet."""
        expected = "MutableSeq('TCAAAAGGATGCATCATG', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(self.mutable_s))

    def test_truncated_repr(self):
        """repr() of a long sequence is abbreviated with an ellipsis."""
        seq = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        expected = "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA', IUPACAmbiguousDNA())"
        long_seq = MutableSeq(seq, IUPAC.ambiguous_dna)
        self.assertEqual(expected, repr(long_seq))

    def test_equal_comparison(self):
        """Test __eq__ comparison method."""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_equal_comparison_of_incompatible_alphabets(self):
        """Compare DNA with RNA for equality while recording warnings."""
        # Only checks that the comparison runs; nothing is asserted.
        with warnings.catch_warnings(record=True):
            rna = MutableSeq('UCAAAAGGA', IUPAC.ambiguous_rna)
            self.mutable_s == rna

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method."""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """Test __lt__ comparison method."""
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter < self.mutable_s)

    def test_less_than_comparison_of_incompatible_alphabets(self):
        """Evaluate DNA < RNA while recording warnings."""
        with warnings.catch_warnings(record=True):
            rna = MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)
            self.mutable_s[:-1] < rna

    def test_less_than_comparison_without_alphabet(self):
        """A MutableSeq can be ordered against a plain string with <."""
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter < "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method."""
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter <= self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_alphabets(self):
        """Evaluate DNA <= RNA while recording warnings."""
        with warnings.catch_warnings(record=True):
            rna = MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)
            self.mutable_s[:-1] <= rna

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """A MutableSeq can be ordered against a plain string with <=."""
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter <= "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq."""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """__radd__ with another MutableSeq concatenates the letters."""
        joined = self.mutable_s.__radd__(self.mutable_s)
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", joined)

    def test_radd_method_incompatible_alphabets(self):
        """__radd__ rejects a sequence with an RNA alphabet."""
        with self.assertRaises(TypeError):
            rna = MutableSeq("UCAAAAGGA", IUPAC.ambiguous_rna)
            self.mutable_s.__radd__(rna)

    def test_radd_method_using_seq_object(self):
        """__radd__ with an immutable Seq concatenates the letters."""
        joined = self.mutable_s.__radd__(self.s)
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", joined)

    def test_radd_method_wrong_type(self):
        """__radd__ rejects an integer operand."""
        self.assertRaises(TypeError, self.mutable_s.__radd__, 1234)

    def test_as_string(self):
        """str() returns the bare letters."""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """len() counts the letters."""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """toseq() returns a Seq instance."""
        immutable = self.mutable_s.toseq()
        self.assertIsInstance(immutable, Seq.Seq)

    def test_first_nucleotide(self):
        """Indexing with 0 returns the first letter."""
        self.assertEqual('T', self.mutable_s[0])

    def test_setting_slices(self):
        """Slice assignment from a string, a MutableSeq and an array."""
        self.assertEqual(self._dna('CAAA'), self.mutable_s[1:5],
                         "Slice mutable seq")

        # Three letters replace two, so the sequence grows by one.
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(self._dna("TGATAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with string and adding extra nucleotide")

        replacement = self.mutable_s[5:7]
        self.mutable_s[1:3] = replacement
        self.assertEqual(self._dna("TAATAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with MutableSeq")

        self.mutable_s[1:3] = array.array(array_indicator, "GAT")
        self.assertEqual(self._dna("TGATTAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with array")

    def test_setting_item(self):
        """Assigning to a single index replaces that letter in place."""
        self.mutable_s[3] = "G"
        self.assertEqual(self._dna("TCAGAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_slice(self):
        """Deleting a one-letter slice shortens the sequence by one."""
        del self.mutable_s[4:5]
        self.assertEqual(self._dna("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_item(self):
        """Deleting a single index shortens the sequence by one."""
        del self.mutable_s[3]
        self.assertEqual(self._dna("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_appending(self):
        """append() adds one letter at the end."""
        self.mutable_s.append("C")
        self.assertEqual(self._dna("TCAAAAGGATGCATCATGC"), self.mutable_s)

    def test_inserting(self):
        """insert() places a letter before the given index."""
        self.mutable_s.insert(4, "G")
        self.assertEqual(self._dna("TCAAGAAGGATGCATCATG"), self.mutable_s)

    def test_popping_last_item(self):
        """pop() without arguments returns the final letter."""
        popped = self.mutable_s.pop()
        self.assertEqual("G", popped)

    def test_remove_items(self):
        """remove() drops the first match and rejects an absent letter."""
        self.mutable_s.remove("G")
        self.assertEqual(self._dna("TCAAAAGATGCATCATG"), self.mutable_s,
                         "Remove first G")
        with self.assertRaises(ValueError):
            self.mutable_s.remove('Z')

    def test_count(self):
        """count() reports occurrences of one- and two-letter patterns."""
        for expected, pattern in ((7, "A"), (2, "AA")):
            self.assertEqual(expected, self.mutable_s.count(pattern))

    def test_index(self):
        """index() returns the first position and rejects an absent pattern."""
        position = self.mutable_s.index("A")
        self.assertEqual(2, position)
        with self.assertRaises(ValueError):
            self.mutable_s.index("8888")

    def test_reverse(self):
        """Test the in-place reverse() method."""
        self.mutable_s.reverse()
        self.assertEqual(self._dna("GTACTACGTAGGAAAACT"), self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reversing through a slice with stride -1."""
        self.assertEqual(self._dna("GTACTACGTAGGAAAACT"),
                         self.mutable_s[::-1])

    def test_complement(self):
        """complement() rewrites the fixture in place."""
        self.mutable_s.complement()
        self.assertEqual("AGTTTTCCTACGTAGTAC", str(self.mutable_s))

    def test_complement_rna(self):
        """complement() on an RNA-alphabet sequence keeps letter case."""
        rna = Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna)
        rna.complement()
        self.assertEqual("UACuuuGAC", str(rna))

    def test_complement_mixed_aphabets(self):
        """complement() rejects a sequence containing both U and T."""
        mixed = Seq.MutableSeq("AUGaaaCTG")
        self.assertRaises(ValueError, mixed.complement)

    def test_complement_rna_string(self):
        """complement() works on RNA letters given without an alphabet."""
        rna = Seq.MutableSeq("AUGaaaCUG")
        rna.complement()
        self.assertEqual('UACuuuGAC', str(rna))

    def test_complement_dna_string(self):
        """complement() works on DNA letters given without an alphabet."""
        dna = Seq.MutableSeq("ATGaaaCTG")
        dna.complement()
        self.assertEqual('TACtttGAC', str(dna))

    def test_reverse_complement(self):
        """reverse_complement() rewrites the fixture in place."""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_reverse_complement_of_protein(self):
        """reverse_complement() rejects a protein-alphabet sequence."""
        protein = Seq.MutableSeq("ACTGTCGTCT", Alphabet.generic_protein)
        self.assertRaises(ValueError, protein.reverse_complement)

    def test_to_string_method(self):
        """Call the deprecated tostring(); this test may need removing soon."""
        # Warnings are recorded rather than shown; nothing is asserted.
        with warnings.catch_warnings(record=True):
            self.mutable_s.tostring()

    def test_extend_method(self):
        """extend() accepts a plain string."""
        self.mutable_s.extend("GAT")
        self.assertEqual(self._dna("TCAAAAGGATGCATCATGGAT"), self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """extend() accepts another MutableSeq."""
        tail = MutableSeq("TTT", IUPAC.ambiguous_dna)
        self.mutable_s.extend(tail)
        self.assertEqual(self._dna("TCAAAAGGATGCATCATGTTT"), self.mutable_s)

    def test_delete_stride_slice(self):
        """Delete a slice whose stop is written as an expression."""
        del self.mutable_s[4:6 - 1]
        self.assertEqual(self._dna("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)."""
        letters_by_offset = ((0, "TAGTAA"), (1, "CAGGTT"), (2, "AAACCG"))
        for offset, letters in letters_by_offset:
            self.assertEqual(self._dna(letters), self.mutable_s[offset::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)."""
        wobble_count = len(self.mutable_s[2::3])
        self.mutable_s[2::3] = "N" * wobble_count
        self.assertEqual(self._dna("TCNAANGGNTGNATNATN"), self.mutable_s)
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    This implements the Viterbi algorithm (see pgs 55-57 in Durbin et al
    for a full explanation -- this is where the implementation ideas come
    from), to allow decoding of the state path, given a sequence of
    emissions.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we want to
    decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated. Its first letter is used as the start state
    for initialization and as the target of the closing transition.

    Returns a tuple ``(state_path, state_path_prob)``: the decoded state
    path converted with ``toseq()``, and the best score found at
    termination. The score is accumulated by adding log-transformed
    probabilities, so it is a log-space value, not a plain probability.
    """
    # calculate logarithms of the transition and emission probs
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters

    # --- initialization
    #
    # NOTE: Index numbers are one less than what is given in Durbin
    # et al, since we are indexing the sequence going from 0 to
    # (Length - 1) not 1 to Length, like in Durbin et al.
    #
    # NOTE(review): these seeds are the raw values 1 and 0, not their
    # logarithms, although everything added to them below is
    # log-transformed. Left unchanged here -- confirm this is intended.
    #
    # v_{0}(0) = 1
    viterbi_probs[(state_letters[0], -1)] = 1
    # v_{k}(0) = 0 for k > 0
    for state_letter in state_letters[1:]:
        viterbi_probs[(state_letter, -1)] = 0

    # --- recursion
    # loop over the training sequence (i = 1 .. L)
    for i in range(0, len(sequence)):
        # now loop over all of the letters in the state path
        for main_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(main_state, sequence[i])]

            # loop over all possible states
            possible_state_probs = {}
            for cur_state in self.transitions_from(main_state):
                # a_{kl}
                trans_part = log_trans[(cur_state, main_state)]

                # v_{k}(i - 1)
                viterbi_part = viterbi_probs[(cur_state, i - 1)]
                cur_prob = viterbi_part + trans_part

                possible_state_probs[cur_state] = cur_prob

            # finally calculate the viterbi probability using the max
            max_prob = max(possible_state_probs.values())
            viterbi_probs[(main_state, i)] = (emission_part + max_prob)

            # now get the most likely state (first one reaching the max)
            for state in possible_state_probs.keys():
                if possible_state_probs[state] == max_prob:
                    pred_state_seq[(i - 1, main_state)] = state
                    break

    # --- termination
    # calculate the probability of the state path
    # loop over all letters
    all_probs = {}
    for state in state_letters:
        # v_{k}(L)
        viterbi_part = viterbi_probs[(state, len(sequence) - 1)]
        # a_{k0}
        transition_part = log_trans[(state, state_letters[0])]

        # Both terms are log-space values, so the product v_{k}(L) * a_{k0}
        # from Durbin et al becomes a sum. Multiplying two (typically
        # negative) logs would make max() favour the least likely state.
        all_probs[state] = viterbi_part + transition_part

    state_path_prob = max(all_probs.values())

    # find the last pointer we need to trace back from
    # (no break: on ties the last matching state wins, as before)
    last_state = ''
    for state in all_probs.keys():
        if all_probs[state] == state_path_prob:
            last_state = state

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    traceback_seq = MutableSeq('', state_alphabet)

    # Walk positions from last to first. reversed(range(...)) works on
    # both Python 2 lists and Python 3 range objects, unlike calling
    # .reverse() on the result of range().
    cur_state = last_state
    for i in reversed(range(0, len(sequence))):
        traceback_seq.append(cur_state)

        cur_state = pred_state_seq[(i - 1, cur_state)]

    # put the traceback sequence in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
# Demo script (Python 2 print statements): string-like use of a Seq, editing
# through MutableSeq, and wrapping a sequence in a SeqRecord.
# `seq` and `IUPAC` are expected to be defined earlier in the file.

print seq[:5] # Seq supports string-like operations such as slicing
print len(seq)
#seq[0]='C' # left disabled: Seq objects are not mutable
st=str(seq) # convert to a plain string
print st

# Editable (mutable) sequence data type
from Bio.Seq import MutableSeq
mut_seq=seq.tomutable() # convert it to a mutable sequence
print mut_seq
mut_seq[0]='C' # item assignment is allowed on a MutableSeq
print mut_seq
mut_seq=MutableSeq('ATGCCG',IUPAC.IUPACUnambiguousDNA())
# has list-like methods: append(), insert(), pop(), remove()
mut_seq[1:3]='TTT' # three letters replace two
mut_seq.reverse()
mut_seq.complement()
print mut_seq
mut_seq.reverse_complement()
print mut_seq

# Sequence-metadata data type
from Bio.SeqRecord import SeqRecord
seqrec=SeqRecord(seq,id='001', name='My Secuencia')
# 2 main attributes:
#   id: string identifier, optional, recommended
#   seq: Seq object, required
# additional attributes:
#   name, description: name and more info of sequence
#   dbxrefs: list of strings, each string an id of a DB
#   features: list of SeqFeature objects, those found in Genbank records
# Demo script (Python 2 print statements): comparing sequences, editing
# through MutableSeq, and placeholder sequences with UnknownSeq.
# `seq1`, `seq2`, `seq3` and `IUPAC` are expected to be defined earlier in
# the file.

print id(seq1) == id(seq2) # identity check: are they the same object?
print str(seq1) == str(seq2) # compare the letters by converting to string
print str(seq1) == str(seq3) # letters only; presumably seq3 is a protein whose letters resemble the DNA -- verify against its definition

# MutableSeq
from Bio.Seq import MutableSeq
mutseq = seq1.tomutable() # convert to MutableSeq
print mutseq, type(mutseq)
mutSeq = MutableSeq('CGTTTAAGCTGC',IUPAC.unambiguous_dna)
print mutSeq, type(mutSeq)
mutseq[1]='T' # impossible on a plain Seq
print mutseq
seq1 = mutseq.toseq() # convert back to Seq
mutSeq.remove('A') # remove first A
mutSeq[2:-5]='TTTT'
mutSeq.reverse() # reverse() and reverse_complement() change the object itself
print mutSeq
# MutableSeq can't be a dictionary key; Seq and string can

# UnknownSeq
# Subclass of Seq for when you know the length but not the characters; saves memory
from Bio.Seq import UnknownSeq
unk = UnknownSeq(25)
print unk, len(unk), type(unk)
unkDNA = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)
print unkDNA # N = any base
unkProt = UnknownSeq(10, alphabet=IUPAC.protein)
print unkProt # X = any amino acid
print unkDNA.complement(), unkDNA.reverse_complement()
print unkDNA.transcribe(), unkDNA.translate()