def generate_rolls(num_rolls):
    """Generate a run of dice rolls together with the states that made them.

    The simulation starts in the fair state ('F').  After every roll one more
    random number is drawn to decide whether the state changes: from 'F' to
    'L' when the draw is <= .05, and from 'L' back to 'F' when it is <= .1.

    Returns a tuple of two sequences (each produced by ``toseq()``):
        - The generated roll sequence.
        - The state sequence that generated the rolls.
    """
    # current state -> (state we may switch to, threshold for switching)
    switch_table = {'F': ('L', .05), 'L': ('F', .1)}

    rolls = MutableSeq('', DiceRollAlphabet())
    states = MutableSeq('', DiceTypeAlphabet())

    # begin in the fair state
    die = 'F'
    for _ in range(num_rolls):
        states.append(die)

        # first draw of the iteration picks the roll for the current state
        rolls.append(_loaded_dice_roll(random.random(), die))

        # second draw of the iteration decides whether we change state
        other_die, threshold = switch_table[die]
        if random.random() <= threshold:
            die = other_die

    return rolls.toseq(), states.toseq()
def random_generator(num):
    """Build two random sequences, each ``num`` letters long.

    The first sequence draws its letters from '123' and uses the ``state()``
    alphabet; the second draws from 'ACTG' and uses the ``DNA()`` alphabet.
    All letters of the first sequence are drawn before any letter of the
    second one.

    Returns a tuple ``(states, sequence)`` of the two sequences after
    conversion with ``toseq()``.
    """

    def _fill_randomly(alphabet, letters):
        # Append `num` randomly chosen letters to a fresh mutable sequence.
        filled = MutableSeq('', alphabet)
        for _ in range(num):
            filled.append(random.choice(letters))
        return filled

    state_part = _fill_randomly(state(), '123')
    dna_part = _fill_randomly(DNA(), 'ACTG')
    return state_part.toseq(), dna_part.toseq()
def random_population(genome_alphabet, genome_size, num_organisms,
                      fitness_calculator):
    """Generate a population of individuals with randomly set genomes.

    Arguments:

    o genome_alphabet -- An Alphabet object describing all of the
    possible letters that could potentially be in the genome of an
    organism.

    o genome_size -- The size of each organisms genome.

    o num_organisms -- The number of organisms we want in the population.

    o fitness_calculator -- A function that will calculate the fitness
    of the organism when given the organisms genome.

    Returns a list holding num_organisms Organism objects.

    Raises ValueError when the first letter of the alphabet is not a str,
    an int or a float.
    """
    all_orgs = []

    # a random number generator to get letters for the genome
    letter_rand = random.Random()

    letters = genome_alphabet.letters

    # figure out what type of characters are in the alphabet; only the
    # first letter is inspected
    first_letter = letters[0]
    if isinstance(first_letter, str):
        if sys.version_info[0] >= 3:
            alphabet_type = "u"  # Use unicode string on Python 3
        else:
            alphabet_type = "c"  # Use byte string on Python 2
    elif isinstance(first_letter, int):
        alphabet_type = "i"
    elif isinstance(first_letter, float):
        alphabet_type = "d"
    else:
        # Wrap the letters in a 1-tuple: with a bare tuple of letters the
        # % operator would treat every letter as a separate format argument
        # and raise TypeError instead of the intended ValueError.
        raise ValueError(
            "Alphabet type is unsupported: %s" % (letters,))

    for org_num in range(num_organisms):
        new_genome = MutableSeq(array.array(alphabet_type), genome_alphabet)

        # generate the genome randomly
        for gene_num in range(genome_size):
            new_gene = letter_rand.choice(letters)
            new_genome.append(new_gene)

        # add the new organism with this genome
        all_orgs.append(Organism(new_genome, fitness_calculator))

    return all_orgs
def random_population(genome_alphabet, genome_size, num_organisms,
                      fitness_calculator):
    """Generate a population of individuals with randomly set genomes.

    Arguments:

    o genome_alphabet -- An Alphabet object describing all of the
    possible letters that could potentially be in the genome of an
    organism.

    o genome_size -- The size of each organisms genome.

    o num_organisms -- The number of organisms we want in the population.

    o fitness_calculator -- A function that will calculate the fitness
    of the organism when given the organisms genome.

    Returns a list holding num_organisms Organism objects.

    Raises ValueError when the first letter of the alphabet is not a str,
    an int or a float.
    """
    # a random number generator to get letters for the genome
    letter_rand = random.Random()

    letters = genome_alphabet.letters
    first_letter = letters[0]

    # letter type -> array typecode, checked in this order; strings need a
    # unicode array on Python 3 and a byte-string array on Python 2
    typecodes = (
        (str, "u" if sys.version_info[0] >= 3 else "c"),
        (int, "i"),
        (float, "d"),
    )
    for letter_type, typecode in typecodes:
        if isinstance(first_letter, letter_type):
            alphabet_type = typecode
            break
    else:
        # The 1-tuple keeps a tuple of letters from being unpacked as
        # several format arguments (which would raise TypeError instead).
        raise ValueError("Alphabet type is unsupported: %s" % (letters,))

    all_orgs = []
    for org_num in range(num_organisms):
        new_genome = MutableSeq(array.array(alphabet_type), genome_alphabet)

        # generate the genome randomly
        for gene_num in range(genome_size):
            new_genome.append(letter_rand.choice(letters))

        # add the new organism with this genome
        all_orgs.append(Organism(new_genome, fitness_calculator))

    return all_orgs
class TestMutableSeq(unittest.TestCase):
    """Tests for MutableSeq creation, comparison, editing and slicing."""

    def setUp(self):
        """Create a Seq and a MutableSeq holding the same 18 letters."""
        self.s = Seq.Seq("TCAAAAGGATGCATCATG")
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG")

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways."""
        mutable_s = MutableSeq("TCAAAAGGATGCATCATG")
        self.assertIsInstance(mutable_s, MutableSeq, "Creating MutableSeq")

        mutable_s = self.s.tomutable()
        self.assertIsInstance(mutable_s, MutableSeq, "Converting Seq to mutable")

        array_seq = MutableSeq(array.array("u", "TCAAAAGGATGCATCATG"))
        self.assertIsInstance(array_seq, MutableSeq, "Creating MutableSeq using array")

    def test_repr(self):
        """Test repr of a short sequence."""
        self.assertEqual("MutableSeq('TCAAAAGGATGCATCATG')", repr(self.mutable_s))

    def test_truncated_repr(self):
        """Test repr of a long sequence, which is shortened with '...'."""
        seq = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        expected = (
            "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA')"
        )
        self.assertEqual(expected, repr(MutableSeq(seq)))

    def test_equal_comparison(self):
        """Test __eq__ comparison method."""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method."""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """Test __lt__ comparison method."""
        self.assertLess(self.mutable_s[:-1], self.mutable_s)

    def test_less_than_comparison_of_incompatible_types(self):
        """Test __lt__ against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s < 1

    def test_less_than_comparison_without_alphabet(self):
        """Test __lt__ against a plain string."""
        # Strict "<" on purpose: asserting "<=" here would only repeat
        # test_less_than_or_equal_comparison_without_alphabet.
        self.assertLess(self.mutable_s[:-1], "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method."""
        self.assertLessEqual(self.mutable_s[:-1], self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_types(self):
        """Test __le__ against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s <= 1

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """Test __le__ against a plain string."""
        self.assertLessEqual(self.mutable_s[:-1], "TCAAAAGGATGCATCATG")

    def test_greater_than_comparison(self):
        """Test __gt__ comparison method."""
        self.assertGreater(self.mutable_s, self.mutable_s[:-1])

    def test_greater_than_comparison_of_incompatible_types(self):
        """Test __gt__ against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s > 1

    def test_greater_than_comparison_without_alphabet(self):
        """Test __gt__ against a plain string."""
        self.assertGreater(self.mutable_s, "TCAAAAGGATGCATCAT")

    def test_greater_than_or_equal_comparison(self):
        """Test __ge__ comparison method."""
        self.assertGreaterEqual(self.mutable_s, self.mutable_s)

    def test_greater_than_or_equal_comparison_of_incompatible_types(self):
        """Test __ge__ against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s >= 1

    def test_greater_than_or_equal_comparison_without_alphabet(self):
        """Test __ge__ against a plain string."""
        self.assertGreaterEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq."""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """Test __radd__ with another MutableSeq."""
        self.assertEqual(
            "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
            self.mutable_s.__radd__(self.mutable_s),
        )

    def test_radd_method_incompatible_alphabets(self):
        """Test __radd__ with a sequence containing U; letters are joined as-is."""
        self.assertEqual(
            "UCAAAAGGATCAAAAGGATGCATCATG",
            self.mutable_s.__radd__(MutableSeq("UCAAAAGGA")),
        )

    def test_radd_method_using_seq_object(self):
        """Test __radd__ with a Seq object."""
        self.assertEqual(
            "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
            self.mutable_s.__radd__(self.s),
        )

    def test_radd_method_wrong_type(self):
        """Test __radd__ with an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(1234)

    def test_as_string(self):
        """Test conversion with str()."""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """Test len()."""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """Test toseq() gives a Seq."""
        self.assertIsInstance(self.mutable_s.toseq(), Seq.Seq)

    def test_first_nucleotide(self):
        """Test indexing the first letter."""
        self.assertEqual("T", self.mutable_s[0])

    def test_setting_slices(self):
        """Test assigning a string, a MutableSeq and an array to a slice."""
        self.assertEqual(
            MutableSeq("CAAA"),
            self.mutable_s[1:5],
            "Slice mutable seq",
        )

        # each assignment below works on the result of the previous one
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(
            MutableSeq("TGATAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with string and adding extra nucleotide",
        )

        self.mutable_s[1:3] = self.mutable_s[5:7]
        self.assertEqual(
            MutableSeq("TAATAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with MutableSeq",
        )

        self.mutable_s[1:3] = array.array("u", "GAT")
        self.assertEqual(
            MutableSeq("TGATTAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with array",
        )

    def test_setting_item(self):
        """Test assigning a single letter by index."""
        self.mutable_s[3] = "G"
        self.assertEqual(MutableSeq("TCAGAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_slice(self):
        """Test deleting a one-letter slice."""
        del self.mutable_s[4:5]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_item(self):
        """Test deleting a single letter by index."""
        del self.mutable_s[3]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_appending(self):
        """Test append()."""
        self.mutable_s.append("C")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGC"), self.mutable_s)

    def test_inserting(self):
        """Test insert()."""
        self.mutable_s.insert(4, "G")
        self.assertEqual(MutableSeq("TCAAGAAGGATGCATCATG"), self.mutable_s)

    def test_popping_last_item(self):
        """Test pop() returns the last letter."""
        self.assertEqual("G", self.mutable_s.pop())

    def test_remove_items(self):
        """Test remove() drops the first match and rejects a missing letter."""
        self.mutable_s.remove("G")
        self.assertEqual(
            MutableSeq("TCAAAAGATGCATCATG"), self.mutable_s, "Remove first G"
        )

        self.assertRaises(ValueError, self.mutable_s.remove, "Z")

    def test_count(self):
        """Test count() with a single letter and with a two-letter string."""
        self.assertEqual(7, self.mutable_s.count("A"))
        self.assertEqual(2, self.mutable_s.count("AA"))

    def test_index(self):
        """Test index() for a present letter and for a missing string."""
        self.assertEqual(2, self.mutable_s.index("A"))
        self.assertRaises(ValueError, self.mutable_s.index, "8888")

    def test_reverse(self):
        """Test using reverse method."""
        self.mutable_s.reverse()
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT"), self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride."""
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT"), self.mutable_s[::-1])

    def test_complement(self):
        """Test in-place complement()."""
        self.mutable_s.complement()
        self.assertEqual("AGTTTTCCTACGTAGTAC", str(self.mutable_s))

    def test_complement_rna(self):
        """Test complement() on a mixed-case sequence containing U."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual("UACuuuGAC", str(seq))

    def test_complement_mixed_aphabets(self):
        """Test complement() raises ValueError when both U and T are present."""
        seq = Seq.MutableSeq("AUGaaaCTG")
        with self.assertRaises(ValueError):
            seq.complement()

    def test_complement_rna_string(self):
        """Test complement() on a mixed-case sequence containing U."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual("UACuuuGAC", str(seq))

    def test_complement_dna_string(self):
        """Test complement() on a mixed-case sequence containing T."""
        seq = Seq.MutableSeq("ATGaaaCTG")
        seq.complement()
        self.assertEqual("TACtttGAC", str(seq))

    def test_reverse_complement(self):
        """Test in-place reverse_complement()."""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_extend_method(self):
        """Test extend() with a string."""
        self.mutable_s.extend("GAT")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGGAT"), self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """Test extend() with a MutableSeq."""
        self.mutable_s.extend(MutableSeq("TTT"))
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGTTT"), self.mutable_s)

    def test_delete_stride_slice(self):
        """Test deleting a slice whose stop is an expression (4:5)."""
        del self.mutable_s[4:6 - 1]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)."""
        self.assertEqual(MutableSeq("TAGTAA"), self.mutable_s[0::3])
        self.assertEqual(MutableSeq("CAGGTT"), self.mutable_s[1::3])
        self.assertEqual(MutableSeq("AAACCG"), self.mutable_s[2::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)."""
        self.mutable_s[2::3] = "N" * len(self.mutable_s[2::3])
        self.assertEqual(MutableSeq("TCNAANGGNTGNATNATN"), self.mutable_s)
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
    al for a full explanation -- this is where I took my implementation
    ideas from), to allow decoding of the state path, given a sequence
    of emissions.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we
    want to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns a tuple of the decoded state path (converted with toseq())
    and the score of that path, which is a sum of the log-transformed
    initial, transition and emission values.
    """
    # calculate logarithms of the initial, transition, and emission probs
    log_initial = self._log_transform(self.initial_prob)
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters

    # --- recursion
    # loop over the training sequence (i = 1 .. L)
    # NOTE: My index numbers are one less than what is given in Durbin
    # et al, since we are indexing the sequence going from 0 to
    # (Length - 1) not 1 to Length, like in Durbin et al.
    for i in range(0, len(sequence)):
        # loop over all of the possible i-th states in the state path
        for cur_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(cur_state, sequence[i])]

            if i == 0:
                # for the first state, use the initial probability rather
                # than looking back to previous states
                max_prob = log_initial[cur_state]
            else:
                # loop over all possible (i-1)-th previous states
                possible_state_probs = {}
                for prev_state in self.transitions_to(cur_state):
                    # a_{kl}
                    trans_part = log_trans[(prev_state, cur_state)]

                    # v_{k}(i - 1)
                    viterbi_part = viterbi_probs[(prev_state, i - 1)]
                    cur_prob = viterbi_part + trans_part

                    possible_state_probs[prev_state] = cur_prob

                # calculate the viterbi probability using the max
                max_prob = max(possible_state_probs.values())

            # v_{k}(i)
            viterbi_probs[(cur_state, i)] = (emission_part + max_prob)

            if i > 0:
                # get the most likely prev_state leading to cur_state
                # (the first one found when several tie)
                for state in possible_state_probs:
                    if possible_state_probs[state] == max_prob:
                        pred_state_seq[(i - 1, cur_state)] = state
                        break

    # --- termination
    # calculate the probability of the state path
    # loop over all states
    all_probs = {}
    for state in state_letters:
        # v_{k}(L)
        all_probs[state] = viterbi_probs[(state, len(sequence) - 1)]

    state_path_prob = max(all_probs.values())

    # find the last pointer we need to trace back from
    # (no break: when several states tie, the last one found is kept)
    last_state = ''
    for state in all_probs:
        if all_probs[state] == state_path_prob:
            last_state = state

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    traceback_seq = MutableSeq('', state_alphabet)

    # last_state is the last state in the most probable state sequence.
    # Compute that sequence by walking backwards in time. From the i-th
    # state in the sequence, find the (i-1)-th state as the most
    # probable state preceding the i-th state.
    state = last_state
    traceback_seq.append(state)
    # reversed() accepts both a Python 2 list and a Python 3 range object;
    # calling .reverse() on the result of range() fails on Python 3.
    for i in reversed(range(1, len(sequence))):
        state = pred_state_seq[(i - 1, state)]
        traceback_seq.append(state)

    # put the traceback sequence in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
def viterbi(self, sequence, state_alphabet):
    """Decode the most probable state path for a sequence of emissions.

    The Viterbi algorithm as described on pgs 55-57 of Durbin et al,
    which is where the implementation ideas were taken from.  Positions
    are numbered 0 .. (Length - 1) here rather than 1 .. Length as in
    Durbin et al.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we
    want to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns a tuple of the decoded state path (converted with toseq())
    and the score of that path, which is a sum of the log-transformed
    initial, transition and emission values.
    """
    # log-transform the initial, transition and emission tables
    initial_logs = self._log_transform(self.initial_prob)
    transition_logs = self._log_transform(self.transition_prob)
    emission_logs = self._log_transform(self.emission_prob)

    scores = {}         # (state, position) -> v_{k}(i)
    back_pointers = {}  # (position - 1, state) -> best previous state
    states = state_alphabet.letters

    # --- recursion
    for pos in range(len(sequence)):
        for cur_state in states:
            # e_{l}(x_{i})
            emit = emission_logs[(cur_state, sequence[pos])]

            if pos == 0:
                # nothing to look back to: start from the initial value
                scores[(cur_state, pos)] = emit + initial_logs[cur_state]
                continue

            # v_{k}(i - 1) + a_{kl} for every state that can precede us
            candidates = {}
            for prev_state in self.transitions_to(cur_state):
                step = transition_logs[(prev_state, cur_state)]
                candidates[prev_state] = scores[(prev_state, pos - 1)] + step

            best = max(candidates.values())
            scores[(cur_state, pos)] = emit + best

            # remember the first predecessor that reaches the best value
            for prev_state, value in candidates.items():
                if value == best:
                    back_pointers[(pos - 1, cur_state)] = prev_state
                    break

    # --- termination
    # v_{k}(L) for every state
    final_pos = len(sequence) - 1
    final_scores = {}
    for candidate in states:
        final_scores[candidate] = scores[(candidate, final_pos)]

    state_path_prob = max(final_scores.values())

    # the last state reaching the best final value is where we start from
    last_state = ''
    for candidate, value in final_scores.items():
        if value == state_path_prob:
            last_state = candidate

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    # Walk backwards in time from last_state, following the stored
    # pointer from the i-th state to the (i-1)-th state.
    path = MutableSeq('', state_alphabet)
    path.append(last_state)

    current = last_state
    for pos in range(len(sequence) - 1, 0, -1):
        current = back_pointers[(pos - 1, current)]
        path.append(current)

    # the path was collected end-first, so flip it around
    path.reverse()

    return path.toseq(), state_path_prob
class MuGen(object): """ performs mutations and deletion/insertion with desired porbability and desired structure. Gets a Seq object, a mutation or indel dicitonary, and the probablities for each item in those dictionaries. insertprob and deleteprob are base specefic probabilities of length 4 mualphabet is a dictionary specifying the possible mutations for each letter of the sequence alphabet. muprob gives the mutation probality for each letter of the sequence alphabet.""" def __init__(self, seq, alphaproperty=None, insertprob=None, deleteprob=None, mualphabet=None, muprob=None, mupos=None, delpos=None, inpos=None, verbose=False): try: self.occureddel = list( ) # This is to keep a history of chnges made to the reference self.occuredmu = list( ) # This is necessary for writing the haplotypes in the format self.occuredins = list() # of haplotyping software's. self.inserted_allele = list( ) # keeps track of the inserted allele to be able to get them back when needed! self.alt_allele = list() # keeps track of the substituted if not isinstance(verbose, bool): raise CustomException( "ERROR: verbose must be set to either True or False. \ Default is to False") else: self.verbose = verbose if isinstance(seq, str): if alphaproperty is None: if self.verbose: print( "WARNING: No alphabet type is specified for the sequence string!" 
) else: pass self.alphaproperty = Alphabet() else: self.alphaproperty = alphaproperty self.seq = MutableSeq(seq, self.alphaproperty) elif isinstance(seq, Seq): self.alphaproperty = seq.__getattribute__('alphabet') self.seq = seq.tomutable() elif isinstance(seq, MutableSeq): self.alphaproperty = seq.__getattribute__('alphabet') self.seq = copy.deepcopy(seq) else: raise CustomException( "ERROR: Should provide a Seq or MutableSeq object, \n \ or a string sequence!") self.alphabet = set(str(self.seq)) self.ref = str(self.seq) if not delpos: self.delpos = [] else: if set(delpos).issubset(set(range(len(self.ref)))): self.delpos = list( delpos) # Deletion by specifying the positions else: raise CustomException( "ERROR: Deletion positions exceed the range of the reference or are not positive integers!" ) if not inpos: self.inpos = [] else: if set(inpos).issubset(set(range(len(self.ref)))): self.inpos = list( inpos) # Insertion by specifying the positions else: raise CustomException( "ERROR: Insertion positions exceed the range of the reference or are not positive integers!" ) if not mupos: self.mupos = [] else: if set(mupos).issubset(set(range(len(self.ref)))): self.mupos = list( mupos) # Mutation by specifying the positions else: raise CustomException( "ERROR: Mutation positions exceed the range of the reference or are not positive integers!" ) if not mualphabet: if self.verbose: print( "WARNING: You have specified no mutation alphabet! Mutations are set to random \ letters!") self.mualphabet = dict() for key in self.alphabet: self.mualphabet[key] = ''.join(self.alphabet - { key }) # Non-specified mutations could happen to any letter else: mualphabet = dict([(str(k), str(v)) for k, v in mualphabet.iteritems()]) for key, value in mualphabet.iteritems(): if len(key) != 1: raise CustomException( "ERROR: the mutation alphabet deals with point mutations! 
Only single letters are\ allowed as keys!") elif key in set(''.join(value)): raise CustomException( "ERROR: Wrong mutation values specified! A letter could just be substituted with a\ different letter for mutation!") if set(mualphabet.keys()) == self.alphabet and set(''.join( mualphabet.values())) <= self.alphabet: self.mualphabet = copy.deepcopy(mualphabet) elif set(mualphabet.keys()) < self.alphabet and set(''.join( mualphabet.values())) < self.alphabet: if self.verbose: print( "WARNING: Mutation is not specified for some letters! Those mutations are set\ to random letters!") self.mualphabet = copy.deepcopy( mualphabet ) # Whatever has been specified for mutation alphabet is kep intact for key in self.alphabet - set(mualphabet.keys()): self.mualphabet[key] = ''.join( self.alphabet - {key} ) # Non-specified mutations could happen to any letter else: if self.verbose: print( "WARNING: Mutation alphabet is not compatible with sequence alphabet! Both alphabets are\ updated and\nunspecified mutations are set to random letters!") new_mualphabet = dict( ) # As mutation may introduce novel alleles in the sequence, alphabet is updated first for key, value in mualphabet.iteritems( ): # Whatever has been specified for mutation alphabet is kep intact self.alphabet.add( key) # Only the alphabet is updated if necessary self.alphabet |= (set(''.join(value)) - self.alphabet) new_mualphabet.update({key: value}) for key in self.alphabet - set(new_mualphabet.keys()): new_mualphabet[key] = ''.join( self.alphabet - {key} ) # Non-specified mutations could happen to any letter self.mualphabet = copy.deepcopy(new_mualphabet) if not insertprob: self.insertprob = dict( ) # If no insertprob is given, it is set to zero everywhere for key in self.alphabet: self.insertprob[key] = 0 else: if set(list(insertprob.keys())) != self.alphabet: if self.verbose: print( "WARNING: Missing/Invalid letter(s) in insertion probability!\n\ Probabilities are set to zero for missing letters! 
Invalid letters are ignored!" ) new_insertprob = dict() for key, value in insertprob.iteritems(): if value >= 0 and value <= 1: new_insertprob.update({key: value}) else: raise CustomException( "ERROR: Insertion probability must be >=0 and <=1!" ) for key in self.alphabet - set(new_insertprob.keys()): new_insertprob[key] = 0 self.insertprob = copy.deepcopy(new_insertprob) if not deleteprob: # If no deleteprob is given, it is set to zero everywhere self.deleteprob = dict() for key in self.alphabet: self.deleteprob[key] = 0 else: if set(list(deleteprob.keys())) != self.alphabet: if self.verbose: print( "WARNING: Missing/Invalid letter(s) in deletion probability!\n\ Probabilities are set to zero for missing letters! Invalid letters are ignored!" ) new_deleteprob = dict() for key, value in deleteprob.iteritems(): if value >= 0 and value <= 1: new_deleteprob.update({key: value}) else: raise CustomException( "ERROR: Deletion probability must be >=0 and <=1!") for key in self.alphabet - set(new_deleteprob.keys()): new_deleteprob[key] = 0 self.deleteprob = copy.deepcopy(new_deleteprob) if not muprob: self.muprob = dict( ) # If no muprob is given, it is set to zero everywhere for key in self.alphabet: self.muprob[key] = 0 else: if set(list(muprob.keys())) != self.alphabet: if self.verbose: print( "WARNING: Missing/Invalid letter(s) in mutation probability!\n\ Probabilities are set to zero for missing letters! Invalid letters are ignored!" ) new_muprob = dict() for key, value in muprob.iteritems(): if value >= 0 and value <= 1: new_muprob.update({key: value}) else: raise CustomException( "ERROR: Mutation probability must be >=0 and <=1!") for key in self.alphabet - set(new_muprob.keys()): new_muprob[key] = 0 self.muprob = copy.deepcopy(new_muprob) except CustomException as instance: print(instance) sys.exit(2) else: if self.verbose: print( "MuGen object successfully created.\nWARNING: MuGen sequence is case sensitive!" 
) def __repr__(self): return "Haplotype: %s, \n Reference sequence: %s, \n Mutation probabilty: %s, \n Mutations: %s, \n \ Insertion probabilty: %s, \n Deletion Probability: %s, \n \ Insertion positions: %s, \n Deletion positions: %s, \n Mutation positions: %s \n" % ( self.seq, self.ref, self.muprob, self.mualphabet, self.insertprob, self.deleteprob, self.inpos, self.delpos, self.mupos) def __str__(self): return repr(self) def get_hap(self): # Access Methods return self.seq def get_ref(self): return self.ref def get_insertprob(self): return self.insertprob def get_deleteprob(self): return self.deleteprob def get_muprob(self): return self.muprob def get_mualphabet(self): return self.mualphabet def get_mupos(self): return self.mupos def get_inpos(self): return self.inpos def get_delpos(self): return self.delpos def get_occureddelpos(self): return self.occureddel def get_occuredmupos(self): return self.occuredmu def get_occuredinspos(self): return self.occuredins def get_ins_allele(self): return self.inserted_allele def get_mu_allele(self): return self.alt_allele def set_ref(self, ref): # Modifier methods """Changes the reference sequence of the MuGen object. Could become problematic if the new reference has a different length than the current reference, while indel and mutation positions are specified. A useful method if reference is a mutable seq entity which is constantly called and changed by other methods and calsses.""" try: if set(str(ref)).issubset(self.alphabet): if not set(self.mupos).issubset(set(range(len(str(ref))))): raise CustomException( "ERROR: Mutation positions exceed the range of the new reference!" ) elif not set(self.inpos).issubset(set(range(len(str(ref))))): raise CustomException( "ERROR: Insertion positions exceed the range of the new reference!" ) elif not set(self.delpos).issubset(set(range(len(str(ref))))): raise CustomException( "ERROR: Deletion positions exceed the range of the new reference!" 
) else: self.ref = str(ref) else: raise CustomException( "ERROR: the new reference is not compatible with the current alphabet!" ) except CustomException as instance: print("Failed to update the reference!") print(instance) except: print("Failed to update the reference!") raise else: if self.verbose: print("The reference sequence has been updated!") def set_pos( self, inpos=None, delpos=None, mupos=None, ): """Changes the insertion, deletion and substitution sites of the MuGen object. A useful method if posmu and probmu methods are constantly called.""" try: changedel = 0 # If set to 1, delpos is changed. Otherwise no change to delpos. changein = 0 # If set to 1, inpos is changed. Otherwise no change to inpos. changemu = 0 # If set to 1, mupos is changed. Otherwise no change to mupos. if delpos is None: # Default is no change pass else: if set(delpos).issubset(set(range(len(self.ref)))): changedel = 1 else: raise CustomException( "ERROR: New deletion positions exceed the range of the reference or are not positive integers!" ) if inpos is None: # Deafult is no change pass else: if set(inpos).issubset(set(range(len(self.ref)))): changein = 1 else: raise CustomException( "ERROR: New insertion positions exceed the range of the reference or are not positive integers!" ) if mupos is None: # Default is no change pass else: if set(mupos).issubset(set(range(len(self.ref)))): changemu = 1 else: raise CustomException( "ERROR: New mutation positions exceed the range of the reference or are not positive integers!" 
) if changedel: self.delpos = list(delpos) # Update delpos else: pass if changein: self.inpos = list(inpos) # Update inpos else: pass if changemu: self.mupos = list(mupos) # Update mupos else: pass except CustomException as instance: print("Failed to update indel and mutation positions!") print(instance) except: print("Failed to update indel and mutation positions!") raise else: if self.verbose: print("Indel and mutation positions updated!") def set_prob(self, insertprob=None, deleteprob=None, muprob=None): """Changes the insertion, deletion and mutation probabilities of the MuGen object. A useful method if posmu and probmu methods are constantly called.""" try: noinsert = -1 nodel = -1 nomu = -1 if insertprob is None: # Default to no change noinsert = 0 elif not insertprob: noinsert = 1 elif set(list(insertprob.keys())) != self.alphabet: if self.verbose: print( "WARNING: Missing/Invalid letter(s) in insertion probability!\n\ Probabilities are set to zero for missing letters! Invalid letters are ignored!" ) new_insertprob = dict() for key, value in insertprob.iteritems(): if value >= 0 and value <= 1: new_insertprob.update({key: value}) else: raise CustomException( "ERROR: Insertion probability must be >=0 and <=1!" ) for key in self.alphabet - set(new_insertprob.keys()): new_insertprob[key] = 0 else: new_insertprob = copy.deepcopy(insertprob) if deleteprob is None: # Default to no change nodel = 0 elif not deleteprob: # If empty deleteprob is given, it is set to zero everywhere nodel = 1 elif set(list(deleteprob.keys())) != self.alphabet: if self.verbose: print( "WARNING: Missing/Invalid letter(s) in deletion probability!\n\ Probabilities are set to zero for missing letters! Invalid letters are ignored!" 
) new_deleteprob = dict() for key, value in deleteprob.iteritems(): if value >= 0 and value <= 1: new_deleteprob.update({key: value}) else: raise CustomException( "ERROR: Deletion probability must be >=0 and <=1!") for key in self.alphabet - set(new_deleteprob.keys()): new_deleteprob[key] = 0 else: new_deleteprob = copy.deepcopy(deleteprob) if muprob is None: # Default to no change nomu = 0 elif not muprob: nomu = 1 elif set(list(muprob.keys())) != self.alphabet: if self.verbose: print( "WARNING: Missing/Invalid letter(s) in mutation probability!\n\ Probabilities are set to zero for missing letters! Invalid letters are ignored!" ) new_muprob = dict() for key, value in muprob.iteritems(): if value >= 0 and value <= 1: new_muprob.update({key: value}) else: raise CustomException( "ERROR: Mutation probability must be >=0 and <=1!") for key in self.alphabet - set(new_muprob.keys()): new_muprob[key] = 0 else: new_muprob = copy.deepcopy(muprob) if nodel == 0: pass elif nodel == 1: self.deleteprob = dict() for key in self.alphabet: self.deleteprob[key] = 0 else: self.deleteprob = copy.deepcopy( new_deleteprob) # Update deleteprob if nomu == 0: pass elif nomu == 1: self.muprob = dict( ) # If empty muprob is given, it is set to zero everywhere for key in self.alphabet: self.muprob[key] = 0 else: self.muprob = copy.deepcopy(new_muprob) # Update muprob if noinsert == 0: pass elif noinsert == 1: self.insertprob = dict( ) # If empty insertprob is given, it is set to zero everywhere for key in self.alphabet: self.insertprob[key] = 0 else: self.insertprob = copy.deepcopy( new_insertprob) # Update insertprob except CustomException as instance: print(instance) print("Failed to update indel and mutation probabilities!") except: print("Failed to update indel and mutation probabilities!") raise else: if self.verbose: print("Indel and mutation probabilities successfully updated!") def set_mualphabet(self, mualphabet=None): """Changes the mutation alphabet of the MuGen object. 
A useful method if posmu and probmu methods are constantly called.""" try: if not mualphabet: if self.verbose: print( "WARNING: You have specified no mutation alphabet! Mutations are set to random \ letters!") self.mualphabet = dict() for key in self.alphabet: self.mualphabet[key] = ''.join(self.alphabet - { key }) # Non-specified mutations could happen to any letter else: mualphabet = dict([(str(k), str(v)) for k, v in mualphabet.iteritems()]) for key, value in mualphabet.iteritems(): if len(key) != 1: raise CustomException( "ERROR: the mutation alphabet deals with point mutations! Only single letters are\ allowed as keys!") elif key in set(''.join(value)): raise CustomException( "ERROR: Wrong mutation values specified! A letter could just be substituted with a\ different letter for mutation!") if set(mualphabet.keys()) == self.alphabet and set(''.join( mualphabet.values())) <= self.alphabet: self.mualphabet = copy.deepcopy(mualphabet) elif set(mualphabet.keys()) < self.alphabet and set(''.join( mualphabet.values())) < self.alphabet: if self.verbose: print( "WARNING: Mutation is not specified for some letters! Those mutations are set\ to random letters!") self.mualphabet = copy.deepcopy( mualphabet ) # Whatever has been specified for mutation alphabet is kep intact for key in self.alphabet - set(mualphabet.keys()): self.mualphabet[key] = ''.join( self.alphabet - {key} ) # Non-specified mutations could happen to any letter else: if self.verbose: print( "WARNING: Mutation alphabet is not compatible with sequence alphabet! 
Both alphabets are\ updated and\nunspecified mutations are set to random letters!") new_mualphabet = dict( ) # As mutation may introduce novel alleles in the sequence, alphabet is updated first for key, value in mualphabet.iteritems( ): # Whatever has been specified for mutation alphabet is kep intact self.alphabet.add( key) # Only the alphabet is updated if necessary self.alphabet |= (set(''.join(value)) - self.alphabet) new_mualphabet.update({key: value}) for key in self.alphabet - set(new_mualphabet.keys()): new_mualphabet[key] = ''.join( self.alphabet - {key} ) # Non-specified mutations could happen to any letter self.mualphabet = copy.deepcopy(new_mualphabet) except CustomException as instance: print(instance) print("Mualphabet could not be updated!") except: print("Mualphabet could not be updated!") raise else: if self.verbose: print("Mualphabet successfully updated!") def probmu(self): self.occuredmu = list() self.occureddel = list() self.occuredins = list() self.inserted_allele = list() self.alt_allele = list() """Operates on a MuGen object, and returns a Seq object obtained by making random changes to the reference sequence of the MuGen object, using the probabilities given to MuGen""" self.seq = [] for __site, __base in enumerate(self.ref): if __site in set(self.mupos) | set(self.inpos) | set(self.delpos): self.seq.append( __base) # No change is made at indel/mutation positions else: __prob = { 'ins': self.insertprob.get(__base), 'del': self.deleteprob.get(__base), 'sub': self.muprob.get(__base) } __error = random.choice( ['ins', 'del', 'sub', 'sub']) # An error occurs randomly: insertion or \ # deletion or substitution __rnd = float(int( random.random() * 100000)) / 100000 # The probability that this error is \ # not corrected by replication machinary is determined \ if __rnd < __prob.get( __error): # by insertprob,deleteprob and muprob if __error == 'sub': self.seq.append( random.choice(self.mualphabet.get(__base)) ) # Substitute tha letter with one 
from the mutation alphabet self.occuredmu.append( __site ) # Update the list of the sites where a mutation has occured self.alt_allele.extend([ self.seq[-1] ]) # Update the list of alternative alleles elif __error == 'ins': self.seq.append(__base) self.seq.append( random.choice(list(self.alphabet)) ) # Insert a random letter right after the letter self.occuredins.append( __site ) # Update the list of the sites after which an insertion has occured self.inserted_allele.extend([ __base + self.seq[-1] ]) # Update the list of inserted alleles else: self.occureddel.append( __site ) # Delete the letter in the progeny sequence by just not adding it else: # Update the list of the sites which are deleted in the progeny sequence self.seq.append( __base ) # No change is induced at the site in the progeny sequence self.seq = ''.join(self.seq) self.seq = MutableSeq(self.seq, self.alphaproperty) if (self.occuredins): _ins_allele = zip(self.occuredins, self.inserted_allele) _ins_allele.sort(key=lambda tup: tup[ 0]) # Sort the occured change positions in ascending order self.occuredins, self.inserted_allele = zip(*_ins_allele) self.occuredins = list(self.occuredins) self.inserted_allele = list(self.inserted_allele) _ins_allele = None else: self.inserted_allele = [] self.occuredins = [] if (self.occuredmu): _alt_allele = zip(self.occuredmu, self.alt_allele) _alt_allele.sort(key=lambda tup: tup[0]) self.occuredmu, self.alt_allele = zip(*_alt_allele) self.occuredmu = list(self.occuredmu) self.alt_allele = list(self.alt_allele) _alt_allele = None else: self.occuredmu = [] self.alt_allele = [] if (self.occureddel): self.occureddel.sort() else: self.occureddel = [] if self.verbose: print( "WARNING: If indel/mutation positions are specified, MuGen.probmu() makes no change at those sites. 
\n \ Use MuGen.posmu() or Mugen.hapchanger() to apply changes at those sites!") print("Changes made to the haplotype!") def posmu(self): """Operates on a MuGen object, and returns a Seq object obtained by making specefic changes at specefic locations on the reference sequence of the MuGen object, using the indel and mutation positions already given to MuGen""" __change = [None] * len(self.ref) self.occuredmu = list() self.occureddel = list() self.occuredins = list() self.inserted_allele = list( ) # Preservation and change site are determined self.alt_allele = list() for __site in self.inpos: # Preservation and change site are determined __change[__site] = 'ins' # with respect to the reference seq for __site in self.delpos: # type of the change is also specified __change[__site] = 'del' # The substituion base at the for __site in self.mupos: # specified position is determined __change[__site] = 'sub' # from the mutation alphabet. self.seq = [] for __site, __error in iter(zip(range(len(self.ref)), __change)): __base = self.ref[__site] if __error is None: self.seq.append(__base) elif __error == 'sub': self.seq.append( random.choice(self.mualphabet.get(__base)) ) # Substitute tha letter with one from the mutation alphabet self.occuredmu.append( __site ) # Update the list of the sites where a mutation has occured self.alt_allele.extend( [self.seq[-1]]) # Update the list of alternative alleles elif __error == 'ins': self.seq.append(__base) self.seq.append(random.choice( list(self.alphabet ))) # Insert a random letter right after the letter self.occuredins.append( __site ) # Update the list of the sites after which an insertion has occured self.inserted_allele.extend([ __base + self.seq[-1] ]) # Update the list of inserted alleles else: self.occureddel.append( __site ) # Delete the letter in the progeny sequence by just not adding it self.seq = ''.join(self.seq) self.seq = MutableSeq( self.seq, self.alphaproperty ) # Update the list of the sites which are deleted in the 
progeny sequence if self.occuredins: _ins_allele = zip(self.occuredins, self.inserted_allele) _ins_allele.sort( key=lambda tup: tup[0]) # Sort the occured change positions self.occuredins, self.inserted_allele = zip(*_ins_allele) self.occuredins = list(self.occuredins) self.inserted_allele = list(self.inserted_allele) _ins_allele = None else: self.inserted_allele = [] self.occuredins = [] if (self.occuredmu): _alt_allele = zip(self.occuredmu, self.alt_allele) _alt_allele.sort(key=lambda tup: tup[0]) self.occuredmu, self.alt_allele = zip(*_alt_allele) self.occuredmu = list(self.occuredmu) self.alt_allele = list(self.alt_allele) _alt_allele = None else: self.occuredmu = [] self.alt_allele = [] if (self.occureddel): self.occureddel.sort() else: self.occureddel = [] if self.verbose: print( "WARNING: if there are overlaps betweeen deletion, insertion and mutation positions, \n \ just one of the changes takes place with the following priority: \n \ 1)Mutation 2)Deletion 3)Insertion. \n") print("Changes made to the haplotype!") def hapchanger(self): """Operates on a MuGen object, and returns a Seq object obtained by making random and specified changes to the reference sequence of the MuGen object, using the probabilities as well as the positions given to MuGen.""" self.seq = [] self.occuredmu = list() self.occureddel = list() self.occuredins = list() self.inserted_allele = list() self.alt_allele = list() for __site, __base in enumerate(self.ref): if __site in set( self.mupos ): # Making specified changes at the specified positions self.seq.append( random.choice(self.mualphabet.get(__base)) ) # Induce mutation at the site whose position is given self.occuredmu.append( __site ) # Update the list of the sites where a mutation has occured self.alt_allele.extend( [self.seq[-1]]) # Update the list of alternative alleles elif __site in set(self.inpos): self.seq.append( __base ) # Make an insertion right after the site whose position is given 
self.seq.append(random.choice(list(self.alphabet))) self.occuredins.append( __site ) # Update the list of the sites after which an insertion has occured self.inserted_allele.extend([ __base + self.seq[-1] ]) # Update the list of inserted alleles elif __site in set(self.delpos): self.occureddel.append( __site) # Update the list of the sited with deleted letter else: # If not change is specified at the position, \ # make a random change according to the prob model __prob = { 'ins': self.insertprob.get(__base), 'del': self.deleteprob.get(__base), 'sub': self.muprob.get(__base) } __error = random.choice( ['ins', 'del', 'sub', 'sub']) # An error occurs randomly: insertion or \ # deletion or substitution __rnd = float(int( random.random() * 100000)) / 100000 # The probability that this error is \ # not corrected by replication machinary is determined \ if __rnd < __prob.get( __error): # by insertprob,deleteprob and muprob if __error == 'sub': self.seq.append( random.choice(self.mualphabet.get(__base))) self.occuredmu.append( __site ) # Update the list of the sites where a mutation has occured self.alt_allele.extend([ self.seq[-1] ]) # Update the list of alternative alleles elif __error == 'ins': self.seq.append(__base) self.seq.append(random.choice(list(self.alphabet))) self.occuredins.append( __site ) # Update the list of the sites after which an insertion has occured self.inserted_allele.extend([ __base + self.seq[-1] ]) # Update the list of inserted alleles elif __error == 'del': self.occureddel.append( __site ) # Update the list of the sited with deleted letter else: self.seq.append(__base) self.seq = ''.join(self.seq) self.seq = MutableSeq(self.seq, self.alphaproperty) if (self.occuredins): _ins_allele = zip(self.occuredins, self.inserted_allele) _ins_allele.sort( key=lambda tup: tup[0]) # Sort the occured change positions self.occuredins, self.inserted_allele = zip(*_ins_allele) self.occuredins = list(self.occuredins) self.inserted_allele = 
list(self.inserted_allele) _ins_allele = None else: self.inserted_allele = [] self.occuredins = [] if (self.occuredmu): _alt_allele = zip(self.occuredmu, self.alt_allele) _alt_allele.sort(key=lambda tup: tup[0]) self.occuredmu, self.alt_allele = zip(*_alt_allele) self.occuredmu = list(self.occuredmu) self.alt_allele = list(self.alt_allele) _alt_allele = None else: self.occuredmu = [] self.alt_allele = [] if (self.occureddel): self.occureddel.sort() else: self.occureddel = [] if self.verbose: print("Changes made to the haplotype!")
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
    al for a full explanation -- this is where I took my implementation
    ideas from), to allow decoding of the state path, given a sequence
    of emissions.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we
    want to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns:

    - The decoded state path (the result of toseq() on a MutableSeq
      built over state_alphabet).

    - The score computed for that path in the termination step.
    """
    # calculate logarithms of the transition and emission probs
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters
    seq_length = len(sequence)

    # --- initialization
    #
    # NOTE: My index numbers are one less than what is given in Durbin
    # et al, since we are indexing the sequence going from 0 to
    # (Length - 1) not 1 to Length, like in Durbin et al.
    #
    # v_{0}(0) = 1
    viterbi_probs[(state_letters[0], -1)] = 1
    # v_{k}(0) = 0 for k > 0
    for state_letter in state_letters[1:]:
        viterbi_probs[(state_letter, -1)] = 0

    # --- recursion
    # loop over the training sequence (i = 1 .. L)
    for i in range(seq_length):
        # now loop over all of the letters in the state path
        for main_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(main_state, sequence[i])]

            # loop over all possible states
            possible_state_probs = {}
            for cur_state in self.transitions_from(main_state):
                # a_{kl}
                trans_part = log_trans[(cur_state, main_state)]
                # v_{k}(i - 1)
                viterbi_part = viterbi_probs[(cur_state, i - 1)]
                possible_state_probs[cur_state] = viterbi_part + trans_part

            # finally calculate the viterbi probability using the max
            max_prob = max(possible_state_probs.values())
            viterbi_probs[(main_state, i)] = emission_part + max_prob

            # now get the most likely state (first one reaching the max)
            for state in possible_state_probs:
                if possible_state_probs[state] == max_prob:
                    pred_state_seq[(i - 1, main_state)] = state
                    break

    # --- termination
    # calculate the probability of the state path
    # loop over all letters
    all_probs = {}
    for state in state_letters:
        # v_{k}(L)
        viterbi_part = viterbi_probs[(state, seq_length - 1)]
        # a_{k0}
        transition_part = log_trans[(state, state_letters[0])]
        # NOTE(review): both factors come from log-transformed tables yet
        # are multiplied rather than added; left unchanged because callers
        # may rely on the current values -- please confirm the intent.
        all_probs[state] = viterbi_part * transition_part

    state_path_prob = max(all_probs.values())

    # find the last pointer we need to trace back from; when several
    # states tie, the last one visited is used
    last_state = ''
    for state in all_probs:
        if all_probs[state] == state_path_prob:
            last_state = state

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    traceback_seq = MutableSeq('', state_alphabet)

    cur_state = last_state
    # reversed() works on both a Python 2 list and a Python 3 range object,
    # whereas range(...).reverse() only exists on Python 2
    for i in reversed(range(seq_length)):
        traceback_seq.append(cur_state)
        cur_state = pred_state_seq[(i - 1, cur_state)]

    # put the traceback sequence in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
class TestMutableSeq(unittest.TestCase):
    # Exercises construction, comparison, concatenation, in-place editing and
    # (reverse-)complementing of MutableSeq.  Every test starts from the same
    # 18-letter DNA sequence created in setUp().

    def setUp(self):
        self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)

    def _dna(self, letters):
        # Build the expected value: a MutableSeq over the ambiguous DNA alphabet.
        return MutableSeq(letters, IUPAC.ambiguous_dna)

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways"""
        from_string = self._dna("TCAAAAGGATGCATCATG")
        self.assertIsInstance(from_string, MutableSeq, "Creating MutableSeq")

        from_seq = self.s.tomutable()
        self.assertIsInstance(from_seq, MutableSeq,
                              "Converting Seq to mutable")

        letters = array.array(array_indicator, "TCAAAAGGATGCATCATG")
        from_array = MutableSeq(letters, IUPAC.ambiguous_dna)
        self.assertIsInstance(from_array, MutableSeq,
                              "Creating MutableSeq using array")

    def test_repr(self):
        expected = "MutableSeq('TCAAAAGGATGCATCATG', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(self.mutable_s))

    def test_truncated_repr(self):
        long_seq = self._dna(
            "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA")
        expected = "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(long_seq))

    def test_equal_comparison(self):
        """Test __eq__ comparison method"""
        same_letters = "TCAAAAGGATGCATCATG"
        self.assertEqual(self.mutable_s, same_letters)

    def test_equal_comparison_of_incompatible_alphabets(self):
        # Only checks that the comparison runs; warnings are captured.
        with warnings.catch_warnings(record=True):
            rna = MutableSeq('UCAAAAGGA', IUPAC.ambiguous_rna)
            self.mutable_s == rna

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method"""
        unrelated = "other thing"
        self.assertNotEqual(self.mutable_s, unrelated)

    def test_less_than_comparison(self):
        """Test __lt__ comparison method"""
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter < self.mutable_s)

    def test_less_than_comparison_of_incompatible_alphabets(self):
        # Only checks that the comparison runs; warnings are captured.
        with warnings.catch_warnings(record=True):
            shorter = self.mutable_s[:-1]
            rna = MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)
            shorter < rna

    def test_less_than_comparison_without_alphabet(self):
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter < "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method"""
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter <= self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_alphabets(self):
        # Only checks that the comparison runs; warnings are captured.
        with warnings.catch_warnings(record=True):
            shorter = self.mutable_s[:-1]
            rna = MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)
            shorter <= rna

    def test_less_than_or_equal_comparison_without_alphabet(self):
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter <= "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq"""
        self.assertRaises(TypeError, lambda: self.mutable_s + 1234)

    def test_radd_method(self):
        doubled = self.mutable_s.__radd__(self.mutable_s)
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", doubled)

    def test_radd_method_incompatible_alphabets(self):
        with self.assertRaises(TypeError):
            rna = MutableSeq("UCAAAAGGA", IUPAC.ambiguous_rna)
            self.mutable_s.__radd__(rna)

    def test_radd_method_using_seq_object(self):
        doubled = self.mutable_s.__radd__(self.s)
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", doubled)

    def test_radd_method_wrong_type(self):
        self.assertRaises(TypeError, lambda: self.mutable_s.__radd__(1234))

    def test_as_string(self):
        as_text = str(self.mutable_s)
        self.assertEqual("TCAAAAGGATGCATCATG", as_text)

    def test_length(self):
        size = len(self.mutable_s)
        self.assertEqual(18, size)

    def test_converting_to_immutable(self):
        frozen = self.mutable_s.toseq()
        self.assertIsInstance(frozen, Seq.Seq)

    def test_first_nucleotide(self):
        first = self.mutable_s[0]
        self.assertEqual('T', first)

    def test_setting_slices(self):
        self.assertEqual(self._dna('CAAA'), self.mutable_s[1:5],
                         "Slice mutable seq")

        # A three-letter string replaces two letters, so the length grows.
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(self._dna("TGATAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with string and adding extra nucleotide")

        replacement = self.mutable_s[5:7]
        self.mutable_s[1:3] = replacement
        self.assertEqual(self._dna("TAATAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with MutableSeq")

        self.mutable_s[1:3] = array.array(array_indicator, "GAT")
        self.assertEqual(self._dna("TGATTAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with array")

    def test_setting_item(self):
        self.mutable_s[3] = "G"
        expected = self._dna("TCAGAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_deleting_slice(self):
        del self.mutable_s[4:5]
        expected = self._dna("TCAAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_deleting_item(self):
        del self.mutable_s[3]
        expected = self._dna("TCAAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_appending(self):
        self.mutable_s.append("C")
        expected = self._dna("TCAAAAGGATGCATCATGC")
        self.assertEqual(expected, self.mutable_s)

    def test_inserting(self):
        self.mutable_s.insert(4, "G")
        expected = self._dna("TCAAGAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_popping_last_item(self):
        popped = self.mutable_s.pop()
        self.assertEqual("G", popped)

    def test_remove_items(self):
        self.mutable_s.remove("G")
        self.assertEqual(self._dna("TCAAAAGATGCATCATG"), self.mutable_s,
                         "Remove first G")
        with self.assertRaises(ValueError):
            self.mutable_s.remove('Z')

    def test_count(self):
        for expected, pattern in ((7, "A"), (2, "AA")):
            self.assertEqual(expected, self.mutable_s.count(pattern))

    def test_index(self):
        self.assertEqual(2, self.mutable_s.index("A"))
        with self.assertRaises(ValueError):
            self.mutable_s.index("8888")

    def test_reverse(self):
        """Test using reverse method"""
        self.mutable_s.reverse()
        expected = self._dna("GTACTACGTAGGAAAACT")
        self.assertEqual(expected, self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride"""
        expected = self._dna("GTACTACGTAGGAAAACT")
        self.assertEqual(expected, self.mutable_s[::-1])

    def test_complement(self):
        self.mutable_s.complement()
        self.assertEqual(str("AGTTTTCCTACGTAGTAC"), str(self.mutable_s))

    def test_complement_rna(self):
        rna = Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna)
        rna.complement()
        self.assertEqual(str("UACuuuGAC"), str(rna))

    def test_complement_mixed_aphabets(self):
        # The letters mix U and T, so complementing is expected to fail.
        mixed = Seq.MutableSeq("AUGaaaCTG")
        self.assertRaises(ValueError, mixed.complement)

    def test_complement_rna_string(self):
        rna = Seq.MutableSeq("AUGaaaCUG")
        rna.complement()
        self.assertEqual('UACuuuGAC', str(rna))

    def test_complement_dna_string(self):
        dna = Seq.MutableSeq("ATGaaaCTG")
        dna.complement()
        self.assertEqual('TACtttGAC', str(dna))

    def test_reverse_complement(self):
        self.mutable_s.reverse_complement()
        as_text = str(self.mutable_s)
        self.assertEqual("CATGATGCATCCTTTTGA", as_text)

    def test_reverse_complement_of_protein(self):
        protein = Seq.MutableSeq("ACTGTCGTCT", Alphabet.generic_protein)
        self.assertRaises(ValueError, protein.reverse_complement)

    def test_to_string_method(self):
        """This method is currently deprecated, probably will need to remove this test soon"""
        with warnings.catch_warnings(record=True):
            self.mutable_s.tostring()

    def test_extend_method(self):
        self.mutable_s.extend("GAT")
        expected = self._dna("TCAAAAGGATGCATCATGGAT")
        self.assertEqual(expected, self.mutable_s)

    def test_extend_with_mutable_seq(self):
        tail = self._dna("TTT")
        self.mutable_s.extend(tail)
        expected = self._dna("TCAAAAGGATGCATCATGTTT")
        self.assertEqual(expected, self.mutable_s)

    def test_delete_stride_slice(self):
        stop = 6 - 1
        del self.mutable_s[4:stop]
        expected = self._dna("TCAAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)"""
        for offset, letters in enumerate(("TAGTAA", "CAGGTT", "AAACCG")):
            self.assertEqual(self._dna(letters), self.mutable_s[offset::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)"""
        wobble = self.mutable_s[2::3]
        self.mutable_s[2::3] = "N" * len(wobble)
        expected = self._dna("TCNAANGGNTGNATNATN")
        self.assertEqual(expected, self.mutable_s)
class TestMutableSeq(unittest.TestCase):
    # Exercises construction, comparison, concatenation, in-place editing and
    # (reverse-)complementing of MutableSeq.  Every test starts from the same
    # 18-letter DNA sequence created in setUp().

    def setUp(self):
        self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)

    def _dna(self, letters):
        # Build the expected value: a MutableSeq over the ambiguous DNA alphabet.
        return MutableSeq(letters, IUPAC.ambiguous_dna)

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways"""
        from_string = self._dna("TCAAAAGGATGCATCATG")
        self.assertIsInstance(from_string, MutableSeq, "Creating MutableSeq")

        from_seq = self.s.tomutable()
        self.assertIsInstance(from_seq, MutableSeq,
                              "Converting Seq to mutable")

        letters = array.array(array_indicator, "TCAAAAGGATGCATCATG")
        from_array = MutableSeq(letters, IUPAC.ambiguous_dna)
        self.assertIsInstance(from_array, MutableSeq,
                              "Creating MutableSeq using array")

    def test_repr(self):
        expected = "MutableSeq('TCAAAAGGATGCATCATG', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(self.mutable_s))

    def test_truncated_repr(self):
        long_seq = self._dna(
            "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA")
        expected = "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(long_seq))

    def test_equal_comparison(self):
        """Test __eq__ comparison method"""
        same_letters = "TCAAAAGGATGCATCATG"
        self.assertEqual(self.mutable_s, same_letters)

    def test_equal_comparison_of_incompatible_alphabets(self):
        # Only checks that the comparison runs; warnings are captured.
        with warnings.catch_warnings(record=True):
            rna = MutableSeq('UCAAAAGGA', IUPAC.ambiguous_rna)
            self.mutable_s == rna

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method"""
        unrelated = "other thing"
        self.assertNotEqual(self.mutable_s, unrelated)

    def test_less_than_comparison(self):
        """Test __lt__ comparison method"""
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter < self.mutable_s)

    def test_less_than_comparison_of_incompatible_alphabets(self):
        # Only checks that the comparison runs; warnings are captured.
        with warnings.catch_warnings(record=True):
            shorter = self.mutable_s[:-1]
            rna = MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)
            shorter < rna

    def test_less_than_comparison_without_alphabet(self):
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter < "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method"""
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter <= self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_alphabets(self):
        # Only checks that the comparison runs; warnings are captured.
        with warnings.catch_warnings(record=True):
            shorter = self.mutable_s[:-1]
            rna = MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)
            shorter <= rna

    def test_less_than_or_equal_comparison_without_alphabet(self):
        shorter = self.mutable_s[:-1]
        self.assertTrue(shorter <= "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq"""
        self.assertRaises(TypeError, lambda: self.mutable_s + 1234)

    def test_radd_method(self):
        doubled = self.mutable_s.__radd__(self.mutable_s)
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", doubled)

    def test_radd_method_incompatible_alphabets(self):
        with self.assertRaises(TypeError):
            rna = MutableSeq("UCAAAAGGA", IUPAC.ambiguous_rna)
            self.mutable_s.__radd__(rna)

    def test_radd_method_using_seq_object(self):
        doubled = self.mutable_s.__radd__(self.s)
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", doubled)

    def test_radd_method_wrong_type(self):
        self.assertRaises(TypeError, lambda: self.mutable_s.__radd__(1234))

    def test_as_string(self):
        as_text = str(self.mutable_s)
        self.assertEqual("TCAAAAGGATGCATCATG", as_text)

    def test_length(self):
        size = len(self.mutable_s)
        self.assertEqual(18, size)

    def test_converting_to_immutable(self):
        frozen = self.mutable_s.toseq()
        self.assertIsInstance(frozen, Seq.Seq)

    def test_first_nucleotide(self):
        first = self.mutable_s[0]
        self.assertEqual('T', first)

    def test_setting_slices(self):
        self.assertEqual(self._dna('CAAA'), self.mutable_s[1:5],
                         "Slice mutable seq")

        # A three-letter string replaces two letters, so the length grows.
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(self._dna("TGATAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with string and adding extra nucleotide")

        replacement = self.mutable_s[5:7]
        self.mutable_s[1:3] = replacement
        self.assertEqual(self._dna("TAATAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with MutableSeq")

        self.mutable_s[1:3] = array.array(array_indicator, "GAT")
        self.assertEqual(self._dna("TGATTAAAGGATGCATCATG"), self.mutable_s,
                         "Set slice with array")

    def test_setting_item(self):
        self.mutable_s[3] = "G"
        expected = self._dna("TCAGAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_deleting_slice(self):
        del self.mutable_s[4:5]
        expected = self._dna("TCAAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_deleting_item(self):
        del self.mutable_s[3]
        expected = self._dna("TCAAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_appending(self):
        self.mutable_s.append("C")
        expected = self._dna("TCAAAAGGATGCATCATGC")
        self.assertEqual(expected, self.mutable_s)

    def test_inserting(self):
        self.mutable_s.insert(4, "G")
        expected = self._dna("TCAAGAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_popping_last_item(self):
        popped = self.mutable_s.pop()
        self.assertEqual("G", popped)

    def test_remove_items(self):
        self.mutable_s.remove("G")
        self.assertEqual(self._dna("TCAAAAGATGCATCATG"), self.mutable_s,
                         "Remove first G")
        with self.assertRaises(ValueError):
            self.mutable_s.remove('Z')

    def test_count(self):
        for expected, pattern in ((7, "A"), (2, "AA")):
            self.assertEqual(expected, self.mutable_s.count(pattern))

    def test_index(self):
        self.assertEqual(2, self.mutable_s.index("A"))
        with self.assertRaises(ValueError):
            self.mutable_s.index("8888")

    def test_reverse(self):
        """Test using reverse method"""
        self.mutable_s.reverse()
        expected = self._dna("GTACTACGTAGGAAAACT")
        self.assertEqual(expected, self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride"""
        expected = self._dna("GTACTACGTAGGAAAACT")
        self.assertEqual(expected, self.mutable_s[::-1])

    def test_complement(self):
        self.mutable_s.complement()
        self.assertEqual(str("AGTTTTCCTACGTAGTAC"), str(self.mutable_s))

    def test_complement_rna(self):
        rna = Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna)
        rna.complement()
        self.assertEqual(str("UACuuuGAC"), str(rna))

    def test_complement_mixed_aphabets(self):
        # The letters mix U and T, so complementing is expected to fail.
        mixed = Seq.MutableSeq("AUGaaaCTG")
        self.assertRaises(ValueError, mixed.complement)

    def test_complement_rna_string(self):
        rna = Seq.MutableSeq("AUGaaaCUG")
        rna.complement()
        self.assertEqual('UACuuuGAC', str(rna))

    def test_complement_dna_string(self):
        dna = Seq.MutableSeq("ATGaaaCTG")
        dna.complement()
        self.assertEqual('TACtttGAC', str(dna))

    def test_reverse_complement(self):
        self.mutable_s.reverse_complement()
        as_text = str(self.mutable_s)
        self.assertEqual("CATGATGCATCCTTTTGA", as_text)

    def test_reverse_complement_of_protein(self):
        protein = Seq.MutableSeq("ACTGTCGTCT", Alphabet.generic_protein)
        self.assertRaises(ValueError, protein.reverse_complement)

    def test_to_string_method(self):
        """This method is currently deprecated, probably will need to remove this test soon"""
        with warnings.catch_warnings(record=True):
            self.mutable_s.tostring()

    def test_extend_method(self):
        self.mutable_s.extend("GAT")
        expected = self._dna("TCAAAAGGATGCATCATGGAT")
        self.assertEqual(expected, self.mutable_s)

    def test_extend_with_mutable_seq(self):
        tail = self._dna("TTT")
        self.mutable_s.extend(tail)
        expected = self._dna("TCAAAAGGATGCATCATGTTT")
        self.assertEqual(expected, self.mutable_s)

    def test_delete_stride_slice(self):
        stop = 6 - 1
        del self.mutable_s[4:stop]
        expected = self._dna("TCAAAGGATGCATCATG")
        self.assertEqual(expected, self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)"""
        for offset, letters in enumerate(("TAGTAA", "CAGGTT", "AAACCG")):
            self.assertEqual(self._dna(letters), self.mutable_s[offset::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)"""
        wobble = self.mutable_s[2::3]
        self.mutable_s[2::3] = "N" * len(wobble)
        expected = self._dna("TCNAANGGNTGNATNATN")
        self.assertEqual(expected, self.mutable_s)
def get_optimal_alignment(self):
    """Follow the traceback to get the optimal alignment.

    Starts at the bottom corner cell of self.dpmatrix, i.e. the one keyed
    by (len(self.seq1), len(self.seq2)), and follows get_parent() links
    until a cell without a parent is reached.  The letters are collected
    back to front and reversed at the end.

    Returns:

    - The aligned version of the first sequence.

    - The aligned version of the second sequence.

    Both are the result of toseq() on MutableSeq objects over a gapped
    protein alphabet using GAP_CHAR as the gap letter.
    """
    # initialize the two sequences which will return the alignment.
    # They start from an empty string rather than array.array("c"),
    # because the "c" typecode does not exist on Python 3.
    align_seq1 = MutableSeq('', Alphabet.Gapped(IUPAC.protein, GAP_CHAR))
    align_seq2 = MutableSeq('', Alphabet.Gapped(IUPAC.protein, GAP_CHAR))

    # take care of the initial case with the bottom corner matrix item
    current_cell = self.dpmatrix[(len(self.seq1), len(self.seq2))]
    align_seq1.append(current_cell.seq1item)
    align_seq2.append(current_cell.seq2item)

    current_cell = current_cell.get_parent()
    # A corner cell without a parent means there is nothing left to trace;
    # skip the loop instead of calling get_parent() on None.
    next_cell = None
    if current_cell is not None:
        next_cell = current_cell.get_parent()

    # keep adding sequence until we reach a cell without a parent
    while next_cell:
        same_col = next_cell.col_pos == current_cell.col_pos
        same_row = next_cell.row_pos == current_cell.row_pos
        prev_col = next_cell.col_pos == current_cell.col_pos - 1
        prev_row = next_cell.row_pos == current_cell.row_pos - 1

        if prev_col and prev_row:
            # 1. Move up diagonally, add a new seq1 and seq2 to the
            # aligned sequences
            align_seq1.append(current_cell.seq1item)
            align_seq2.append(current_cell.seq2item)
        elif same_col and prev_row:
            # 2. Move upwards, add a new seq2 and a gap in seq1
            align_seq1.append(GAP_CHAR)
            align_seq2.append(current_cell.seq2item)
        elif prev_col and same_row:
            # 3. Move one column back in the same row, add a new seq1
            # and a gap in seq2
            align_seq1.append(current_cell.seq1item)
            align_seq2.append(GAP_CHAR)
        # any other parent position adds nothing to the alignment

        # now move on to the next cell
        current_cell = next_cell
        next_cell = current_cell.get_parent()

    # reverse the returned alignments since we are reading them in
    # backwards
    align_seq1.reverse()
    align_seq2.reverse()
    return align_seq1.toseq(), align_seq2.toseq()
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
    al for a full explanation -- this is where I took my implementation
    ideas from), to allow decoding of the state path, given a sequence
    of emissions.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we
    want to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns:

    - The decoded state path (the result of toseq() on a MutableSeq
      built over state_alphabet).

    - The score computed for that path in the termination step.
    """
    # calculate logarithms of the transition and emission probs
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters
    seq_length = len(sequence)

    # --- initialization
    #
    # NOTE: My index numbers are one less than what is given in Durbin
    # et al, since we are indexing the sequence going from 0 to
    # (Length - 1) not 1 to Length, like in Durbin et al.
    #
    # v_{0}(0) = 1
    viterbi_probs[(state_letters[0], -1)] = 1
    # v_{k}(0) = 0 for k > 0
    for state_letter in state_letters[1:]:
        viterbi_probs[(state_letter, -1)] = 0

    # --- recursion
    # loop over the training sequence (i = 1 .. L)
    for i in range(seq_length):
        # now loop over all of the letters in the state path
        for main_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(main_state, sequence[i])]

            # loop over all possible states
            possible_state_probs = {}
            for cur_state in self.transitions_from(main_state):
                # a_{kl}
                trans_part = log_trans[(cur_state, main_state)]
                # v_{k}(i - 1)
                viterbi_part = viterbi_probs[(cur_state, i - 1)]
                possible_state_probs[cur_state] = viterbi_part + trans_part

            # finally calculate the viterbi probability using the max
            max_prob = max(possible_state_probs.values())
            viterbi_probs[(main_state, i)] = emission_part + max_prob

            # now get the most likely state (first one reaching the max)
            for state in possible_state_probs:
                if possible_state_probs[state] == max_prob:
                    pred_state_seq[(i - 1, main_state)] = state
                    break

    # --- termination
    # calculate the probability of the state path
    # loop over all letters
    all_probs = {}
    for state in state_letters:
        # v_{k}(L)
        viterbi_part = viterbi_probs[(state, seq_length - 1)]
        # a_{k0}
        transition_part = log_trans[(state, state_letters[0])]
        # NOTE(review): both factors come from log-transformed tables yet
        # are multiplied rather than added; left unchanged because callers
        # may rely on the current values -- please confirm the intent.
        all_probs[state] = viterbi_part * transition_part

    state_path_prob = max(all_probs.values())

    # find the last pointer we need to trace back from; when several
    # states tie, the last one visited is used
    last_state = ''
    for state in all_probs:
        if all_probs[state] == state_path_prob:
            last_state = state

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    traceback_seq = MutableSeq('', state_alphabet)

    cur_state = last_state
    # reversed() works on both a Python 2 list and a Python 3 range object,
    # whereas range(...).reverse() only exists on Python 2
    for i in reversed(range(seq_length)):
        traceback_seq.append(cur_state)
        cur_state = pred_state_seq[(i - 1, cur_state)]

    # put the traceback sequence in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob