def create_genome_seq(aligned):
    """Rebuild the reference bases that lie under an aligned read.

    Starts from the read's own bases and writes every mismatch recorded in
    the read's ``MD`` tag back into a mutable copy.

    Args:
        aligned: alignment record exposing ``seq`` (str or bytes),
            ``opt("MD")`` and ``is_reverse``.

    Returns:
        A ``MutableSeq`` holding the reconstructed reference bases.

    Raises:
        AssertionError: if the MD tag contains a deletion (``^``) or names
            a reference base equal to the read base at that position.
    """
    read_bases = aligned.seq
    if type(read_bases) != str:
        read_bases = read_bases.decode('UTF-8')
    genome_seq = MutableSeq(read_bases)

    # MD_REGEX is expected to yield (run of matching bases, reference base)
    # pairs; see the samtools documentation for the MD tag.
    cursor = 0
    for run_length, ref_base in re.findall(MD_REGEX, aligned.opt("MD")):
        cursor += int(run_length)
        assert '^' not in ref_base
        assert ref_base != genome_seq[cursor]
        genome_seq[cursor] = ref_base
        cursor += 1

    if aligned.is_reverse:
        # NOTE(review): relies on MutableSeq.reverse_complement() working in
        # place (older Biopython behaviour) - confirm for the pinned version.
        genome_seq.reverse_complement()
    return genome_seq
def Gthg01471():
    """Print the protein translation of a gene before and after six edits.

    For each edited 1-based position the current base is printed next to
    the expected base, then the base is replaced.  Finally the translation
    of the untouched and of the edited sequence are printed.
    """
    ori = Seq("ATGAGCATAAGTTTATCGGTTCCAAAATGGTTATTAACAGTTTTATCAATTTTATCTTTAGTCGTAGCATTTATTTTCGGTACCGTTTCCAATGCATCAGCAACAATTAACTATGGGGAGGAAGTCGCGGCAGTAGCAAATGACTATGTAGGAAGCCCATATAAATATGGAGGTACAACGCCAAAAGGATTTGATGCGAGTGGCTTTACTCAGTATGTGTATAAAAATGCTGCAACCAAATTGGCTATTCCGCGAACGAGTGCCGCACAGTATAAAGTCGGTAAATTTGTTAAACAAAGTGCGTTACAAAGAGGCGATTTAGTGTTTTATGCAACAGGAGCAAAAGGAAAGGTATCCTTTGTGGGAATTTATAATGGAAATGGTACGTTTATTGGTGCCACATCAAAAGGGGTAAAAGTGGTTAAAATGAGTGATAAATATTGGAAAGACCGGTATATAGGGGCTAAGCGAGTCATTAAGTAA", IUPAC.unambiguous_dna)
    mut = MutableSeq("ATGAGCATAAGTTTATCGGTTCCAAAATGGTTATTAACAGTTTTATCAATTTTATCTTTAGTCGTAGCATTTATTTTCGGTACCGTTTCCAATGCATCAGCAACAATTAACTATGGGGAGGAAGTCGCGGCAGTAGCAAATGACTATGTAGGAAGCCCATATAAATATGGAGGTACAACGCCAAAAGGATTTGATGCGAGTGGCTTTACTCAGTATGTGTATAAAAATGCTGCAACCAAATTGGCTATTCCGCGAACGAGTGCCGCACAGTATAAAGTCGGTAAATTTGTTAAACAAAGTGCGTTACAAAGAGGCGATTTAGTGTTTTATGCAACAGGAGCAAAAGGAAAGGTATCCTTTGTGGGAATTTATAATGGAAATGGTACGTTTATTGGTGCCACATCAAAAGGGGTAAAAGTGGTTAAAATGAGTGATAAATATTGGAAAGACCGGTATATAGGGGCTAAGCGAGTCATTAAGTAA", IUPAC.unambiguous_dna)

    positions = [259, 277, 282, 295, 299, 306]  # 1-based sites to edit
    expected_bases = "AGTCGA"                    # printed beside the current base
    replacement_bases = "GACTAG"                 # written into the mutable copy
    for position, expected, replacement in zip(positions, expected_bases,
                                               replacement_bases):
        index = position - 1
        print(mut[index] + expected)
        mut[index] = replacement

    print(ori.translate())
    print(mut.toseq().translate())
def generate_rolls(num_rolls):
    """Simulate dice rolls from a casino that switches between two dice.

    The simulation starts with the fair die ('F').  After every roll the
    die may be swapped: fair -> loaded ('L') when a uniform draw is <= .05,
    loaded -> fair when the draw is <= .1.

    Returns:
        A ``(rolls, states)`` tuple of immutable sequences: the rolls that
        were produced and the die state that produced each of them.
    """
    rolls = MutableSeq('', DiceRollAlphabet())
    states = MutableSeq('', DiceTypeAlphabet())

    current = 'F'
    for _ in range(num_rolls):
        states.append(current)

        # first draw decides the roll, second draw decides a state switch
        rolls.append(_loaded_dice_roll(random.random(), current))
        switch_draw = random.random()

        if current == 'F' and switch_draw <= .05:
            current = 'L'
        elif current == 'L' and switch_draw <= .1:
            current = 'F'

    return rolls.toseq(), states.toseq()
def random_population(genome_alphabet, genome_size, num_organisms,
                      fitness_calculator):
    """Build a population of organisms whose genomes are drawn at random.

    Arguments:

    o genome_alphabet -- Alphabet object; its ``letters`` are the values a
    gene may take.

    o genome_size -- Number of genes in each genome.

    o num_organisms -- Number of organisms to create.

    o fitness_calculator -- Callable handed to every ``Organism`` to score
    its genome.

    Raises ValueError when the first alphabet letter is not a str, int or
    float.
    """
    # dedicated generator used only for picking letters
    letter_rand = random.Random()

    # the array typecode follows the type of the first alphabet letter
    sample_letter = genome_alphabet.letters[0]
    if isinstance(sample_letter, str):
        # unicode array on Python 3, byte-character array on Python 2
        alphabet_type = "u" if sys.version_info[0] == 3 else "c"
    elif isinstance(sample_letter, int):
        alphabet_type = "i"
    elif isinstance(sample_letter, float):
        alphabet_type = "d"
    else:
        raise ValueError(
            "Alphabet type is unsupported: %s" % genome_alphabet.letters)

    population = []
    for _ in range(num_organisms):
        genome = MutableSeq(array.array(alphabet_type), genome_alphabet)
        for _gene in range(genome_size):
            genome.append(letter_rand.choice(genome_alphabet.letters))
        population.append(Organism(genome, fitness_calculator))
    return population
def add_to_pileup_dict(sams, aligned_read_set, pileup_dict):
    """Add one read, aligned against several references, to the pileup.

    ``aligned_read_set`` holds the alignments of the *same* read against
    each reference in ``sams`` (same order).  The read is only counted when
    every alignment is mapped and none of them contains a CIGAR operation
    with code 1..6.  For every read position, the tuple of
    ``(chrom, ref_pos, ref_base)`` triples (one per alignment) keys
    ``pileup_dict``, which is then incremented at
    ``[read_base][quality_byte]``.

    Args:
        sams: alignment files; each must provide ``getrname(tid)``.
        aligned_read_set: alignment records of one read, one per file.
        pileup_dict: nested counter, mutated in place.

    Returns:
        None.  Returns early, without touching ``pileup_dict``, when the
        read is skipped.

    Raises:
        AssertionError: if the query names, read lengths or reconstructed
            reference lengths disagree between alignments.
    """
    # sanity check that all the qnames (RNA read IDs) are the same
    for read in aligned_read_set:
        assert read.qname == aligned_read_set[0].qname

    if any(read.is_unmapped for read in aligned_read_set):
        return

    for read in aligned_read_set:
        for op, _op_len in read.cigar:
            if 0 < op < 7:
                # do not sample reads where there are insertions or
                # deletions (this also rejects every other non-match op
                # with a code below 7)
                return
        assert len(read.seq) == len(aligned_read_set[0].seq)

    first_read = aligned_read_set[0]
    pos_dicts = [dict(read.aligned_pairs) for read in aligned_read_set]
    genome_seqs = [create_genome_seq(read) for read in aligned_read_set]

    # Work in the read's original orientation: a reverse-strand alignment
    # stores the reverse complement, so flip bases and qualities back.
    qual = bytearray(first_read.qual, 'utf-8')
    raw_seq = first_read.seq
    seq = MutableSeq(raw_seq if isinstance(raw_seq, str)
                     else raw_seq.decode('UTF-8'))
    if first_read.is_reverse:
        seq.reverse_complement()
        qual = qual[::-1]

    for genome_seq in genome_seqs:
        assert len(genome_seq) == len(seq)

    read_len = len(seq)
    # The chromosome of each alignment does not depend on the position.
    # zip() replaces the Python-2-only izip() and matches the other zip()
    # calls in this function.
    chroms = [sam.getrname(a.tid) for sam, a in zip(sams, aligned_read_set)]
    for i in range(read_len):
        # reverse-strand alignments index their positions from the far end
        positions = [d[i] if not a.is_reverse else d[read_len - i - 1]
                     for d, a in zip(pos_dicts, aligned_read_set)]
        genome_seq_i = [g[i] for g in genome_seqs]
        genomic_locs = tuple(zip(chroms, positions, genome_seq_i))
        pileup_dict[genomic_locs][seq[i]][qual[i]] += 1
def __init__(self, sequence, hmmLength, origSeqLength, evalue,
             seqStart=None, seqEnd=None, hmmStart=None, hmmEnd=None):
    """Initialise the sequence together with its HMM hit metadata.

    Parameters:
    sequence - sequence data handed to MutableSeq.__init__.
    hmmLength - length of the HMM; converted with int().
    origSeqLength - stored unchanged as self.origSeqLength.
    evalue - stored unchanged as self.evalue.
    seqStart, seqEnd - optional hit bounds on the sequence.
    hmmStart, hmmEnd - optional hit bounds on the HMM.

    Also creates self.gaps, one zero per HMM position, and increments the
    class-wide counter HMMPileUp.total_seqs.
    """
    model_length = int(hmmLength)
    self.hmmLength = model_length
    self.gaps = [0] * model_length

    self.origSeqLength, self.evalue = origSeqLength, evalue
    self.seqStart, self.seqEnd = seqStart, seqEnd
    self.hmmStart, self.hmmEnd = hmmStart, hmmEnd

    HMMPileUp.total_seqs += 1
    MutableSeq.__init__(self, sequence)
def seq_batch_query():
    """Export every stored fragment of the organisms named in a list file.

    Prompts for a text file with one organism name per line, selects all
    matching rows of table ``main`` in ``./data/DB`` (ordered by ``Head``)
    and appends each fragment as a FASTA record to
    ``./out/<fragment name>.fasta``.  Finally the records of
    ``./out/rps12.fasta`` shorter than 4000 bases are written to
    ``./out/rps12short.fasta``.
    """
    con = sqlite3.connect('./data/DB')
    try:
        cur = con.cursor()
        list_file = input('list file name:\n')
        with open(list_file, 'r') as In:
            organism_list = In.read().split(sep='\n')
        cur.execute('CREATE TABLE IF NOT EXISTS tasklist (Name TEXT);')
        for organism in organism_list:
            cur.execute('INSERT INTO tasklist (Name) VALUES (?);', (organism,))
        # The statement has no placeholders, so it must not receive any
        # bindings: passing the last organism name made sqlite3 raise
        # unless that name happened to be the empty string.
        cur.execute(
            'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism IN (SELECT Name FROM tasklist) ORDER BY Head')
        result = cur.fetchall()
        cur.execute('DROP TABLE tasklist;')
        cur.close()
    finally:
        # release the database even when the list file cannot be read
        con.close()

    query_result = []
    for row in result:
        title = '|'.join([str(row[0]), row[1], row[2], row[3]])
        filename = row[2]
        sequence = MutableSeq(row[5])
        if row[4] == '-1':
            # NOTE(review): this only flips the strand if
            # MutableSeq.reverse_complement() works in place (older
            # Biopython); the assigned ``seq`` attribute is never read here.
            sequence.seq = sequence.reverse_complement()
        query_result.append([title, filename, sequence])

    for title, filename, sequence in query_result:
        with open(''.join(['./out/', filename, '.fasta']), 'a') as Fileout:
            Fileout.write('>%s\n%s\n' % (title, sequence))

    # rps12 may have larger than 50k fragments, here to filter it
    rps12short = [item for item in SeqIO.parse('./out/rps12.fasta', 'fasta')
                  if len(item.seq) < 4000]
    SeqIO.write(rps12short, './out/rps12short.fasta', 'fasta')
    print('Done.\n')
def viterbi(self, sequence, state_alphabet):
    """Decode the most probable state path with the Viterbi algorithm.

    Follows the description in Durbin et al. (pgs 55-57); all arithmetic
    is done on log-transformed probabilities.

    Arguments:

    o sequence -- A Seq object with the emission sequence to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns a tuple of the decoded state path (immutable sequence) and the
    log score of that path.
    """
    log_initial = self._log_transform(self.initial_prob)
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    scores = {}         # (state, position) -> best log score ending there
    back_pointers = {}  # (position - 1, state) -> best predecessor state
    state_letters = state_alphabet.letters
    seq_len = len(sequence)

    # --- recursion
    # Positions are 0-based here, i.e. one less than in Durbin et al.
    for pos in range(seq_len):
        for cur_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(cur_state, sequence[pos])]

            if pos == 0:
                # the first position starts from the initial probability
                best = log_initial[cur_state]
            else:
                candidates = {}
                for prev_state in self.transitions_to(cur_state):
                    # a_{kl}
                    trans_part = log_trans[(prev_state, cur_state)]
                    # v_{k}(i - 1)
                    viterbi_part = scores[(prev_state, pos - 1)]
                    candidates[prev_state] = viterbi_part + trans_part
                best = max(candidates.values())

                # remember the first predecessor that reaches the maximum
                for candidate_state in candidates:
                    if candidates[candidate_state] == best:
                        back_pointers[(pos - 1, cur_state)] = candidate_state
                        break

            # v_{k}(i)
            scores[(cur_state, pos)] = (emission_part + best)

    # --- termination
    # v_{k}(L) for every state; the path score is the best of them
    final_scores = dict(
        (state, scores[(state, seq_len - 1)]) for state in state_letters)
    state_path_prob = max(final_scores.values())

    # on ties the last state in iteration order wins (no break on purpose)
    last_state = ''
    for state in final_scores:
        if final_scores[state] == state_path_prob:
            last_state = state
    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    # Walk backwards from the final state, each step following the stored
    # pointer to the most probable preceding state.
    traceback_seq = MutableSeq('', state_alphabet)
    state = last_state
    traceback_seq.append(state)
    for pos in range(seq_len - 1, 0, -1):
        state = back_pointers[(pos - 1, state)]
        traceback_seq.append(state)

    # the path was collected end-to-start; flip it
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
def setUp(self):
    """Create an immutable and a mutable copy of the same test sequence."""
    bases = "TCAAAAGGATGCATCATG"
    self.s = Seq.Seq(bases, IUPAC.unambiguous_dna)
    self.mutable_s = MutableSeq(bases, IUPAC.ambiguous_dna)
def genome_generator():
    """Return the fixed genome "1234" over TestAlphabet, for tests."""
    test_genome = MutableSeq("1234", TestAlphabet())
    return test_genome
def count_one_fraction(alignment, refname, debug, start_offset, end_trail):
    """Count the mutations found in one file of pairwise alignments.

    No filtering for expected/allowed mutations happens here: everything is
    counted and can be filtered later.

    For every (reference, read) pair in the FASTA multiple alignment:
      1. replace 'N' in the read by '.' and trim the read to the reference,
      2. call the DNA differences, their HGVS-like string and the protein
         differences,
      3. add them to the count table.

    Returns a dict keyed by the protein error:
        {prot_errors: {'dna': {dna_errors: count},
                       'dna_hgvs': {dna_hgvs: count},
                       'total': count}}

    If any of the three callers raises, the offending read is printed and
    the exception is re-raised.
    """
    one_lane_counts = {}

    # AlignIO yields alignments of exactly two records: reference, read
    for pair in AlignIO.parse(alignment, "fasta",
                              alphabet=IUPAC.ambiguous_dna, seq_count=2):
        # both are made mutable so that gaps can be shifted
        ref = pair[0].seq.tomutable()
        read = pair[1].seq.tomutable()
        read = MutableSeq(str(read).replace('N', '.'), read.alphabet)
        readname = pair[1].id

        # trim sequencing read to reference
        ref, read = trim_read(ref, read)

        dna_errors = dna_hgvs = prot_errors = None
        try:
            # a tuple of errors
            dna_errors = find_DNA_diff(read, ref, debug, start_offset,
                                       end_trail)
            # string according to HGVS format (ish)
            dna_hgvs = find_DNA_hgvs(read, ref, refname, debug, start_offset,
                                     end_trail)
            prot_errors = find_protein_diff(read, ref, debug, start_offset,
                                            end_trail)
        except:
            # diagnostic output only; the error itself is propagated
            if not dna_errors:
                print(dna_errors)
            print_coloured_diff(readname, read, ref, debug)
            raise

        # first sighting of a protein mutation creates its entry
        entry = one_lane_counts.get(prot_errors)
        if entry is None:
            entry = one_lane_counts[prot_errors] = {
                'dna': defaultdict(int),
                'dna_hgvs': defaultdict(int),
                'total': 0
            }
        entry['total'] += 1
        entry['dna'][dna_errors] += 1
        entry['dna_hgvs'][dna_hgvs] += 1

    # count the mutations seen more often than the threshold
    threshold = 10
    n = sum(1 for counts in one_lane_counts.values()
            if counts['total'] > threshold)

    print(
        'Fount {0} total protein mutations, of which {1} have more than {2} counts'
        .format(len(one_lane_counts), n, threshold))
    return one_lane_counts
def posmu(self):
    """Apply the position-specified changes to the reference sequence.

    Every reference site listed in ``self.inpos``, ``self.delpos`` or
    ``self.mupos`` is changed; all other sites are copied unchanged.  When
    a site appears in several lists only one change happens, with priority
    mutation > deletion > insertion.

    Side effects (nothing is returned):
      * ``self.seq`` becomes a MutableSeq holding the progeny sequence,
      * ``self.occuredmu`` / ``self.alt_allele`` list the substituted sites
        and the substituting letters,
      * ``self.occuredins`` / ``self.inserted_allele`` list the sites after
        which a letter was inserted and the "base + inserted letter" pairs,
      * ``self.occureddel`` lists the deleted sites.
    """
    self.occuredmu = list()
    self.occureddel = list()
    self.occuredins = list()
    self.inserted_allele = list()
    self.alt_allele = list()

    # Later assignments overwrite earlier ones, which yields the priority
    # mutation > deletion > insertion.
    planned = [None] * len(self.ref)
    for site in self.inpos:
        planned[site] = 'ins'
    for site in self.delpos:
        planned[site] = 'del'
    for site in self.mupos:
        planned[site] = 'sub'

    progeny = []
    for site, (base, change) in enumerate(zip(self.ref, planned)):
        if change is None:
            progeny.append(base)
        elif change == 'sub':
            # substitute with a letter from the mutation alphabet
            progeny.append(random.choice(self.mualphabet.get(base)))
            self.occuredmu.append(site)
            self.alt_allele.append(progeny[-1])
        elif change == 'ins':
            # keep the base and insert a random letter right after it
            progeny.append(base)
            progeny.append(random.choice(list(self.alphabet)))
            self.occuredins.append(site)
            self.inserted_allele.append(base + progeny[-1])
        else:
            # deletion: the base is simply not copied
            self.occureddel.append(site)
    self.seq = MutableSeq(''.join(progeny), self.alphaproperty)

    # Keep sites and alleles paired while ordering by site.  sorted() is
    # used because zip() returns an iterator without .sort() on Python 3.
    if self.occuredins:
        ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
                           key=lambda tup: tup[0])
        sites, alleles = zip(*ins_pairs)
        self.occuredins = list(sites)
        self.inserted_allele = list(alleles)
    else:
        self.inserted_allele = []
        self.occuredins = []

    if self.occuredmu:
        sub_pairs = sorted(zip(self.occuredmu, self.alt_allele),
                           key=lambda tup: tup[0])
        sites, alleles = zip(*sub_pairs)
        self.occuredmu = list(sites)
        self.alt_allele = list(alleles)
    else:
        self.occuredmu = []
        self.alt_allele = []

    if self.occureddel:
        self.occureddel.sort()
    else:
        self.occureddel = []

    if self.verbose:
        print("WARNING: if there are overlaps betweeen deletion, insertion and mutation positions, \n  just one of the changes takes place with the following priority: \n  1)Mutation 2)Deletion 3)Insertion. \n")
        print("Changes made to the haplotype!")
markovBuilder.set_emission_score('O', 'C', .33)
markovBuilder.set_emission_score('O', 'G', .33)
markovBuilder.set_emission_score('O', 'S', .33)
markovBuilder.set_emission_score('P', 'A', .67)
markovBuilder.set_emission_score('P', 'T', .33)

# Initialise the Hidden Markov Model
markovModel = markovBuilder.get_markov_model()

# The 3 sequences that will be aligned
seq1 = Seq('ATGA', arrayDNA())
seq2 = Seq('A CCA', arrayDNA())
seq3 = Seq('ACAST', arrayDNA())

# State path for each sequence
seq1State = MutableSeq('MNOP', arrayState())
seq2State = MutableSeq('MDIOP', arrayState())
seq3State = MutableSeq('MNIOP', arrayState())

seq = [seq1, seq2, seq3]
states = [seq1State, seq2State, seq3State]

# Train the Hidden Markov Model on the sequences above.  All training
# sequences go into a single train() call: calling train() once per
# sequence and overwriting the result left a model estimated from the
# last sequence only.
trainer = Trainer.KnownStateTrainer(markovModel)
trainingseqs = [Trainer.TrainingSequence(emissions, state_path)
                for emissions, state_path in zip(seq, states)]
trainedhmm = trainer.train(trainingseqs)

# Another example query
testSeq = Seq('ATSA', arrayDNA())
testState = MutableSeq('MNOP', arrayState())
# Direct translation (DNA -> Protein from Bio.Seq import Seq, translate from Bio.Alphabet import IUPAC coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna) translate(coding_dna) # we can specify other translation tables by name translate(coding_dna, table="Vertebrate Mitochondrial") # or by NCBI number translate(coding_dna, table=2) # 3.9 Transcription and Translation # 3.10 Mutable Seqs # convert existing sequence to mutable mutable_seq = my_seq.tomutable() # or directly create a mutable one from Bio.Seq import MutableSeq from Bio.Alphabet import IUPAC mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna) # now we can do mutable_seq[5] = "T" # and convert it back to an inmutable seq new_seq = mutable_seq.toseq()
def determine_synonymous(nuc_muts_on_branch, parent_diffs_from_ref,
                         reference_gene_locations, reference_gene_codon,
                         reference_sequence_nt, reference_sequence_aa):
    """
    Check every nucleotide mutation that occurred on a branch to determine
    whether or not it is synonymous.

    For each mutation, all differences from the reference that the parents
    accumulated inside the affected codon are applied to the reference
    codon first.  The mutation is then made to that parent codon and the
    translation is compared with the reference amino acid.

    Arguments:
        nuc_muts_on_branch: mutations as strings "<from><1-based pos><to>".
        parent_diffs_from_ref: dict of 1-based position (int or str) ->
            nucleotide the parent carries at that position.
        reference_gene_locations: dict of 0-based position -> gene name.
        reference_gene_codon: dict of 0-based position ->
            (codon number, position within codon).
        reference_sequence_nt: dict of gene -> nucleotide sequence.
        reference_sequence_aa: dict of gene -> amino acid sequence.

    Returns a dictionary of synonymous mutations where the key is a gene
    and the value is a list of synonymous mutations in this gene.
    Mutations outside every gene are collected under 'noncoding'.
    """
    # Normalise the keys once so that string and integer positions both
    # work for the lookups below.
    parent_diffs = dict(
        (int(pos), base) for pos, base in parent_diffs_from_ref.items())

    # dictionary of synonymous (and noncoding) mutations to add to tree
    syn_muts = {}

    for mut in nuc_muts_on_branch:
        # don't care about deletions because they are obviously not
        # synonymous
        if mut[-1] == '-' or mut[0] == '-':
            continue

        genome_index = int(mut[1:-1]) - 1  # 0-based genome position

        # find what gene this mut happens in
        if genome_index not in reference_gene_locations:
            syn_muts.setdefault('noncoding', []).append(mut)
            continue

        mut_gene = reference_gene_locations[genome_index]
        mut_codon_num = reference_gene_codon[genome_index][0]
        mut_codon_pos = reference_gene_codon[genome_index][1]

        # find the reference sequence of the codon this mutation occurs in
        codon_ref_aa = reference_sequence_aa[mut_gene][mut_codon_num]
        codon_ref_nt = reference_sequence_nt[mut_gene][
            (mut_codon_num * 3):(mut_codon_num * 3 + 3)]

        # Apply EVERY parent difference that falls inside this codon.  The
        # codon is built up cumulatively; restarting from the reference for
        # each difference would keep only the last one.
        codon_start = genome_index - mut_codon_pos
        codon = MutableSeq(str(codon_ref_nt))
        for offset in range(3):
            parent_base = parent_diffs.get(codon_start + offset + 1)
            if parent_base is not None:
                codon[offset] = parent_base

        # if deletion (or seq error) has happened at neighboring nucleotide
        # the codon cannot be translated, so the mutation is not reported
        if '-' in str(codon):
            continue

        codon[mut_codon_pos] = mut[-1]
        codon_mutated_translation = Seq(str(codon)).translate()

        if str(codon_ref_aa) == str(codon_mutated_translation):
            syn_muts.setdefault(mut_gene, []).append(mut)

    return syn_muts
def setUp(self):
    """Create the organism under test with the genome "1111"."""
    self.organism = Organism(MutableSeq("1111", TestAlphabet()),
                             test_fitness)
def MutableSeqFromFile(filename, alphabet):
    """Read a whole text file into a lower-cased MutableSeq.

    Leading and trailing whitespace of the file content is stripped; the
    rest (including any inner line breaks) becomes the sequence.

    Args:
        filename: path of the text file to read.
        alphabet: alphabet handed to the MutableSeq constructor.

    Returns:
        MutableSeq with the lower-cased file content.
    """
    # the context manager closes the handle, which was previously leaked
    with open(filename) as handle:
        sequence_str = handle.read().strip()
    return MutableSeq(sequence_str.lower(), alphabet)
def setUp(self):
    """Create a one-gene organism ("2") and the mutator under test."""
    self.alphabet = TestAlphabet()
    self.org = Organism(MutableSeq("2", self.alphabet), test_fitness)
    self.test_mutator = TestMutator()
# translate only up to the first stop codon
print(coding_dna.translate(to_stop=True))

# look up codon tables by name
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
print(mito_table)
print(mito_table.stop_codons)
print(mito_table.start_codons)
print(mito_table.forward_table["ACG"])

# NOTE(review): my_seq is defined outside this snippet; if it is an
# immutable Seq this assignment presumably demonstrates the resulting
# error - confirm.
my_seq[1] = "N"
# a mutable copy of an existing sequence
mutable_seq = my_seq.tomutable()
# or
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')
# in-place edits: assign one position, delete one position, remove an 'A'
mutable_seq[5] = "A"
print(mutable_seq)
del mutable_seq[4]
mutable_seq.remove('A')
print(mutable_seq)
# back to an immutable sequence
new_seq = mutable_seq.toseq()
print(new_seq)

# sequences of known length but unknown content
from Bio.Seq import UnknownSeq
unk = UnknownSeq(10)
print(unk)
unk = UnknownSeq(10, character="A")
print(unk)
unk_protein = unk.translate()
def search_mutated_feature(vcf_record, gbk_dico):
    '''
    Describe the annotated feature hit by a VCF variant.

    - Search if mutation is located within a feature (anything but "source")
    - For a CDS, determine if the mutation is synonymous or not by
      translating the CDS before and after applying the mutation to a
      MutableSeq copy of the record sequence

    :param vcf_record: VCF record; CHROM, POS (1-based) and ALT[0] are used
    :param gbk_dico: dict of chromosome name -> annotated sequence record
    :return: dict with keys "mut_location" (feature type or "Intergenic"),
        "mut_type" ('INDEL', 'F', 'S', the value of extract_mutation() or
        '-'), "orf_name" and "gene"
    '''
    from copy import copy
    from Bio.Seq import MutableSeq
    from Bio.Alphabet import generic_dna

    # work on a copy whose sequence can be edited in place
    record_alt = copy(gbk_dico[vcf_record.CHROM])
    record_alt.seq = MutableSeq(str(record_alt.seq), generic_dna)

    # VCF positions are 1-based, feature locations and sequence indices are
    # 0-based: convert once and use the same index for both.  Testing the
    # raw POS against the feature missed the last base of every feature
    # and matched the base just before it.
    position = int(vcf_record.POS) - 1

    results = {
        "mut_location": "Intergenic",
        "mut_type": '-',
        "orf_name": '-',
        "gene": '-'
    }

    for feature in record_alt.features:
        if feature.type == "source" or position not in feature:
            continue

        results["mut_location"] = feature.type
        if feature.type == 'mobile_element':
            results["orf_name"] = feature.qualifiers[
                "mobile_element_type"][0]
        elif feature.type == 'CDS':
            results["orf_name"] = feature.qualifiers["locus_tag"][0]
        else:
            results[
                "orf_name"] = "Unknown locus for feature: %s" % feature.type
        try:
            results["gene"] = feature.qualifiers["gene"][0]
        except KeyError:
            results["gene"] = '-'

        if feature.type == 'CDS':
            if len(vcf_record.ALT[0]) > 1:
                # multi-base ALT: flag it and keep looking at the
                # remaining features
                results["mut_type"] = 'INDEL'
                continue
            aa_seq_ref = str(feature.extract(record_alt.seq).translate())
            if vcf_record.ALT[0] == '*':
                # frameshift
                results["mut_type"] = 'F'
            else:
                # mutate the copy, then check if synonymous or not
                record_alt.seq[position] = str(vcf_record.ALT[0])
                aa_seq_alt = str(
                    feature.extract(record_alt.seq).translate())
                if aa_seq_ref == aa_seq_alt:
                    results["mut_type"] = 'S'
                else:
                    results["mut_type"] = extract_mutation(
                        aa_seq_ref, aa_seq_alt)
        return results

    # if no match, return empty results
    return results
def seq_query():
    """Interactively query fragments from ./data/DB and write them as FASTA.

    The user picks one of five query kinds, answers the follow-up prompts
    and chooses between one file per fragment name (directory ``output``)
    or a single named file.  Fragments stored with strand '-1' go through
    reverse_complement() before being written.

    Raises ValueError when the query kind is not one of '1'..'5'.
    """
    query_type = input(
        '1.Specific fragment\n2.Specific Organism\n3.Specific gene\n4.All\n5.All cds\n')
    organize = input('Organize output?(y/n)\n')
    if query_type not in ['1', '2', '3', '4', '5']:
        raise ValueError('wrong input!\n')

    con = sqlite3.connect('./data/DB')
    cur = con.cursor()

    # Work out the statement and its bindings first, then run it once.
    bindings = ()
    if query_type == '1':
        organism = input('Organism:\n')
        gene = input('Gene:\n')
        frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer):\n')
        statement = 'SELECT Taxon, Organism, Name, Type, Strand, Sequence FROM main WHERE Name LIKE ? AND Type = ? AND Organism=?'
        bindings = ('%' + gene + '%', frag_type, organism)
    elif query_type == '2':
        organism = input('Organism:\n')
        frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer, whole, fragments):\n')
        if frag_type == 'fragments':
            statement = 'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism = ? ORDER BY Head'
            bindings = (organism,)
        else:
            statement = 'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism LIKE ? AND Type = ? ORDER BY Head'
            bindings = ('%' + organism + '%', frag_type)
    elif query_type == '3':
        gene = input('Gene:\n')
        frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer):\n')
        statement = 'SELECT Taxon, Organism, Name, Type, Strand, Sequence FROM main WHERE Name LIKE ? AND Type = ? ORDER BY Taxon'
        bindings = ('%' + gene + '%', frag_type)
    elif query_type == '4':
        statement = 'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main ORDER BY Taxon'
    else:
        statement = 'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE type = "cds" ORDER BY Taxon'
    cur.execute(statement, bindings)
    result = cur.fetchall()

    # Turn every row into [FASTA title, fragment name, sequence].
    query_result = []
    for row in result:
        taxon, organism_name, name, kind, strand, bases = row[:6]
        title = '|'.join([str(taxon), organism_name, name, kind])
        sequence = MutableSeq(bases)
        if strand == '-1':
            sequence.seq = sequence.reverse_complement()
        query_result.append([title, name, sequence])

    if organize == 'y':
        # one file per fragment name, appended to across runs
        if not exists('output'):
            makedirs('output')
        for title, name, sequence in query_result:
            file_name = 'output' + '/' + name.replace('/', '') + '.fasta'
            with open(file_name, 'a') as output_file:
                output_file.write('>%s\n%s\n' % (title, sequence))
    else:
        output = input('Enter output filename:\n')
        with open(output + '.' + 'fasta', 'w') as output_file:
            for title, _name, sequence in query_result:
                output_file.write('>%s\n%s\n' % (title, sequence))

    cur.close()
    con.close()
    print('Done.\n')
def __init__(self, seq, alphaproperty=None, insertprob=None,
             deleteprob=None, mualphabet=None, muprob=None, mupos=None,
             delpos=None, inpos=None, verbose=False):
    """Create a MuGen object from a reference sequence.

    Args:
        seq: reference as str, Seq or MutableSeq.
        alphaproperty: alphabet object for a str ``seq``; a generic
            ``Alphabet()`` is used when omitted.  Ignored for Seq and
            MutableSeq input, whose own alphabet is taken.
        insertprob, deleteprob, muprob: dicts of letter -> probability in
            [0, 1]; letters that are missing get probability 0.
        mualphabet: dict of letter -> string of letters it may mutate to.
            Letters without an entry may mutate to any other letter of the
            sequence alphabet except 'N'.
        mupos, delpos, inpos: 0-based reference positions at which a
            mutation, deletion or insertion is to be made.
        verbose: bool; print warnings and progress messages.

    Invalid arguments are reported by printing the error message and
    terminating the program with exit status 2.
    """

    def _checked_positions(positions, kind):
        """Return positions as a list after checking they lie on the reference."""
        if not positions:
            return []
        if set(positions).issubset(set(range(len(self.ref)))):
            return list(positions)
        raise CustomException(
            "ERROR: %s positions exceed the range of the reference or are not positive integers!" % kind)

    def _any_other_letter(letter):
        """Return all alphabet letters except the given one and 'N', joined."""
        return ''.join(self.alphabet - {letter, 'N'})

    def _checked_probs(probs, kind):
        """Return a letter -> probability dict covering the whole alphabet."""
        if not probs:
            # nothing given: the probability is zero everywhere
            return dict((letter, 0) for letter in self.alphabet)
        if set(probs.keys()) != self.alphabet and self.verbose:
            print("WARNING: Missing/Invalid letter(s) in %s probability!\n Probabilities are set to zero for missing letters! Invalid letters are ignored!" % kind)
        checked = dict()
        # items() instead of iteritems(): works on Python 2 and Python 3
        for letter, value in probs.items():
            if value >= 0 and value <= 1:
                checked[letter] = value
            else:
                raise CustomException(
                    "ERROR: %s probability must be >=0 and <=1!" % kind.capitalize())
        for letter in self.alphabet - set(checked.keys()):
            checked[letter] = 0
        return checked

    try:
        # History of the changes made to the reference; needed to write the
        # haplotypes in the format of haplotyping software.
        self.occureddel = list()
        self.occuredmu = list()
        self.occuredins = list()
        self.inserted_allele = list()  # inserted alleles, to get them back when needed
        self.alt_allele = list()       # substituted alleles

        if not isinstance(verbose, bool):
            raise CustomException("ERROR: verbose must be set to either True or False.  Default is to False")
        self.verbose = verbose

        if isinstance(seq, str):
            if alphaproperty is None:
                if self.verbose:
                    print(
                        "WARNING: No alphabet type is specified for the sequence string!")
                self.alphaproperty = Alphabet()
            else:
                self.alphaproperty = alphaproperty
            self.seq = MutableSeq(seq, self.alphaproperty)
        elif isinstance(seq, Seq):
            self.alphaproperty = seq.alphabet
            self.seq = seq.tomutable()
        elif isinstance(seq, MutableSeq):
            self.alphaproperty = seq.alphabet
            self.seq = copy.deepcopy(seq)
        else:
            raise CustomException("ERROR: Should provide a Seq or MutableSeq object, \n  or a string sequence!")

        # the working alphabet is the set of letters present in the reference
        self.alphabet = set(str(self.seq))
        self.ref = str(self.seq)

        self.delpos = _checked_positions(delpos, "Deletion")
        self.inpos = _checked_positions(inpos, "Insertion")
        self.mupos = _checked_positions(mupos, "Mutation")

        if not mualphabet:
            if self.verbose:
                print("WARNING: You have specified no mutation alphabet! Mutations are set to random  letters!")
            # non-specified mutations could happen to any letter
            self.mualphabet = dict(
                (letter, _any_other_letter(letter))
                for letter in self.alphabet)
        else:
            mualphabet = dict(
                (str(k), str(v)) for k, v in mualphabet.items())
            for key, value in mualphabet.items():
                if len(key) != 1:
                    raise CustomException("ERROR: the mutation alphabet deals with point mutations! Only single letters are allowed as keys!")
                elif key in value:
                    raise CustomException("ERROR: Wrong mutation values specified! A letter could just be substituted with a different letter for mutation!")

            source_letters = set(mualphabet.keys())
            target_letters = set(''.join(mualphabet.values()))
            if (source_letters == self.alphabet
                    and target_letters <= self.alphabet):
                self.mualphabet = copy.deepcopy(mualphabet)
            elif (source_letters < self.alphabet
                    and target_letters < self.alphabet):
                if self.verbose:
                    print("WARNING: Mutation is not specified for some letters! Those mutations are set to random letters!")
                # whatever has been specified is kept intact
                self.mualphabet = copy.deepcopy(mualphabet)
                for letter in self.alphabet - source_letters:
                    self.mualphabet[letter] = _any_other_letter(letter)
            else:
                if self.verbose:
                    print("WARNING: Mutation alphabet is not compatible with sequence alphabet! Both alphabets are updated and\nunspecified mutations are set to random letters!")
                # Mutation may introduce novel alleles, so the alphabet is
                # extended first; the given entries are kept intact.
                new_mualphabet = dict()
                for key, value in mualphabet.items():
                    self.alphabet.add(key)
                    self.alphabet.update(value)
                    new_mualphabet[key] = value
                for letter in self.alphabet - set(new_mualphabet.keys()):
                    new_mualphabet[letter] = _any_other_letter(letter)
                self.mualphabet = copy.deepcopy(new_mualphabet)

        self.insertprob = _checked_probs(insertprob, "insertion")
        self.deleteprob = _checked_probs(deleteprob, "deletion")
        self.muprob = _checked_probs(muprob, "mutation")
    except CustomException as instance:
        print(instance)
        sys.exit(2)
    else:
        if self.verbose:
            print(
                "MuGen object successfully created.\nWARNING: MuGen sequence is case sensitive!")
# Collect the recombinant regions from a tab-separated file; the first
# three columns of each row are used as sequence id, start and stop.
# NOTE(review): seq_list must already exist - it is not created here.
with open(args.rc_regions) as infile1:
    RCseqs = csv.reader(infile1, delimiter='\t')
    for row in RCseqs:
        seq = row[0]
        RCstart = row[1]
        RCstop = row[2]
        seq_list.append([seq, RCstart, RCstop])

# Read fasta file:
fasta_seqs = list(SeqIO.parse(args.fasta, "fasta"))

# Mask recombinant regions:
masked_seq = []
for i in fasta_seqs:
    # mutable copy so that slices can be overwritten
    seq = MutableSeq(str(i.seq))
    for j in seq_list:
        if i.id == j[0]:  # j[0] is the sequence id in recombinant regions list
            start_mask = int(j[1]) - 1  # 1 based positions are 1 less in 0-based indexing
            end_mask = int(j[2])  # last index in range is not included in python
            len_mask = end_mask - start_mask
            # overwrite the region with the mask character
            seq[start_mask:end_mask] = args.maskchar * len_mask
    masked_seq.append(SeqRecord(Seq(str(seq)), i.id, description=""))

# Report how often the mask character occurs in each output sequence (this
# also counts occurrences that were already present in the input).
for i in masked_seq:
    print("Number of characters masked in sequence " + i.id + ": " + str(str(i.seq).count(args.maskchar)))

# Write masked sequences to file:
SeqIO.write(masked_seq, args.out, "fasta")
# NOTE(review): fragment of a larger routine - tempfn, break_nb,
# break_nb_ord, cl_ii, break_pos, pb_reads and read_blocks come from code
# outside this view, and the loop body presumably continues past it.
while (break_nb[break_nb_ord[cl_ii]] > 2):
    # Write a window around the breakpoint of every read assigned to the
    # current breakpoint into a FASTA file.
    rcl_file = tempfn + '-rcl.fa'
    f = open(rcl_file, 'w')
    nbr = 0  # number of reads written
    for rii in xrange(len(break_pos[1])):
        if (break_pos[0][rii] == break_nb_ord[cl_ii]):
            nbr += 1
            pbr = break_pos[1][rii]
            pbseq = pb_reads[pbr]
            # pick the end of the first block as breakpoint when its
            # reference end matches, otherwise its start
            # NOTE(review): compares against break_nb_ord[2] while the rest
            # of the loop uses break_nb_ord[cl_ii] - confirm this is intended.
            if (read_blocks[pbr][0]['refE'] == break_nb_ord[2]):
                readbp = read_blocks[pbr][0]['readE']
            else:
                readbp = read_blocks[pbr][0]['readS']
            # keep at most 400 bases on each side of the breakpoint
            pbseq = pbseq[max(0, readbp - 400):min(len(pbseq), readbp + 400)]
            if (read_blocks[pbr][0]['refStd'] == '-'):
                # reverse-strand block: write the reverse complement
                # NOTE(review): relies on reverse_complement() working in
                # place - confirm for the Biopython version in use.
                pbseq = MutableSeq(pbseq, generic_dna)
                pbseq.reverse_complement()
                pbseq = str(pbseq)
            f.write('>' + pbr + '\n')
            f.write(pbseq)
            f.write("\n")
    f.close()
    # Run Clustal
    clo_outfile = tempfn + '-clo-out.fa'
    clo_cmd = ['clustalo', '-i', rcl_file, '-o', clo_outfile, '--force']
    clo_out = subprocess.check_output(clo_cmd)
    # Get consensus
    msa_out = []  # aligned sequences as plain strings
    for record in SeqIO.parse(clo_outfile, "fasta"):
        msa_out.append(str(record.seq))
    clo_cons = ""
else: print "huh? ERROR" t = Seq.Seq("T", IUPAC.ambiguous_dna) u = s + t print str(u.alphabet) from Bio.Seq import MutableSeq import array print print "Testing MutableSeq" print "==================" print "Testing creating MutableSeqs in multiple ways" string_seq = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna) array_seq = MutableSeq(array.array(array_indicator, "TCAAAAGGATGCATCATG"), IUPAC.ambiguous_dna) converted_seq = s.tomutable() for test_seq in [string_seq]: print repr(test_seq) print str(test_seq) print len(test_seq) print repr(test_seq.toseq()) print test_seq[0] print repr(test_seq[1:5]) test_seq[1:3] = "GAT" print "Set slice with string:", repr(test_seq)
def setUp(self):
    """Create the organism and schema fixtures used by the tests.

    Both fixtures are built from the same TestAlphabet instance: the
    organism wraps the genome "11*22*33*" together with test_fitness, and
    the schema is built from the alphabet's ``alphabet_matches``.
    """
    letters = TestAlphabet()
    genome = MutableSeq("11*22*33*", letters)
    self.organism = Organism(genome, test_fitness)
    self.ambig_info = Schema(letters.alphabet_matches)
def probmu(self):
    """Apply random, probability-driven changes to the reference sequence.

    Walks ``self.ref`` site by site.  Sites listed in ``self.mupos``,
    ``self.inpos`` or ``self.delpos`` are copied unchanged.  At every other
    site one error type is drawn ('sub' is twice as likely as 'ins' or
    'del') and is applied only if a random number falls below the
    probability registered for that base in ``self.muprob``,
    ``self.insertprob`` or ``self.deleteprob``.  A base with no registered
    probability is copied unchanged.

    Nothing is returned.  The result is stored on the instance:

    - ``self.seq``: the progeny as a MutableSeq built with
      ``self.alphaproperty``.
    - ``self.occuredmu`` / ``self.alt_allele``: substituted sites and the
      letters substituted in.
    - ``self.occuredins`` / ``self.inserted_allele``: sites after which a
      letter was inserted, and the original letter plus the inserted one.
    - ``self.occureddel``: deleted sites.

    All position lists end up in ascending order.
    """
    self.occuredmu = []
    self.occureddel = []
    self.occuredins = []
    self.inserted_allele = []
    self.alt_allele = []

    # Built once: these do not change while the reference is scanned.
    protected = set(self.mupos) | set(self.inpos) | set(self.delpos)
    rates = {'ins': self.insertprob,
             'del': self.deleteprob,
             'sub': self.muprob}

    progeny = []
    for site, base in enumerate(self.ref):
        if site in protected:
            # No change is made at indel/mutation positions
            progeny.append(base)
            continue

        # An error occurs randomly: insertion, deletion or substitution
        error = random.choice(['ins', 'del', 'sub', 'sub'])
        # Random number truncated to five decimals; the error is kept only
        # if it falls below the probability registered for this base.
        rnd = float(int(random.random() * 100000)) / 100000
        threshold = rates[error].get(base)

        if threshold is not None and rnd < threshold:
            if error == 'sub':
                # Substitute the letter with one from the mutation alphabet
                new_base = random.choice(self.mualphabet.get(base))
                progeny.append(new_base)
                self.occuredmu.append(site)
                self.alt_allele.append(new_base)
            elif error == 'ins':
                # Insert a random letter right after the letter
                new_base = random.choice(list(self.alphabet))
                progeny.append(base)
                progeny.append(new_base)
                self.occuredins.append(site)
                self.inserted_allele.append(base + new_base)
            else:
                # Delete the letter by simply not adding it to the progeny
                self.occureddel.append(site)
        else:
            # No change is induced at the site in the progeny sequence
            progeny.append(base)

    self.seq = MutableSeq(''.join(progeny), self.alphaproperty)

    # Keep each position list in ascending order, with its alleles aligned.
    # sorted() is used because zip() has no .sort() on Python 3.
    ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
                       key=lambda pair: pair[0])
    self.occuredins = [pair[0] for pair in ins_pairs]
    self.inserted_allele = [pair[1] for pair in ins_pairs]

    mu_pairs = sorted(zip(self.occuredmu, self.alt_allele),
                      key=lambda pair: pair[0])
    self.occuredmu = [pair[0] for pair in mu_pairs]
    self.alt_allele = [pair[1] for pair in mu_pairs]

    self.occureddel.sort()

    if self.verbose:
        print("WARNING: If indel/mutation positions are specified, MuGen.probmu() makes no change at those sites. \n "
              "Use MuGen.posmu() or Mugen.hapchanger() to apply changes at those sites!")
        print("Changes made to the haplotype!")
def parse_vcf(varfile):
    """Yield ``(position, alt_alleles)`` for every data row of a VCF file.

    ``position`` is the POS column converted to a 0-based index and
    ``alt_alleles`` is the ALT column split on commas.  Rows whose first
    field starts with '#' (meta and header lines) and blank rows are
    skipped.  The file is closed once the generator is exhausted or
    discarded.
    """
    with open(varfile) as handle:
        for line in csv.reader(handle, "excel-tab"):
            if not line or line[0].startswith("#"):
                continue
            yield int(line[1]) - 1, line[4].split(',')


# Apply every simple single-base variant from the VCF (argv[2]) to each
# sequence of the FASTA file (argv[1]) and write the result to stdout.
# NOTE(review): the CHROM column is never compared with the record id, so
# every variant is applied to every sequence - confirm this is intended for
# multi-sequence FASTA input.
for seq_record in SeqIO.parse(sys.argv[1], 'fasta'):
    sys.stderr.write("Seq ID = %s, Length = %d\n"
                     % (seq_record.id, len(seq_record)))
    seq = MutableSeq(str(seq_record.seq))
    n = 0
    for pos, var in parse_vcf(sys.argv[2]):
        # Only bi-allelic, single-character ALT alleles are applied.  The
        # VCF placeholders "." (no alternate allele) and "*" (spanning
        # deletion) are not bases and must not be written into the sequence.
        if len(var) != 1 or len(var[0]) != 1 or var[0] in (".", "*"):
            continue
        seq[pos] = var[0]
        n += 1
    SeqIO.write(SeqRecord(Seq(str(seq)), id=seq_record.id),
                sys.stdout, 'fasta')
    sys.stderr.write("Total variants = %d\n" % n)
def hapchanger(self):
    """Apply both specified and random changes to the reference sequence.

    Walks ``self.ref`` site by site.  A site listed in ``self.mupos`` is
    substituted, one listed in ``self.inpos`` gets a random letter inserted
    after it, and one listed in ``self.delpos`` is deleted (checked in that
    order).  At every other site one error type is drawn ('sub' is twice as
    likely as 'ins' or 'del') and is applied only if a random number falls
    below the probability registered for that base in ``self.muprob``,
    ``self.insertprob`` or ``self.deleteprob``.  A base with no registered
    probability is copied unchanged.

    Nothing is returned.  The result is stored on the instance:

    - ``self.seq``: the progeny as a MutableSeq built with
      ``self.alphaproperty``.
    - ``self.occuredmu`` / ``self.alt_allele``: substituted sites and the
      letters substituted in.
    - ``self.occuredins`` / ``self.inserted_allele``: sites after which a
      letter was inserted, and the original letter plus the inserted one.
    - ``self.occureddel``: deleted sites.

    All position lists end up in ascending order.
    """
    self.occuredmu = []
    self.occureddel = []
    self.occuredins = []
    self.inserted_allele = []
    self.alt_allele = []

    progeny = []

    def substitute(site, base):
        # Replace the letter with one from its mutation alphabet.
        new_base = random.choice(self.mualphabet.get(base))
        progeny.append(new_base)
        self.occuredmu.append(site)
        self.alt_allele.append(new_base)

    def insert_after(site, base):
        # Keep the letter and add a random letter right after it.
        new_base = random.choice(list(self.alphabet))
        progeny.append(base)
        progeny.append(new_base)
        self.occuredins.append(site)
        self.inserted_allele.append(base + new_base)

    def delete(site, base):
        # Drop the letter by not adding it to the progeny.
        self.occureddel.append(site)

    apply_error = {'sub': substitute, 'ins': insert_after, 'del': delete}

    # Built once: these do not change while the reference is scanned.
    mupos = set(self.mupos)
    inpos = set(self.inpos)
    delpos = set(self.delpos)
    rates = {'ins': self.insertprob,
             'del': self.deleteprob,
             'sub': self.muprob}

    for site, base in enumerate(self.ref):
        # Making specified changes at the specified positions
        if site in mupos:
            substitute(site, base)
        elif site in inpos:
            insert_after(site, base)
        elif site in delpos:
            delete(site, base)
        else:
            # No change is specified at this position: make a random change
            # according to the probability model.
            error = random.choice(['ins', 'del', 'sub', 'sub'])
            # Random number truncated to five decimals; the error is kept
            # only if it falls below the probability for this base.
            rnd = float(int(random.random() * 100000)) / 100000
            threshold = rates[error].get(base)
            if threshold is not None and rnd < threshold:
                apply_error[error](site, base)
            else:
                progeny.append(base)

    self.seq = MutableSeq(''.join(progeny), self.alphaproperty)

    # Keep each position list in ascending order, with its alleles aligned.
    # sorted() is used because zip() has no .sort() on Python 3.
    ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
                       key=lambda pair: pair[0])
    self.occuredins = [pair[0] for pair in ins_pairs]
    self.inserted_allele = [pair[1] for pair in ins_pairs]

    mu_pairs = sorted(zip(self.occuredmu, self.alt_allele),
                      key=lambda pair: pair[0])
    self.occuredmu = [pair[0] for pair in mu_pairs]
    self.alt_allele = [pair[1] for pair in mu_pairs]

    self.occureddel.sort()

    if self.verbose:
        print("Changes made to the haplotype!")
class StringMethodTests(unittest.TestCase):
    """Check Seq, UnknownSeq and MutableSeq against plain ``str`` behaviour.

    ``_examples`` holds the sequences every generic test iterates over: four
    Seq objects, six UnknownSeq objects, and a MutableSeq copy of each
    non-UnknownSeq entry (appended in the same order).
    """

    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT"),
        Seq("ACGUGGGGU"),
        Seq("GG"),
        Seq("A"),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, character="N"),
        UnknownSeq(12, character="N"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    for seq in _examples[:]:
        if not isinstance(seq, UnknownSeq):
            _examples.append(MutableSeq(seq))
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None]

    @staticmethod
    def _result_or_value_error(obj, method_name, *args):
        """Return ``obj.method_name(*args)``, or ValueError if that is raised."""
        try:
            return getattr(obj, method_name)(*args)
        except ValueError:
            return ValueError

    def _test_method(self, method_name, start_end=False):
        """Check this method matches the plain string's method."""
        self.assertIsInstance(method_name, str)
        outcome = self._result_or_value_error
        for example1 in self._examples:
            if not hasattr(example1, method_name):
                # e.g. MutableSeq does not support transcribe
                continue
            str1 = str(example1)
            for example2 in self._examples:
                if not hasattr(example2, method_name):
                    # e.g. MutableSeq does not support transcribe
                    continue
                str2 = str(example2)
                # Argument given as a plain string
                self.assertEqual(
                    outcome(example1, method_name, str2),
                    outcome(str1, method_name, str2),
                    "%r.%s(%r)" % (example1, method_name, str2),
                )
                # Argument given as a sequence object
                self.assertEqual(
                    outcome(example1, method_name, example2),
                    outcome(str1, method_name, str2),
                    "%r.%s(%r)" % (example1, method_name, example2),
                )
                if not start_end:
                    continue
                for start in self._start_end_values:
                    self.assertEqual(
                        outcome(example1, method_name, str2, start),
                        outcome(str1, method_name, str2, start),
                        "%r.%s(%r, %s)" % (example1, method_name, str2, start),
                    )
                    for end in self._start_end_values:
                        self.assertEqual(
                            outcome(example1, method_name, str2, start, end),
                            outcome(str1, method_name, str2, start, end),
                            "%r.%s(%r, %s, %s)"
                            % (example1, method_name, str2, start, end),
                        )

    def _check_join_with_file(self, *spacers):
        """Join the sequences of Fasta/f003 with each spacer and compare to str.

        Also checks that joining SeqRecord objects (rather than their
        sequences) with the first spacer raises TypeError.
        """
        filename = "Fasta/f003"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]
        for spacer in spacers:
            self.assertEqual(
                spacer.join(seqlist), str(spacer).join(seqlist_as_strings))
        with self.assertRaises(TypeError):
            spacers[0].join(SeqIO.parse(filename, "fasta"))

    def test_str_count(self):
        """Check matches the python string count method."""
        self._test_method("count", start_end=True)
        self.assertEqual(Seq("AC777GT").count("7"), 3)
        self.assertRaises(TypeError, Seq("AC777GT").count, 7)
        self.assertRaises(TypeError, Seq("AC777GT").count, None)

    def test_count_overlap(self):
        """Check count_overlap exception matches python string count method."""
        self.assertEqual(Seq("AC777GT").count("77"), 1)
        self.assertEqual(Seq("AC777GT").count_overlap("77"), 2)
        self.assertEqual(Seq("AC777GT").count_overlap("7"), 3)
        self.assertRaises(TypeError, Seq("AC777GT").count_overlap, 7)
        self.assertRaises(TypeError, Seq("AC777GT").count_overlap, None)

    def test_str_count_overlap_GG(self):
        """Check our count_overlap method using GG."""
        # Testing with self._examples
        expected = [
            3, 3, 1, 0,  # Seq() Tests
            0, 0, 0, 0, 0, 0,  # UnknownSeq() Tests
            3, 3, 1, 0,  # MutableSeq() Tests
        ]
        # assertEqual rather than a bare assert, which is stripped under -O
        self.assertEqual(len(self._examples), len(expected))
        for seq, exp in zip(self._examples, expected):
            # Using search term GG as a string
            self.assertEqual(seq.count_overlap("GG"), exp)
            self.assertEqual(seq.count_overlap("G" * 5), 0)
            # Using search term GG as a Seq
            self.assertEqual(seq.count_overlap(Seq("GG")), exp)
            self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0)

    def test_count_overlap_start_end_GG(self):
        """Check our count_overlap method using GG with variable ends and starts."""
        # Testing Seq() and MutableSeq() with variable start and end arguments
        start_end_exp = [
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ]
        testing_seq = "GTAGGGGAG"
        for start, end, exp in start_end_exp:
            self.assertEqual(
                Seq(testing_seq).count_overlap("GG", start, end), exp)
            self.assertEqual(
                MutableSeq(testing_seq).count_overlap("GG", start, end), exp)

        # Testing Seq() and MutableSeq() with a more heterogeneous sequence
        for seq_class in (Seq, MutableSeq):
            hetero = seq_class("GGGTGGTAGGG")
            self.assertEqual(hetero.count_overlap("GG"), 5)
            self.assertEqual(hetero.count_overlap("GG", 2, 8), 1)
            self.assertEqual(hetero.count_overlap("GG", -11, 6), 3)
            self.assertEqual(hetero.count_overlap("GG", 7, 2), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -2, -10), 0)

        # Testing UnknownSeq() with variable start and end arguments
        char_start_end_exp = [
            ("N", 1, 7, 0),
            ("N", 1, 7, 0),
            ("N", -4, None, 0),
            ("N", -4, None, 0),
            ("X", 1, 7, 0),
        ]
        for char, start, end, exp in char_start_end_exp:
            self.assertEqual(
                UnknownSeq(12, character=char).count_overlap("GG", start, end),
                exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # Testing UnknownSeq() with some more cases including unusual edge cases
        substr_start_end_exp = [
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ]
        for substr, start, end, exp in substr_start_end_exp:
            self.assertEqual(
                UnknownSeq(7, character="N").count_overlap(substr, start, end),
                exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("GG", 1), 0)

    def test_str_count_overlap_NN(self):
        """Check our count_overlap method using NN."""
        # Testing with self._examples
        expected = [
            0, 0, 0, 0,  # Seq() Tests
            0, 0, 0, 11, 0, 0,  # UnknownSeq() Tests
            0, 0, 0, 0,  # MutableSeq() Tests
        ]
        # assertEqual rather than a bare assert, which is stripped under -O
        self.assertEqual(len(self._examples), len(expected))
        for seq, exp in zip(self._examples, expected):
            # Using search term NN as a string
            self.assertEqual(seq.count_overlap("NN"), exp)
            self.assertEqual(seq.count_overlap("N" * 13), 0)
            # Using search term NN as a Seq
            self.assertEqual(seq.count_overlap(Seq("NN")), exp)
            self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0)

    def test_count_overlap_start_end_NN(self):
        """Check our count_overlap method using NN with variable ends and starts."""
        # Testing Seq() and MutableSeq() with variable start and end arguments
        start_end_exp = [
            (1, 7, 0),
            (3, None, 0),
            (3, 6, 0),
            (4, 6, 0),
            (4, -1, 0),
            (-5, None, 0),
            (-5, 7, 0),
            (7, -5, 0),
            (-100, None, 0),
            (None, 100, 0),
            (-100, 1000, 0),
        ]
        testing_seq = "GTAGGGGAG"
        for start, end, exp in start_end_exp:
            self.assertEqual(
                Seq(testing_seq).count_overlap("NN", start, end), exp)
            self.assertEqual(
                MutableSeq(testing_seq).count_overlap("NN", start, end), exp)

        # Testing Seq() and MutableSeq() with a more heterogeneous sequence
        for seq_class in (Seq, MutableSeq):
            hetero = seq_class("GGGTGGTAGGG")
            self.assertEqual(hetero.count_overlap("NN"), 0)
            self.assertEqual(hetero.count_overlap("NN", 2, 8), 0)
            self.assertEqual(hetero.count_overlap("NN", -11, 6), 0)
            self.assertEqual(hetero.count_overlap("NN", 7, 2), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -10, -2), 0)

        # Testing UnknownSeq() with variable start and end arguments
        char_start_end_exp = [
            ("N", 1, 7, 5),
            ("N", 1, 7, 5),
            ("N", -4, None, 3),
            ("N", -4, None, 3),
            ("X", 1, 7, 0),
        ]
        for char, start, end, exp in char_start_end_exp:
            self.assertEqual(
                UnknownSeq(12, character=char).count_overlap("NN", start, end),
                exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # Testing UnknownSeq() with some more cases including unusual edge cases
        substr_start_end_exp = [
            ("N", 100, 105, 0),
            ("N", -1, 4, 0),
            ("N", 4, -1, 2),
            ("N", -8, -2, 5),
            ("N", -2, -8, 0),
            ("N", 8, 2, 0),
            ("N", 2, 8, 5),
            ("NN", 8, 2, 0),
            ("NN", 2, 8, 4),
            ("NN", -5, -1, 3),
            ("NN", 1, 5, 3),
            ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0),
            ("NNN", 1, 2, 0),
        ]
        for substr, start, end, exp in substr_start_end_exp:
            self.assertEqual(
                UnknownSeq(7, character="N").count_overlap(substr, start, end),
                exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("NN", 1), 5)

    def test_str_find(self):
        """Check matches the python string find method."""
        self._test_method("find", start_end=True)
        self.assertEqual(Seq("AC7GT").find("7"), 2)
        self.assertRaises(TypeError, Seq("AC7GT").find, 7)
        self.assertRaises(TypeError, Seq("ACGT").find, None)

    def test_str_rfind(self):
        """Check matches the python string rfind method."""
        self._test_method("rfind", start_end=True)
        self.assertEqual(Seq("AC7GT").rfind("7"), 2)
        self.assertRaises(TypeError, Seq("AC7GT").rfind, 7)
        self.assertRaises(TypeError, Seq("ACGT").rfind, None)

    def test_str_index(self):
        """Check matches the python string index method."""
        self._test_method("index", start_end=True)
        self.assertEqual(Seq("AC7GT").index("7"), 2)
        self.assertRaises(TypeError, Seq("AC7GT").index, 7)
        self.assertRaises(TypeError, Seq("ACGT").index, None)
        self.assertEqual(MutableSeq("AC7GT").index("7"), 2)
        self.assertRaises(TypeError, MutableSeq("AC7GT").index, 7)
        self.assertRaises(TypeError, MutableSeq("ACGT").index, None)

    def test_str_rindex(self):
        """Check matches the python string rindex method."""
        self._test_method("rindex", start_end=True)
        self.assertEqual(Seq("AC7GT").rindex("7"), 2)
        self.assertRaises(TypeError, Seq("AC7GT").rindex, 7)
        self.assertRaises(TypeError, Seq("ACGT").rindex, None)
        self.assertEqual(MutableSeq("AC7GT").rindex("7"), 2)
        self.assertRaises(TypeError, MutableSeq("AC7GT").rindex, 7)
        self.assertRaises(TypeError, MutableSeq("ACGT").rindex, None)

    def test_str_startswith(self):
        """Check matches the python string startswith method."""
        self._test_method("startswith", start_end=True)
        self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))
        self.assertRaises(TypeError, Seq("ACGT").startswith, None)
        self.assertRaises(TypeError, MutableSeq("ACGT").startswith, None)

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            subs = tuple(example1[start:start + 2]
                         for start in range(0, len(example1) - 2, 3))
            subs_str = tuple(str(s) for s in subs)

            self.assertEqual(
                str(example1).startswith(subs_str), example1.startswith(subs))
            self.assertEqual(
                str(example1).startswith(subs_str),
                example1.startswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).startswith(subs_str, 3),
                example1.startswith(subs, 3))
            self.assertEqual(
                str(example1).startswith(subs_str, 2, 6),
                example1.startswith(subs, 2, 6),
            )

    def test_str_endswith(self):
        """Check matches the python string endswith method."""
        self._test_method("endswith", start_end=True)
        self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))
        self.assertRaises(TypeError, Seq("ACGT").endswith, None)

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            subs = tuple(example1[start:start + 2]
                         for start in range(0, len(example1) - 2, 3))
            subs_str = tuple(str(s) for s in subs)

            self.assertEqual(
                str(example1).endswith(subs_str), example1.endswith(subs))
            # A tuple of plain strings must be accepted by endswith as well.
            self.assertEqual(
                str(example1).endswith(subs_str),
                example1.endswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).endswith(subs_str, 3),
                example1.endswith(subs, 3))
            self.assertEqual(
                str(example1).endswith(subs_str, 2, 6),
                example1.endswith(subs, 2, 6))

    def test_str_strip(self):
        """Check matches the python string strip method."""
        self._test_method("strip")
        s = Seq(" ACGT ")
        m = MutableSeq(" ACGT ")
        self.assertEqual(s.strip(), "ACGT")
        self.assertRaises(TypeError, s.strip, 7)
        self.assertEqual(s, " ACGT ")
        self.assertEqual(m.strip(), "ACGT")
        self.assertRaises(TypeError, m.strip, 7)
        self.assertEqual(m, " ACGT ")
        self.assertEqual(m.strip(inplace=True), "ACGT")
        self.assertEqual(m, "ACGT")

    def test_str_lstrip(self):
        """Check matches the python string lstrip method."""
        self._test_method("lstrip")
        s = Seq(" ACGT ")
        m = MutableSeq(" ACGT ")
        self.assertEqual(s.lstrip(), "ACGT ")
        self.assertRaises(TypeError, s.lstrip, 7)
        self.assertEqual(s, " ACGT ")
        self.assertEqual(m.lstrip(), "ACGT ")
        self.assertRaises(TypeError, m.lstrip, 7)
        self.assertEqual(m, " ACGT ")
        self.assertEqual(m.lstrip(inplace=True), "ACGT ")
        self.assertEqual(m, "ACGT ")

    def test_str_rstrip(self):
        """Check matches the python string rstrip method."""
        self._test_method("rstrip")
        s = Seq(" ACGT ")
        m = MutableSeq(" ACGT ")
        self.assertEqual(s.rstrip(), " ACGT")
        self.assertRaises(TypeError, s.rstrip, 7)
        self.assertEqual(s, " ACGT ")
        self.assertEqual(m.rstrip(), " ACGT")
        self.assertRaises(TypeError, m.rstrip, 7)
        self.assertEqual(m, " ACGT ")
        self.assertEqual(m.rstrip(inplace=True), " ACGT")
        self.assertEqual(m, " ACGT")

    def test_str_split(self):
        """Check matches the python string split method."""
        self._test_method("split")
        self.assertEqual(Seq("AC7GT").split("7"), "AC7GT".split("7"))
        self.assertRaises(TypeError, Seq("AC7GT").split, 7)
        self.assertEqual(MutableSeq("AC7GT").split("7"), "AC7GT".split("7"))
        self.assertRaises(TypeError, MutableSeq("AC7GT").split, 7)

    def test_str_rsplit(self):
        """Check matches the python string rsplit method."""
        self._test_method("rsplit")
        self.assertEqual(Seq("AC7GT").rsplit("7"), "AC7GT".rsplit("7"))
        self.assertRaises(TypeError, Seq("AC7GT").rsplit, 7)
        self.assertEqual(MutableSeq("AC7GT").rsplit("7"), "AC7GT".rsplit("7"))
        self.assertRaises(TypeError, MutableSeq("AC7GT").rsplit, 7)

    def test_str_length(self):
        """Check matches the python string __len__ method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(len(example1), len(str1))

    def test_str_upper(self):
        """Check matches the python string upper method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(example1.upper(), str1.upper())

    def test_str_lower(self):
        """Check matches the python string lower method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(example1.lower(), str1.lower())

    def test_str_encode(self):
        """Check matches the python string encode method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(bytes(example1), str1.encode("ascii"))

    def test_str_hash(self):
        """Check the hash matches the plain string's hash (MutableSeq skipped)."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            with warnings.catch_warnings():
                # Silence change in behaviour warning
                warnings.simplefilter("ignore", BiopythonWarning)
                self.assertEqual(
                    hash(str(example1)),
                    hash(example1),
                    "Hash mismatch, %r for %r vs %r for %r"
                    % (hash(str(example1)), id(example1),
                       hash(example1), example1),
                )

    def test_str_comparison(self):
        """Check the six rich comparisons match those of the plain strings."""
        for example1 in self._examples:
            for example2 in self._examples:
                with warnings.catch_warnings():
                    self.assertEqual(
                        str(example1) == str(example2),
                        example1 == example2,
                        "Checking %r == %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) != str(example2),
                        example1 != example2,
                        "Checking %r != %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) < str(example2),
                        example1 < example2,
                        "Checking %r < %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) <= str(example2),
                        example1 <= example2,
                        "Checking %r <= %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) > str(example2),
                        example1 > example2,
                        "Checking %r > %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) >= str(example2),
                        example1 >= example2,
                        "Checking %r >= %r" % (example1, example2),
                    )

    def test_str_getitem(self):
        """Check slicing and indexing works like a string."""
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                if i is not None and abs(i) < len(example1):
                    self.assertEqual(example1[i], str1[i])
                self.assertEqual(example1[:i], str1[:i])
                self.assertEqual(example1[i:], str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(example1[i:j], str1[i:j])
                    for step in range(-3, 4):
                        if step == 0:
                            with self.assertRaises(ValueError) as cm:
                                example1[i:j:step]
                            self.assertEqual(
                                str(cm.exception), "slice step cannot be zero")
                        else:
                            self.assertEqual(
                                example1[i:j:step], str1[i:j:step])

    def test_tomutable(self):
        """Check creating a MutableSeq object."""
        for example1 in self._examples:
            mut = MutableSeq(example1)
            self.assertIsInstance(mut, MutableSeq)
            self.assertEqual(mut, example1)

    def test_toseq(self):
        """Check creating a Seq object."""
        for example1 in self._examples:
            seq = Seq(example1)
            self.assertIsInstance(seq, Seq)
            self.assertEqual(seq, example1)

    def test_the_complement(self):
        """Check obj.complement() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            comp = example1.complement()
            str1 = str(example1)
            if "U" in str1 or "u" in str1:
                mapping = str.maketrans("ACGUacgu", "UGCAugca")
            else:
                # Default to DNA, e.g. complement("A") -> "T" not "U"
                mapping = str.maketrans("ACGTacgt", "TGCAtgca")
            self.assertEqual(str1.translate(mapping), comp)

    def test_the_reverse_complement(self):
        """Check obj.reverse_complement() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            comp = example1.reverse_complement()
            str1 = str(example1)
            if "U" in str1 or "u" in str1:
                mapping = str.maketrans("ACGUacgu", "UGCAugca")
            else:
                # Defaults to DNA, so reverse_complement("A") --> "T" not "U"
                mapping = str.maketrans("ACGTacgt", "TGCAtgca")
            self.assertEqual(str1.translate(mapping)[::-1], comp)

    def test_the_transcription(self):
        """Check obj.transcribe() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            tran = example1.transcribe()
            str1 = str(example1)
            if len(str1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            self.assertEqual(str1.replace("T", "U").replace("t", "u"), tran)

    def test_the_back_transcription(self):
        """Check obj.back_transcribe() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            tran = example1.back_transcribe()
            str1 = str(example1)
            self.assertEqual(str1.replace("U", "T").replace("u", "t"), tran)

    def test_the_translate(self):
        """Check obj.translate() method."""
        for example1 in self._examples:
            if len(example1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            # Called only to confirm the default translation does not raise.
            example1.translate()
            # Try with positional vs named argument:
            self.assertEqual(
                example1.translate(11), example1.translate(table=11))
        # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Check obj.translate() method with stop codons."""
        misc_stops = "TAATAGTGAAGAAGG"
        # The same expectations hold for Seq and MutableSeq.
        for nuc in (Seq(misc_stops), MutableSeq(misc_stops)):
            self.assertEqual("***RR", nuc.translate())
            self.assertEqual("***RR", nuc.translate(1))
            self.assertEqual("***RR", nuc.translate("SGC0"))
            self.assertEqual("**W**", nuc.translate(table=2))
            self.assertEqual(
                "**WRR", nuc.translate(table="Yeast Mitochondrial"))
            self.assertEqual("**WSS", nuc.translate(table=5))
            self.assertEqual("**WSS", nuc.translate(table=9))
            self.assertEqual("**CRR", nuc.translate(table="Euplotid Nuclear"))
            self.assertEqual("***RR", nuc.translate(table=11))
            self.assertEqual("***RR", nuc.translate(table="11"))
            self.assertEqual("***RR", nuc.translate(table="Bacterial"))
            self.assertEqual("**GRR", nuc.translate(table=25))
            self.assertEqual("", nuc.translate(to_stop=True))
            self.assertEqual("O*ORR", nuc.translate(table=special_table))
            self.assertEqual(
                "*QWRR", nuc.translate(table=Chilodonella_uncinata_table))

        # These test the Bio.Seq.translate() function - move these?:
        self.assertEqual(
            "*QWRR", translate(str(nuc), table=Chilodonella_uncinata_table))
        self.assertEqual("O*ORR", translate(str(nuc), table=special_table))
        self.assertEqual("", translate(str(nuc), to_stop=True))
        self.assertEqual("***RR", translate(str(nuc), table="Bacterial"))
        self.assertEqual("***RR", translate(str(nuc), table="11"))
        self.assertEqual("***RR", translate(str(nuc), table=11))
        self.assertEqual("**W**", translate(str(nuc), table=2))
        self.assertEqual(Seq("TAT").translate(), "Y")
        self.assertEqual(Seq("TAR").translate(), "*")
        self.assertEqual(Seq("TAN").translate(), "X")
        self.assertEqual(Seq("NNN").translate(), "X")
        self.assertEqual(Seq("TAt").translate(), "Y")
        self.assertEqual(Seq("TaR").translate(), "*")
        self.assertEqual(Seq("TaN").translate(), "X")
        self.assertEqual(Seq("nnN").translate(), "X")
        self.assertEqual(Seq("tat").translate(), "Y")
        self.assertEqual(Seq("tar").translate(), "*")
        self.assertEqual(Seq("tan").translate(), "X")
        self.assertEqual(Seq("nnn").translate(), "X")

    def test_the_translation_of_invalid_codons(self):
        """Check obj.translate() method with invalid codons."""
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            msg = "Translating %s should fail" % codon
            nuc = Seq(codon)
            with self.assertRaises(TranslationError, msg=msg):
                nuc.translate()
            nuc = MutableSeq(codon)
            with self.assertRaises(TranslationError, msg=msg):
                nuc.translate()

    def test_the_translation_of_ambig_codons(self):
        """Check obj.translate() method with ambiguous codons."""
        for ambig_values in [ambiguous_dna_values, ambiguous_rna_values]:
            ambig = set(ambig_values.keys())
            ambig.remove("X")
            for c1 in ambig:
                for c2 in ambig:
                    for c3 in ambig:
                        # Every unambiguous translation this codon can stand for
                        values = {
                            str(Seq(a + b + c).translate())
                            for a in ambig_values[c1]
                            for b in ambig_values[c2]
                            for c in ambig_values[c3]
                        }
                        t = Seq(c1 + c2 + c3).translate()
                        if t == "*":
                            self.assertEqual(values, set("*"))
                        elif t == "X":
                            self.assertGreater(
                                len(values),
                                1,
                                "translate('%s') = '%s' not '%s'"
                                % (c1 + c2 + c3, t, ",".join(values)),
                            )
                        elif t == "Z":
                            self.assertEqual(values, set("EQ"))
                        elif t == "B":
                            self.assertEqual(values, set("DN"))
                        elif t == "J":
                            self.assertEqual(values, set("LI"))
                        else:
                            self.assertEqual(values, set(t))
                        # TODO - Use the Bio.Data.IUPACData module for the
                        # ambiguous protein mappings?

    def test_init_typeerror(self):
        """Check Seq __init__ gives TypeError exceptions."""
        self.assertRaises(TypeError, Seq, ("A", "C", "G", "T"))
        self.assertRaises(TypeError, Seq, ["A", "C", "G", "T"])
        self.assertRaises(TypeError, Seq, 1)
        self.assertRaises(TypeError, Seq, 1.0)

    def test_MutableSeq_init_typeerror(self):
        """Check MutableSeq __init__ gives TypeError exceptions."""
        self.assertRaises(TypeError, MutableSeq, ("A", "C", "G", "T"))
        self.assertRaises(TypeError, MutableSeq, ["A", "C", "G", "T"])
        self.assertRaises(TypeError, MutableSeq, 1)
        self.assertRaises(TypeError, MutableSeq, 1.0)

    def test_join_Seq_TypeError(self):
        """Checks that a TypeError is thrown for all non-iterable types."""
        # No iterable types which contain non-accepted types either.
        spacer = Seq("NNNNN")
        self.assertRaises(TypeError, spacer.join, 5)
        self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"])

    def test_join_UnknownSeq_TypeError_iter(self):
        """Checks that a TypeError is thrown for all non-iterable types."""
        # No iterable types which contain non-accepted types either.
        spacer = UnknownSeq(5, character="-")
        self.assertRaises(TypeError, spacer.join, 5)
        self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"])

    def test_join_MutableSeq_TypeError_iter(self):
        """Checks that a TypeError is thrown for all non-iterable types."""
        # No iterable types which contain non-accepted types either.
        spacer = MutableSeq("MMMMM")
        self.assertRaises(TypeError, spacer.join, 5)
        self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"])

    def test_join_Seq(self):
        """Checks if Seq join correctly concatenates sequence with the spacer."""
        spacer = Seq("NNNNN")
        self.assertEqual(
            "N" * 15,
            spacer.join([Seq("NNNNN"), Seq("NNNNN")]),
        )

        spacer1 = Seq("")
        spacers = [spacer1, Seq("NNNNN"), Seq("GGG")]
        example_strings = ["ATG", "ATG", "ATG", "ATG"]
        example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # strings with empty spacer
        str_concatenated = spacer1.join(example_strings)
        self.assertEqual(str_concatenated, "".join(example_strings))

        for spacer in spacers:
            seq_concatenated = spacer.join(example_strings_seqs)
            self.assertEqual(
                seq_concatenated, str(spacer).join(example_strings))
            # Now try single sequence arguments, should join the letters
            for target in example_strings + example_strings_seqs:
                self.assertEqual(
                    str(spacer).join(str(target)), str(spacer.join(target)))

    def test_join_UnknownSeq(self):
        """Checks if UnknownSeq join correctly concatenates sequence with the spacer."""
        spacer1 = UnknownSeq(5, character="-")
        spacer2 = UnknownSeq(0, character="-")
        spacers = [spacer1, spacer2]

        self.assertEqual(
            "-" * 15,
            spacer1.join(
                [UnknownSeq(5, character="-"), UnknownSeq(5, character="-")]),
        )
        self.assertEqual(
            "N" * 5 + "-" * 10,
            spacer1.join([Seq("NNNNN"), UnknownSeq(5, character="-")]),
        )

        example_strings = ["ATG", "ATG", "ATG", "ATG"]
        example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # strings with empty spacer
        str_concatenated = spacer2.join(example_strings)
        self.assertEqual(str_concatenated, "".join(example_strings))

        for spacer in spacers:
            seq_concatenated = spacer.join(example_strings_seqs)
            self.assertEqual(
                seq_concatenated, str(spacer).join(example_strings))
            # Now try single sequence arguments, should join the letters
            for target in example_strings + example_strings_seqs:
                self.assertEqual(
                    str(spacer).join(str(target)), str(spacer.join(target)))

    def test_join_MutableSeq_mixed(self):
        """Check MutableSeq objects can be joined."""
        spacer = MutableSeq("NNNNN")
        self.assertEqual(
            "N" * 15,
            spacer.join([MutableSeq("NNNNN"), MutableSeq("NNNNN")]),
        )
        # NOTE(review): join() is evaluated here and its *result* is handed to
        # assertRaises as the callable, so this passes whenever join() itself
        # succeeds (calling the result raises TypeError).  If a mixed join is
        # really meant to raise, pass ``spacer.join`` and the list separately;
        # if it is meant to succeed, assert on the joined value instead.
        self.assertRaises(
            TypeError,
            spacer.join([Seq("NNNNN"), MutableSeq("NNNNN")]),
        )

    def test_join_Seq_with_file(self):
        """Checks if Seq join correctly concatenates sequence from a file with the spacer."""
        self._check_join_with_file(Seq("NNNNN"), Seq(""))

    def test_join_UnknownSeq_with_file(self):
        """Checks if UnknownSeq join correctly concatenates sequence from a file with the spacer."""
        self._check_join_with_file(
            UnknownSeq(0, character="-"), UnknownSeq(5, character="-"))

    def test_join_MutableSeq(self):
        """Checks if MutableSeq join correctly concatenates sequence with the spacer."""
        # Only expect it to take Seq objects and/or strings in an iterable!
        spacer1 = MutableSeq("")
        spacers = [
            spacer1,
            MutableSeq("NNNNN"),
            MutableSeq("GGG"),
        ]
        example_strings = ["ATG", "ATG", "ATG", "ATG"]
        example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # strings with empty spacer
        str_concatenated = spacer1.join(example_strings)
        self.assertEqual(str_concatenated, "".join(example_strings))

        for spacer in spacers:
            seq_concatenated = spacer.join(example_strings_seqs)
            self.assertEqual(
                seq_concatenated, str(spacer).join(example_strings))

    def test_join_MutableSeq_with_file(self):
        """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer."""
        self._check_join_with_file(MutableSeq("NNNNN"), MutableSeq(""))

    def test_equality(self):
        """Test equality when mixing types."""
        self.assertEqual(Seq("6"), "6")
        self.assertNotEqual(Seq("6"), 6)
        self.assertEqual(Seq(""), "")
        self.assertNotEqual(Seq(""), None)
        self.assertEqual(Seq("None"), "None")
        self.assertNotEqual(Seq("None"), None)

        self.assertEqual(MutableSeq("6"), "6")
        self.assertNotEqual(MutableSeq("6"), 6)
        self.assertEqual(MutableSeq(""), "")
        self.assertNotEqual(MutableSeq(""), None)
        self.assertEqual(MutableSeq("None"), "None")
        self.assertNotEqual(MutableSeq("None"), None)

        self.assertEqual(UnknownSeq(1, character="6"), "6")
        self.assertNotEqual(UnknownSeq(1, character="6"), 6)
        self.assertEqual(UnknownSeq(0), "")
        self.assertNotEqual(UnknownSeq(0), None)
def setUp(self):
    """Build the fixture: a test alphabet, a "1234" genome and an organism."""
    # Chained assignment binds the local first, then the attribute, so the
    # attributes are populated in the same order as before.
    alphabet = self.alphabet = TestAlphabet()
    genome = self.genome = MutableSeq("1234", alphabet)
    self.organism = Organism.Organism(genome, fitness_calculator)
def test_tomutable(self):
    """Check that every example can be wrapped in an equal MutableSeq."""
    for source in self._examples:
        mutable = MutableSeq(source)
        # The wrapper must have the right type and compare equal to its input.
        self.assertIsInstance(mutable, MutableSeq)
        self.assertEqual(mutable, source)
str_light_chain_one, str_light_chain_two, "ATGCGTATCGATCGCGATACGATTAGGCGGAT" ] def u_crc32(seq): #NOTE - On Python 2 crc32 could return a signed int, but on Python 3 it is #always unsigned #Docs suggest should use crc32(x) & 0xffffffff for consistency. return crc32(seq) & 0xffffffff for i, seq_str in enumerate(examples): print "Example %i, length %i, %s..." % (i + 1, len(seq_str), seq_str[:10]) #Avoid cross platforms with printing floats by doing conversion explicitly def simple_LCC(s): return "%0.2f" % lcc_simp(s) def windowed_LCC(s): return ", ".join(["%0.2f" % v for v in lcc_mult(s, 20)]) for checksum in [u_crc32, crc64, gcg, seguid, simple_LCC, windowed_LCC]: #First using a string: value = checksum(seq_str) print " %s = %s" % (checksum.__name__, value) #Secondly check it works with a Seq object assert value == checksum(Seq(seq_str, single_letter_alphabet)) #Finally check it works with a MutableSeq object assert value == checksum(MutableSeq(seq_str, single_letter_alphabet))
def create_clusters_from_bowtie(self):
    """
    Fill ``otu_info`` and ``cluster_by_otu`` from the OTU table and BowTie hits.

    In the BowTie records read here:
      - the 'offset' field actually carries the abundance (stored as 'size')
      - the 'ref' field actually carries the cycle offset (start of 'cycle')
    """
    # Pass 1: map every member id to its OTU and open an empty cluster per OTU.
    with open(self.otu_txt) as handle:
        for raw_line in handle:
            otuid, members = raw_line.strip().split(None, 1)
            for member in members.split():
                self.otu_info[member] = otuid
            self.cluster_by_otu[otuid] = {}

    # Pass 2: one single-member cluster per BowTie record, filed under its OTU.
    for rec in BowTieReader(self.input_bowtie, False):
        cid = rec['ID']
        owner = self.otu_info[cid]
        bases = rec['seq']
        cluster = {
            'dirty': True,
            'cids': [cid],
            'len': len(bases),
            'seq': MutableSeq(bases),
            'size': int(rec['offset']),
            # Phred+33 encoded quality characters -> integer scores
            'qual': [ord(ch) - 33 for ch in rec['qual']],
            'cycle': range(int(rec['ref']), int(rec['ref']) + len(bases)),
        }
        self.cluster_by_otu[owner][cid] = cluster
def test_count_overlap_start_end_NN(self):
    """Check count_overlap("NN") with a variety of start and end arguments."""
    # Seq and MutableSeq: the test sequence has no N, so every window gives 0.
    windows = [
        (1, 7),
        (3, None),
        (3, 6),
        (4, 6),
        (4, -1),
        (-5, None),
        (-5, 7),
        (7, -5),
        (-100, None),
        (None, 100),
        (-100, 1000),
    ]
    plain = "GTAGGGGAG"
    for lo, hi in windows:
        self.assertEqual(Seq(plain).count_overlap("NN", lo, hi), 0)
        self.assertEqual(MutableSeq(plain).count_overlap("NN", lo, hi), 0)

    # A more heterogeneous sequence, still without any N.
    mixed = "GGGTGGTAGGG"
    self.assertEqual(Seq(mixed).count_overlap("NN"), 0)
    self.assertEqual(MutableSeq(mixed).count_overlap("NN"), 0)
    for lo, hi in [(2, 8), (-11, 6), (7, 2)]:
        self.assertEqual(Seq(mixed).count_overlap("NN", lo, hi), 0)
        self.assertEqual(MutableSeq(mixed).count_overlap("NN", lo, hi), 0)
    self.assertEqual(Seq(mixed).count_overlap("NN", -10, -2), 0)

    # UnknownSeq of length 12 with different alphabets and fill characters.
    unknown_cases = [
        (generic_rna, "N", 1, 7, 5),
        (generic_dna, "N", 1, 7, 5),
        (generic_rna, "N", -4, None, 3),
        (generic_dna, "N", -4, None, 3),
        (generic_protein, "X", 1, 7, 0),
    ]
    for alphabet, letter, lo, hi, expected in unknown_cases:
        observed = UnknownSeq(12, alphabet, letter).count_overlap("NN", lo, hi)
        self.assertEqual(observed, expected)
    self.assertEqual(
        UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

    # UnknownSeq of seven N's: more cases, including unusual edge cases
    # (out-of-range, reversed and negative bounds, oversized substrings).
    edge_cases = [
        ("N", 100, 105, 0),
        ("N", -1, 4, 0),
        ("N", 4, -1, 2),
        ("N", -8, -2, 5),
        ("N", -2, -8, 0),
        ("N", 8, 2, 0),
        ("N", 2, 8, 5),
        ("NN", 8, 2, 0),
        ("NN", 2, 8, 4),
        ("NN", -5, -1, 3),
        ("NN", 1, 5, 3),
        ("NNN", None, None, 5),
        ("NNNNNNNNN", None, None, 0),
        ("NNN", 1, 2, 0),
    ]
    for needle, lo, hi, expected in edge_cases:
        observed = UnknownSeq(7, character="N").count_overlap(needle, lo, hi)
        self.assertEqual(observed, expected)
    self.assertEqual(
        UnknownSeq(7, character="N").count_overlap("NN", 1), 5)
def get_optimal_alignment(self):
    """Walk the traceback pointers and return the two aligned sequences.

    Starts at the bottom corner cell of the matrix and follows each cell's
    parent until a cell without a parent is reached. Returns a tuple of two
    Seq objects (aligned seq1, aligned seq2).
    """
    def _empty_row():
        # fresh mutable, gapped protein sequence for one alignment row
        return MutableSeq(array.array("c"),
                          Alphabet.Gapped(IUPAC.protein, GAP_CHAR))

    row1 = _empty_row()
    row2 = _empty_row()

    # the bottom corner cell always contributes both of its items
    corner = self.dpmatrix[(len(self.seq1), len(self.seq2))]
    row1.append(corner.seq1item)
    row2.append(corner.seq2item)

    cell = corner.get_parent()
    parent = cell.get_parent()

    # keep adding letters for as long as the current cell has a parent
    while parent:
        # how far the pointer moves back in (column, row)
        step = (cell.col_pos - parent.col_pos,
                cell.row_pos - parent.row_pos)
        if step == (1, 1):
            # diagonal move: a letter from each sequence
            row1.append(cell.seq1item)
            row2.append(cell.seq2item)
        elif step == (0, 1):
            # same column, previous row: gap in seq1, letter from seq2
            row1.append(GAP_CHAR)
            row2.append(cell.seq2item)
        elif step == (1, 0):
            # previous column, same row: letter from seq1, gap in seq2
            row1.append(cell.seq1item)
            row2.append(GAP_CHAR)

        cell, parent = parent, parent.get_parent()

    # the rows were collected back to front, so flip them before returning
    row1.reverse()
    row2.reverse()
    return row1.toseq(), row2.toseq()
class TestMutableSeq(unittest.TestCase):
    """Tests for MutableSeq built on the 18-letter DNA "TCAAAAGGATGCATCATG"."""

    def setUp(self):
        """Create an immutable Seq and a MutableSeq with the same letters."""
        self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways."""
        # from a string, from Seq.tomutable(), and from an array.array
        mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)
        self.assertIsInstance(mutable_s, MutableSeq, "Creating MutableSeq")
        mutable_s = self.s.tomutable()
        self.assertIsInstance(mutable_s, MutableSeq, "Converting Seq to mutable")
        array_seq = MutableSeq(array.array(array_indicator, "TCAAAAGGATGCATCATG"), IUPAC.ambiguous_dna)
        self.assertIsInstance(array_seq, MutableSeq, "Creating MutableSeq using array")

    def test_repr(self):
        """Test repr() of a short MutableSeq shows all letters and the alphabet."""
        self.assertEqual("MutableSeq('TCAAAAGGATGCATCATG', IUPACAmbiguousDNA())", repr(self.mutable_s))

    def test_truncated_repr(self):
        """Test repr() of a long MutableSeq is shortened with '...'."""
        seq = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        expected = "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(MutableSeq(seq, IUPAC.ambiguous_dna)))

    def test_equal_comparison(self):
        """Test __eq__ comparison method."""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_equal_comparison_of_incompatible_alphabets(self):
        """Run == against an RNA MutableSeq with warnings captured."""
        # The result is discarded: this only checks the comparison runs.
        with warnings.catch_warnings(record=True):
            self.mutable_s == MutableSeq('UCAAAAGGA', IUPAC.ambiguous_rna)

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method."""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """Test __lt__ comparison method."""
        # the sequence minus its last letter sorts before the full sequence
        self.assertTrue(self.mutable_s[:-1] < self.mutable_s)

    def test_less_than_comparison_of_incompatible_alphabets(self):
        """Run < against an RNA MutableSeq with warnings captured."""
        # The result is discarded: this only checks the comparison runs.
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] < MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)

    def test_less_than_comparison_without_alphabet(self):
        """Test __lt__ against a plain string."""
        self.assertTrue(self.mutable_s[:-1] < "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method."""
        self.assertTrue(self.mutable_s[:-1] <= self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_alphabets(self):
        """Run <= against an RNA MutableSeq with warnings captured."""
        # The result is discarded: this only checks the comparison runs.
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] <= MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """Test __le__ against a plain string."""
        self.assertTrue(self.mutable_s[:-1] <= "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq."""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """Test __radd__ with another MutableSeq concatenates the letters."""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", self.mutable_s.__radd__(self.mutable_s))

    def test_radd_method_incompatible_alphabets(self):
        """Test __radd__ with an RNA MutableSeq raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(MutableSeq("UCAAAAGGA", IUPAC.ambiguous_rna))

    def test_radd_method_using_seq_object(self):
        """Test __radd__ with an immutable Seq concatenates the letters."""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG", self.mutable_s.__radd__(self.s))

    def test_radd_method_wrong_type(self):
        """Test __radd__ with an integer raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(1234)

    def test_as_string(self):
        """Test str() gives back the letters."""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """Test len() gives the number of letters (18)."""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """Test toseq() returns a Seq object."""
        self.assertIsInstance(self.mutable_s.toseq(), Seq.Seq)

    def test_first_nucleotide(self):
        """Test indexing position 0 gives the first letter."""
        self.assertEqual('T', self.mutable_s[0])

    def test_setting_slices(self):
        """Test reading a slice, then assigning slices from str, MutableSeq and array."""
        # Each assignment builds on the result of the previous one, so the
        # statement order matters for the expected values below.
        self.assertEqual(MutableSeq('CAAA', IUPAC.ambiguous_dna), self.mutable_s[1:5], "Slice mutable seq")
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(MutableSeq("TGATAAAGGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s, "Set slice with string and adding extra nucleotide")
        self.mutable_s[1:3] = self.mutable_s[5:7]
        self.assertEqual(MutableSeq("TAATAAAGGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s, "Set slice with MutableSeq")
        self.mutable_s[1:3] = array.array(array_indicator, "GAT")
        self.assertEqual(MutableSeq("TGATTAAAGGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s, "Set slice with array")

    def test_setting_item(self):
        """Test assigning a single letter by index."""
        self.mutable_s[3] = "G"
        self.assertEqual(MutableSeq("TCAGAAGGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s)

    def test_deleting_slice(self):
        """Test deleting a one-letter slice."""
        del self.mutable_s[4:5]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s)

    def test_deleting_item(self):
        """Test deleting a single letter by index."""
        del self.mutable_s[3]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s)

    def test_appending(self):
        """Test append() adds a letter at the end."""
        self.mutable_s.append("C")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGC", IUPAC.ambiguous_dna), self.mutable_s)

    def test_inserting(self):
        """Test insert() adds a letter at the given index."""
        self.mutable_s.insert(4, "G")
        self.assertEqual(MutableSeq("TCAAGAAGGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s)

    def test_popping_last_item(self):
        """Test pop() returns the last letter."""
        self.assertEqual("G", self.mutable_s.pop())

    def test_remove_items(self):
        """Test remove() drops the first match and raises ValueError if absent."""
        self.mutable_s.remove("G")
        self.assertEqual(MutableSeq("TCAAAAGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s, "Remove first G")
        self.assertRaises(ValueError, self.mutable_s.remove, 'Z')

    def test_count(self):
        """Test count() for a single letter and for a two-letter substring."""
        self.assertEqual(7, self.mutable_s.count("A"))
        self.assertEqual(2, self.mutable_s.count("AA"))

    def test_index(self):
        """Test index() finds the first match and raises ValueError if absent."""
        self.assertEqual(2, self.mutable_s.index("A"))
        self.assertRaises(ValueError, self.mutable_s.index, "8888")

    def test_reverse(self):
        """Test using reverse method."""
        self.mutable_s.reverse()
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT", IUPAC.ambiguous_dna), self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride."""
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT", IUPAC.ambiguous_dna), self.mutable_s[::-1])

    def test_complement(self):
        """Test complement() on the DNA fixture, checked via str()."""
        self.mutable_s.complement()
        self.assertEqual(str("AGTTTTCCTACGTAGTAC"), str(self.mutable_s))

    def test_complement_rna(self):
        """Test complement() on mixed-case RNA with an RNA alphabet."""
        seq = Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna)
        seq.complement()
        self.assertEqual(str("UACuuuGAC"), str(seq))

    def test_complement_mixed_aphabets(self):
        """Test complement() raises ValueError when both U and T are present."""
        seq = Seq.MutableSeq("AUGaaaCTG")
        with self.assertRaises(ValueError):
            seq.complement()

    def test_complement_rna_string(self):
        """Test complement() on RNA letters given without an alphabet."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual('UACuuuGAC', str(seq))

    def test_complement_dna_string(self):
        """Test complement() on DNA letters given without an alphabet."""
        seq = Seq.MutableSeq("ATGaaaCTG")
        seq.complement()
        self.assertEqual('TACtttGAC', str(seq))

    def test_reverse_complement(self):
        """Test reverse_complement() on the DNA fixture, checked via str()."""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_reverse_complement_of_protein(self):
        """Test reverse_complement() raises ValueError for a protein alphabet."""
        seq = Seq.MutableSeq("ACTGTCGTCT", Alphabet.generic_protein)
        with self.assertRaises(ValueError):
            seq.reverse_complement()

    def test_to_string_method(self):
        """Call the deprecated tostring() method with warnings captured.

        This method is currently deprecated, probably will need to remove this
        test soon.
        """
        with warnings.catch_warnings(record=True):
            self.mutable_s.tostring()

    def test_extend_method(self):
        """Test extend() with a plain string."""
        self.mutable_s.extend("GAT")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGGAT", IUPAC.ambiguous_dna), self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """Test extend() with another MutableSeq."""
        self.mutable_s.extend(MutableSeq("TTT", IUPAC.ambiguous_dna))
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGTTT", IUPAC.ambiguous_dna), self.mutable_s)

    def test_delete_stride_slice(self):
        """Test deleting the slice written as 4:6 - 1."""
        # NOTE(review): "4:6 - 1" evaluates to the plain slice 4:5, the same
        # deletion as test_deleting_slice; a stride such as 4:6:-1 may have
        # been intended -- confirm before changing.
        del self.mutable_s[4:6 - 1]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna), self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)."""
        self.assertEqual(MutableSeq("TAGTAA", IUPAC.ambiguous_dna), self.mutable_s[0::3])
        self.assertEqual(MutableSeq("CAGGTT", IUPAC.ambiguous_dna), self.mutable_s[1::3])
        self.assertEqual(MutableSeq("AAACCG", IUPAC.ambiguous_dna), self.mutable_s[2::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)."""
        self.mutable_s[2::3] = "N" * len(self.mutable_s[2::3])
        self.assertEqual(MutableSeq("TCNAANGGNTGNATNATN", IUPAC.ambiguous_dna), self.mutable_s)
def test_reverse_complement_mutable_seq(self):
    """Check reverse_complement() on a SeqRecord that wraps a MutableSeq."""
    record = SeqRecord(MutableSeq("ACTG"))
    flipped = record.reverse_complement()
    self.assertEqual("CAGT", str(flipped.seq))
## checkSynonymity(resultDict[gene]) ## ## except: ## pass positionList = [] for sub_element, value in resultDict[gene].items(): converted = classifydict(value) for nucleotide, valuen in converted.items(): if int(valuen[0]) > 0 and float(valuen[2]) < float(args.minqual): count +=1 # now check for synonymity, move this to functions alternativeSeq = MutableSeq(str(element.seq[start:stop]), generic_dna) ## print nucleotide alternativeSeq = mutateSequence(alternativeSeq,sub_element,nucleotide,start) alternativeSeq = Seq(str(alternativeSeq), generic_dna) if len(alternativeSeq)%3 != 0: overlap = len(alternativeSeq)%3 alternativeSeq = alternativeSeq[:-int(overlap)] altprot = alternativeSeq.translate() altprot = list2dict(altprot[0:len(protein)],0) protposition = int((sub_element-start)/3) try: if protein[protposition] != altprot[protposition]: positionList.append(sub_element) synonym = 'NonSynon'
leaf_names = [leaf.name for leaf in leaves] z = 'ancestral' for l in leaf_names: seqLIST.append([l, [RCstart, RCstop, z]]) # Build dictionary of recombinant regions for k,v in seqLIST: d[k].append(v) # Mask recombination in sequences seqALN = [] for record in SeqIO.parse(args.aln, 'fasta'): msg('Reading {} ... '.format(record.id)) seqlen = len(record.seq) regions = d.get(record.id, None) newrec = MutableSeq(str(record.seq)) if regions: for a in regions: start = int(a[0]) - 1 end = int(a[1]) lenMASK = end - start newrec[start:end] = (args.symbol)*lenMASK seqALN.append(SeqRecord(Seq(str(newrec)), record.id, description='')) # Write masked alignment to file msg('Writing masked alignment to {} ... '.format(args.out)) SeqIO.write(seqALN, args.out, 'fasta') # Write recombinant regions to file if args.regions: with open(args.regions, 'w') as csvfile:
if VERBOSE >= 1: print pname+':', gene, 'not found or with problems: skipping.' continue gene_pos = gene_poss[gene] aft_der_gene = np.concatenate([aft_der[:, :, exon_pos[0]: exon_pos[1]] for exon_pos in gene_pos], axis=2) conss_gene = gene_seqs[gene] gene_len = len(conss_gene) hist += np.histogram(aft_der_gene.ravel(), bins=bins, density=False)[0] # Collect counts syn/nonsyn nu_syn = [] nu_nonsyn = [] cod_anc = MutableSeq('AAA', unambiguous_dna) cod_new = MutableSeq('AAA', unambiguous_dna) for j in xrange(gene_len // 3): for jcod in xrange(3): for ai in xrange(4): cod_anc[:] = conss_gene[3 * j: 3 * (j+1)] # Ancestral allele, skip (we only look at propagation of MINOR alleles) if alpha[ai] == cod_anc[jcod]: continue cod_new[:] = conss_gene[3 * j: 3 * (j+1)] cod_new[jcod] = alpha[ai] aftmp = aft_der_gene[:, ai, j + jcod] aftmp = aftmp[(aftmp >= bins[0]) & (aftmp <= bins[-1])] if not len(aftmp):
# Tour of Seq, MutableSeq and SeqRecord (Python 2 print statements).
seq=Seq('ATGGTCTTTCCAGACGCG',IUPAC.unambiguous_dna)
print Seq.transcribe(seq) # transcribe invoked through the class, passing seq explicitly
print seq[:5] # slicing works as it does on a string
print len(seq)
#seq[0]='C' # left disabled: Seq objects are not mutable
st=str(seq) # convert to a plain string
print st

# Editable (mutable) sequence data type
from Bio.Seq import MutableSeq
mut_seq=seq.tomutable() # convert it to a mutable sequence type
print mut_seq
mut_seq[0]='C'
print mut_seq
mut_seq=MutableSeq('ATGCCG',IUPAC.IUPACUnambiguousDNA())
# has list-like methods: append(), insert(), pop(), remove()
mut_seq[1:3]='TTT'
mut_seq.reverse()
mut_seq.complement()
print mut_seq
mut_seq.reverse_complement()
print mut_seq

# Sequence metadata data type
from Bio.SeqRecord import SeqRecord
seqrec=SeqRecord(seq,id='001', name='My Secuencia')
# 2 main attributes:
# id: string identifier, optional, recommended
# seq: Seq object, required
# additional attributes
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    This implements the Viterbi algorithm (see pgs 55-57 in Durbin et al
    for a full explanation -- this is where I took my implementation
    ideas from), to allow decoding of the state path, given a sequence
    of emissions.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we want
    to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns a tuple ``(state_path, score)``: the decoded path as a Seq over
    state_alphabet, and the best termination score, which is a sum of
    log-transformed values.
    """
    # calculate logarithms of the transition and emission probs
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters

    # --- initialization
    #
    # NOTE: My index numbers are one less than what is given in Durbin
    # et al, since we are indexing the sequence going from 0 to
    # (Length - 1) not 1 to Length, like in Durbin et al.
    #
    # NOTE(review): the two seeds below are the raw probabilities 1 and 0,
    # although everything added to them later is a logarithm. The log-space
    # equivalents would be 0 and -infinity. Left untouched because changing
    # them alters which paths are decoded -- confirm the intent first.
    #
    # v_{0}(0) = 1
    viterbi_probs[(state_letters[0], -1)] = 1
    # v_{k}(0) = 0 for k > 0
    for state_letter in state_letters[1:]:
        viterbi_probs[(state_letter, -1)] = 0

    # --- recursion
    # loop over the training sequence (i = 1 .. L)
    for i in range(0, len(sequence)):
        # now loop over all of the letters in the state path
        for main_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(main_state, sequence[i])]

            # loop over all possible states
            # NOTE(review): log_trans is indexed (cur_state, main_state), so
            # cur_state acts as the predecessor of main_state; presumably
            # transitions_from(main_state) must therefore yield predecessors
            # -- verify against its definition.
            possible_state_probs = {}
            for cur_state in self.transitions_from(main_state):
                # a_{kl}
                trans_part = log_trans[(cur_state, main_state)]

                # v_{k}(i - 1)
                viterbi_part = viterbi_probs[(cur_state, i - 1)]
                cur_prob = viterbi_part + trans_part

                possible_state_probs[cur_state] = cur_prob

            # finally calculate the viterbi probability using the max
            max_prob = max(possible_state_probs.values())
            viterbi_probs[(main_state, i)] = (emission_part + max_prob)

            # now get the most likely state (first one reaching the max)
            for state in possible_state_probs:
                if possible_state_probs[state] == max_prob:
                    pred_state_seq[(i - 1, main_state)] = state
                    break

    # --- termination
    # calculate the probability of the state path
    # loop over all letters
    all_probs = {}
    for state in state_letters:
        # v_{k}(L)
        viterbi_part = viterbi_probs[(state, len(sequence) - 1)]
        # a_{k0}
        transition_part = log_trans[(state, state_letters[0])]

        # Both terms are logarithms, so the product of the underlying
        # probabilities is their sum; multiplying two logs is meaningless
        # and would rank the paths wrongly.
        all_probs[state] = viterbi_part + transition_part

    state_path_prob = max(all_probs.values())

    # find the last pointer we need to trace back from
    # (no break: on a tie the last matching state wins)
    last_state = ''
    for state in all_probs:
        if all_probs[state] == state_path_prob:
            last_state = state

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    traceback_seq = MutableSeq('', state_alphabet)

    cur_state = last_state
    # reversed(range(...)) works on Python 2 and 3; range(...).reverse()
    # only works where range returns a list.
    for i in reversed(range(len(sequence))):
        traceback_seq.append(cur_state)
        cur_state = pred_state_seq[(i - 1, cur_state)]

    # put the traceback sequence in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
def viterbi(self, sequence, state_alphabet):
    """Calculate the most probable state path using the Viterbi algorithm.

    This implements the Viterbi algorithm (see pgs 55-57 in Durbin et al
    for a full explanation -- this is where I took my implementation
    ideas from), to allow decoding of the state path, given a sequence
    of emissions.

    Arguments:

    o sequence -- A Seq object with the emission sequence that we want
    to decode.

    o state_alphabet -- The alphabet of the possible state sequences
    that can be generated.

    Returns a tuple ``(state_path, score)``: the decoded path as a Seq over
    state_alphabet, and the largest final Viterbi value, which is a sum of
    log-transformed values.

    NOTE: an empty sequence is not handled; the termination step looks up
    position -1, which is never stored.
    """
    # calculate logarithms of the initial, transition, and emission probs
    log_initial = self._log_transform(self.initial_prob)
    log_trans = self._log_transform(self.transition_prob)
    log_emission = self._log_transform(self.emission_prob)

    viterbi_probs = {}
    pred_state_seq = {}
    state_letters = state_alphabet.letters

    # --- recursion
    # loop over the training sequence (i = 1 .. L)
    # NOTE: My index numbers are one less than what is given in Durbin
    # et al, since we are indexing the sequence going from 0 to
    # (Length - 1) not 1 to Length, like in Durbin et al.
    for i in range(0, len(sequence)):
        # loop over all of the possible i-th states in the state path
        for cur_state in state_letters:
            # e_{l}(x_{i})
            emission_part = log_emission[(cur_state, sequence[i])]

            if i == 0:
                # for the first state, use the initial probability rather
                # than looking back to previous states
                max_prob = log_initial[cur_state]
            else:
                # loop over all possible (i-1)-th previous states
                possible_state_probs = {}
                for prev_state in self.transitions_to(cur_state):
                    # a_{kl}
                    trans_part = log_trans[(prev_state, cur_state)]

                    # v_{k}(i - 1)
                    viterbi_part = viterbi_probs[(prev_state, i - 1)]
                    cur_prob = viterbi_part + trans_part

                    possible_state_probs[prev_state] = cur_prob

                # calculate the viterbi probability using the max
                max_prob = max(possible_state_probs.values())

            # v_{k}(i)
            viterbi_probs[(cur_state, i)] = (emission_part + max_prob)

            if i > 0:
                # get the most likely prev_state leading to cur_state
                # (first one reaching the max)
                for state in possible_state_probs:
                    if possible_state_probs[state] == max_prob:
                        pred_state_seq[(i - 1, cur_state)] = state
                        break

    # --- termination
    # calculate the probability of the state path
    # loop over all states
    all_probs = {}
    for state in state_letters:
        # v_{k}(L)
        all_probs[state] = viterbi_probs[(state, len(sequence) - 1)]

    state_path_prob = max(all_probs.values())

    # find the last pointer we need to trace back from
    # (no break: on a tie the last matching state wins)
    last_state = ''
    for state in all_probs:
        if all_probs[state] == state_path_prob:
            last_state = state

    assert last_state != '', "Didn't find the last state to trace from!"

    # --- traceback
    traceback_seq = MutableSeq('', state_alphabet)

    # last_state is the last state in the most probable state sequence.
    # Compute that sequence by walking backwards in time. From the i-th
    # state in the sequence, find the (i-1)-th state as the most
    # probable state preceding the i-th state.
    state = last_state
    traceback_seq.append(state)
    # reversed(range(...)) works on Python 2 and 3; range(...).reverse()
    # only works where range returns a list.
    for i in reversed(range(1, len(sequence))):
        state = pred_state_seq[(i - 1, state)]
        traceback_seq.append(state)

    # put the traceback sequence in the proper orientation
    traceback_seq.reverse()

    return traceback_seq.toseq(), state_path_prob
#print gene
#YAAX = yaaX.translate(table='Bacterial', cds=True, to_stop=True)
#print YAAX

#playing with codon usage tables
#from Bio.Data import CodonTable
#standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
#mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
#print standard_table

# mutable seq objects
from Bio.Seq import Seq
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC

#my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
#mutable_seq = my_seq.tomutable()
# Or just create a mutable seq!
my_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
print(my_seq)  # single-argument print(): same output on Python 2 and 3

#my_seq_div = my_seq
#my_seq_div[5:8] = 'tag' #how to do insertions???????? only can replace as many characters as indicated. wait it works now. #why 5:8?
#print my_seq #why does this print as my_seq_div with SNP?
#print my_seq_div
#my_seq_del = my_seq_div.remove("T")
#print my_seq_del

# MutableSeq.reverse() reverses in place and returns None (like
# list.reverse()), so binding its return value printed None. Reverse first,
# then bind the name to the reversed object.
# my_seq.reverse_complement() is also applied in place.
my_seq.reverse()
my_seq_rev = my_seq  # same object as my_seq, not a copy
print(my_seq_rev)

# my_seq_div is only assigned in the commented-out lines above, so using it
# here raised NameError; convert the live object instead.
fin_seq = my_seq.toseq()  # converts back to an immutable Seq object
# Removing every T from a sequence with a while loop
from Bio.Seq import MutableSeq  # the mutable sequence class
from Bio.Alphabet import IUPAC  # the IUPAC alphabets

# Build the MutableSeq called mutable_seq.
# The alphabet argument is optional; the code also works without it.
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA",
                         IUPAC.unambiguous_dna)

# Keep going while at least one T is left; each pass removes one T.
while mutable_seq.count("T") > 0:
    mutable_seq.remove("T")

print(mutable_seq)  # display what is left
"AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT" + "TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT" + "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA", generic_dna) print(gene.translate(table="Bacterial")) print(gene.translate(table="Bacterial", cds=True)) ##查看密码子表 from Bio.Data import CodonTable standard_table = CodonTable.unambiguous_dna_by_name["Standard"] mito_table = CodonTable.unambiguous_dna_by_id[2] print(standard_table) print(mito_table.start_codons) print(mito_table.stop_codons) print(mito_table.forward_table["ACG"]) ##可变对象 from Bio.Seq import MutableSeq mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna) print(mutable_seq) mutable_seq[5] = "C" print(mutable_seq) mutable_seq.remove("T") print(mutable_seq) mutable_seq.reverse() print(mutable_seq) new_seq = mutable_seq.toseq() print(new_seq)
def __init__(self, seqFile, format="fasta"):
    """Read every record of *seqFile* and store it with a mutable sequence.

    Arguments:

    o seqFile -- file name or handle, passed unchanged to SeqIO.parse.

    o format -- SeqIO format name of the file (default "fasta").

    Each parsed record has its ``seq`` replaced by a MutableSeq holding the
    same letters and is then appended to this container.
    """
    for seq in SeqIO.parse(seqFile, format):
        # str() gives the same letters as the deprecated Seq.tostring() and
        # keeps working on Biopython releases that no longer provide it.
        seq.seq = MutableSeq(str(seq.seq))
        self.append(seq)
class MuGen(object):
    """Generate a mutated haplotype from a reference sequence.

    Performs substitutions, deletions and insertions with the desired
    probabilities and at the desired positions.

    Args:
        seq: The reference, given as a ``str``, a ``Seq`` or a ``MutableSeq``.
        alphaproperty: Alphabet object used when *seq* is a plain string;
            a generic ``Alphabet()`` is used when omitted.
        insertprob, deleteprob, muprob: Dictionaries mapping a letter of the
            sequence to the probability (between 0 and 1) of an insertion
            after it, of its deletion, or of its substitution.  Letters that
            are not listed get probability 0.
        mualphabet: Dictionary mapping each letter to a string of the letters
            it may be substituted with.  Letters that are not listed may
            mutate to any other letter of the alphabet except ``'N'``.
        mupos, delpos, inpos: Zero-based positions on the reference at which
            a substitution, deletion or insertion is forced.
        verbose: When ``True``, warnings and progress messages are printed.

    Invalid arguments are reported on standard output and terminate the
    program with exit status 2.  Sequences are case sensitive.

    The mutated sequence is stored on the object; read it with ``get_hap()``
    after calling ``probmu()``, ``posmu()`` or ``hapchanger()``.
    """

    def __init__(self, seq, alphaproperty=None, insertprob=None,
                 deleteprob=None, mualphabet=None, muprob=None, mupos=None,
                 delpos=None, inpos=None, verbose=False):
        try:
            # History of the changes made to the reference; needed to write
            # haplotypes in the format of haplotyping software.
            self._reset_history()
            if not isinstance(verbose, bool):
                raise CustomException(
                    "ERROR: verbose must be set to either True or False. "
                    "Default is to False")
            self.verbose = verbose
            if isinstance(seq, str):
                if alphaproperty is None:
                    if self.verbose:
                        print("WARNING: No alphabet type is specified for "
                              "the sequence string!")
                    self.alphaproperty = Alphabet()
                else:
                    self.alphaproperty = alphaproperty
                self.seq = MutableSeq(seq, self.alphaproperty)
            elif isinstance(seq, Seq):
                self.alphaproperty = seq.alphabet
                self.seq = seq.tomutable()
            elif isinstance(seq, MutableSeq):
                self.alphaproperty = seq.alphabet
                # Work on a private copy so the caller's object is untouched.
                self.seq = copy.deepcopy(seq)
            else:
                raise CustomException(
                    "ERROR: Should provide a Seq or MutableSeq object, \n "
                    "or a string sequence!")
            self.alphabet = set(str(self.seq))
            self.ref = str(self.seq)
            self.delpos = self._checked_positions(delpos or [], "Deletion")
            self.inpos = self._checked_positions(inpos or [], "Insertion")
            self.mupos = self._checked_positions(mupos or [], "Mutation")
            # May enlarge self.alphabet, so it runs before the probabilities.
            self.mualphabet = self._resolved_mualphabet(mualphabet)
            self.insertprob = self._normalized_prob(insertprob, "Insertion")
            self.deleteprob = self._normalized_prob(deleteprob, "Deletion")
            self.muprob = self._normalized_prob(muprob, "Mutation")
        except CustomException as instance:
            print(instance)
            sys.exit(2)
        else:
            if self.verbose:
                print("MuGen object successfully created.\n"
                      "WARNING: MuGen sequence is case sensitive!")

    def __repr__(self):
        return ("Haplotype: %s, \n Reference sequence: %s, \n "
                "Mutation probabilty: %s, \n Mutations: %s, \n "
                "Insertion probabilty: %s, \n Deletion Probability: %s, \n "
                "Insertion positions: %s, \n Deletion positions: %s, \n "
                "Mutation positions: %s \n" % (
                    self.seq, self.ref, self.muprob, self.mualphabet,
                    self.insertprob, self.deleteprob, self.inpos,
                    self.delpos, self.mupos))

    def __str__(self):
        return repr(self)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _reset_history(self):
        """Forget the changes recorded by a previous mutation run."""
        self.occureddel = []       # sites deleted in the haplotype
        self.occuredmu = []        # sites substituted in the haplotype
        self.occuredins = []       # sites followed by an inserted letter
        self.inserted_allele = []  # reference letter + inserted letter
        self.alt_allele = []       # substituted letters

    def _checked_positions(self, positions, label):
        """Return *positions* as a list, or raise if any is off the reference."""
        positions = list(positions)
        if not set(positions).issubset(range(len(self.ref))):
            raise CustomException(
                "ERROR: %s positions exceed the range of the reference or "
                "are not positive integers!" % label)
        return positions

    def _random_targets(self, letter):
        """Return every alphabet letter *letter* may mutate to by default."""
        return ''.join(self.alphabet - {letter, 'N'})

    def _resolved_mualphabet(self, mualphabet):
        """Validate *mualphabet* and complete it for the whole alphabet.

        Letters used by *mualphabet* but absent from the sequence alphabet
        are added to ``self.alphabet``.  Raises ``CustomException`` when a
        key is not a single letter or a letter is mapped onto itself.
        """
        if not mualphabet:
            if self.verbose:
                print("WARNING: You have specified no mutation alphabet! "
                      "Mutations are set to random letters!")
            return {letter: self._random_targets(letter)
                    for letter in self.alphabet}
        spec = {str(key): str(value) for key, value in mualphabet.items()}
        for key, value in spec.items():
            if len(key) != 1:
                raise CustomException(
                    "ERROR: the mutation alphabet deals with point "
                    "mutations! Only single letters are allowed as keys!")
            if key in value:
                raise CustomException(
                    "ERROR: Wrong mutation values specified! A letter could "
                    "just be substituted with a different letter for "
                    "mutation!")
        keys = set(spec)
        targets = set(''.join(spec.values()))
        if keys == self.alphabet and targets <= self.alphabet:
            return spec
        if keys < self.alphabet and targets <= self.alphabet:
            if self.verbose:
                print("WARNING: Mutation is not specified for some letters! "
                      "Those mutations are set to random letters!")
        else:
            if self.verbose:
                print("WARNING: Mutation alphabet is not compatible with "
                      "sequence alphabet! Both alphabets are updated and\n"
                      "unspecified mutations are set to random letters!")
            # Mutation may introduce novel alleles, so the alphabet is
            # updated before the unspecified letters are filled in.
            self.alphabet |= keys | targets
        # Whatever was specified is kept intact; the rest may mutate to any
        # other letter.
        for letter in self.alphabet - keys:
            spec[letter] = self._random_targets(letter)
        return spec

    def _normalized_prob(self, prob, label):
        """Return a validated copy of *prob* covering the whole alphabet.

        An empty or missing *prob* yields probability 0 for every letter.
        Raises ``CustomException`` when a probability lies outside [0, 1].
        """
        if not prob:
            return dict.fromkeys(self.alphabet, 0)
        if set(prob) != self.alphabet and self.verbose:
            print("WARNING: Missing/Invalid letter(s) in %s probability!\n"
                  "Probabilities are set to zero for missing letters! "
                  "Invalid letters are ignored!" % label.lower())
        normalized = {}
        for letter, value in prob.items():
            if not 0 <= value <= 1:
                raise CustomException(
                    "ERROR: %s probability must be >=0 and <=1!" % label)
            normalized[letter] = value
        for letter in self.alphabet - set(normalized):
            normalized[letter] = 0
        return normalized

    def _substitute(self, progeny, site, base):
        """Append a letter from the mutation alphabet instead of *base*."""
        progeny.append(random.choice(self.mualphabet.get(base)))
        self.occuredmu.append(site)
        self.alt_allele.append(progeny[-1])

    def _insert_after(self, progeny, site, base):
        """Append *base* followed by a random letter of the alphabet."""
        progeny.append(base)
        progeny.append(random.choice(list(self.alphabet)))
        self.occuredins.append(site)
        self.inserted_allele.append(base + progeny[-1])

    def _random_change(self, progeny, site, base):
        """Copy *base* to *progeny*, possibly altered by a random error."""
        # An error occurs randomly: insertion, deletion or (twice as likely
        # to be drawn) substitution.
        error = random.choice(['ins', 'del', 'sub', 'sub'])
        # Whether the error survives is decided by insertprob, deleteprob
        # and muprob; the draw is truncated to five decimals.
        rnd = float(int(random.random() * 100000)) / 100000
        table = {'ins': self.insertprob,
                 'del': self.deleteprob,
                 'sub': self.muprob}[error]
        if rnd < table.get(base, 0):
            if error == 'sub':
                self._substitute(progeny, site, base)
            elif error == 'ins':
                self._insert_after(progeny, site, base)
            else:
                # Deletion: the letter is simply not copied.
                self.occureddel.append(site)
        else:
            progeny.append(base)

    @staticmethod
    def _sorted_by_site(sites, alleles):
        """Return *sites* and *alleles* as lists ordered by ascending site."""
        if not sites:
            return [], []
        pairs = sorted(zip(sites, alleles), key=lambda pair: pair[0])
        return [site for site, _ in pairs], [allele for _, allele in pairs]

    def _finish(self, progeny):
        """Store *progeny* as the haplotype and sort the change history."""
        self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
        self.occuredins, self.inserted_allele = self._sorted_by_site(
            self.occuredins, self.inserted_allele)
        self.occuredmu, self.alt_allele = self._sorted_by_site(
            self.occuredmu, self.alt_allele)
        self.occureddel.sort()

    # ------------------------------------------------------------------
    # Access methods
    # ------------------------------------------------------------------
    def get_hap(self):
        return self.seq

    def get_ref(self):
        return self.ref

    def get_insertprob(self):
        return self.insertprob

    def get_deleteprob(self):
        return self.deleteprob

    def get_muprob(self):
        return self.muprob

    def get_mualphabet(self):
        return self.mualphabet

    def get_mupos(self):
        return self.mupos

    def get_inpos(self):
        return self.inpos

    def get_delpos(self):
        return self.delpos

    def get_occureddelpos(self):
        return self.occureddel

    def get_occuredmupos(self):
        return self.occuredmu

    def get_occuredinspos(self):
        return self.occuredins

    def get_ins_allele(self):
        return self.inserted_allele

    def get_mu_allele(self):
        return self.alt_allele

    # ------------------------------------------------------------------
    # Modifier methods
    # ------------------------------------------------------------------
    def set_ref(self, ref):
        """Change the reference sequence of the MuGen object.

        The new reference must use only letters of the current alphabet and
        must be long enough for the stored indel and mutation positions;
        otherwise the failure is printed and the reference is left
        unchanged.  Useful when the reference is a mutable entity that is
        constantly changed by other methods and classes.
        """
        try:
            new_ref = str(ref)
            if not set(new_ref).issubset(self.alphabet):
                raise CustomException(
                    "ERROR: the new reference is not compatible with the "
                    "current alphabet!")
            valid_sites = set(range(len(new_ref)))
            for label, positions in (("Mutation", self.mupos),
                                     ("Insertion", self.inpos),
                                     ("Deletion", self.delpos)):
                if not set(positions).issubset(valid_sites):
                    raise CustomException(
                        "ERROR: %s positions exceed the range of the new "
                        "reference!" % label)
            self.ref = new_ref
        except CustomException as instance:
            print("Failed to update the reference!")
            print(instance)
        except Exception:
            print("Failed to update the reference!")
            raise
        else:
            if self.verbose:
                print("The reference sequence has been updated!")

    def set_pos(self, inpos=None, delpos=None, mupos=None, ):
        """Change the insertion, deletion and substitution sites.

        An argument left at ``None`` keeps the current positions.  All given
        arguments are validated before any of them is stored; on failure the
        error is printed and nothing changes.  Useful when posmu and probmu
        are called repeatedly.
        """
        try:
            new_del = new_in = new_mu = None
            if delpos is not None:
                new_del = self._checked_positions(delpos, "New deletion")
            if inpos is not None:
                new_in = self._checked_positions(inpos, "New insertion")
            if mupos is not None:
                new_mu = self._checked_positions(mupos, "New mutation")
            if new_del is not None:
                self.delpos = new_del
            if new_in is not None:
                self.inpos = new_in
            if new_mu is not None:
                self.mupos = new_mu
        except CustomException as instance:
            print("Failed to update indel and mutation positions!")
            print(instance)
        except Exception:
            print("Failed to update indel and mutation positions!")
            raise
        else:
            if self.verbose:
                print("Indel and mutation positions updated!")

    def set_prob(self, insertprob=None, deleteprob=None, muprob=None):
        """Change the insertion, deletion and mutation probabilities.

        An argument left at ``None`` keeps the current probabilities; an
        empty dictionary resets them to 0 for every letter.  All given
        arguments are validated before any of them is stored; on failure the
        error is printed and nothing changes.  Useful when posmu and probmu
        are called repeatedly.
        """
        try:
            new_insert = new_delete = new_mu = None
            if insertprob is not None:
                new_insert = self._normalized_prob(insertprob, "Insertion")
            if deleteprob is not None:
                new_delete = self._normalized_prob(deleteprob, "Deletion")
            if muprob is not None:
                new_mu = self._normalized_prob(muprob, "Mutation")
            if new_delete is not None:
                self.deleteprob = new_delete
            if new_mu is not None:
                self.muprob = new_mu
            if new_insert is not None:
                self.insertprob = new_insert
        except CustomException as instance:
            print(instance)
            print("Failed to update indel and mutation probabilities!")
        except Exception:
            print("Failed to update indel and mutation probabilities!")
            raise
        else:
            if self.verbose:
                print("Indel and mutation probabilities successfully "
                      "updated!")

    def set_mualphabet(self, mualphabet=None):
        """Change the mutation alphabet of the MuGen object.

        On an invalid *mualphabet* the error is printed and the current
        mutation alphabet is kept.  Useful when posmu and probmu are called
        repeatedly.
        """
        try:
            self.mualphabet = self._resolved_mualphabet(mualphabet)
        except CustomException as instance:
            print(instance)
            print("Mualphabet could not be updated!")
        except Exception:
            print("Mualphabet could not be updated!")
            raise
        else:
            if self.verbose:
                print("Mualphabet successfully updated!")

    # ------------------------------------------------------------------
    # Mutation methods
    # ------------------------------------------------------------------
    def probmu(self):
        """Build the haplotype by making random changes to the reference.

        Every site that is not listed in the indel/mutation positions may
        change according to the probabilities given to MuGen; the listed
        sites are copied unchanged.  The result is stored on the object
        (see ``get_hap()``); nothing is returned.
        """
        self._reset_history()
        reserved = set(self.mupos) | set(self.inpos) | set(self.delpos)
        progeny = []
        for site, base in enumerate(self.ref):
            if site in reserved:
                progeny.append(base)
            else:
                self._random_change(progeny, site, base)
        self._finish(progeny)
        if self.verbose:
            print("WARNING: If indel/mutation positions are specified, "
                  "MuGen.probmu() makes no change at those sites. \n "
                  "Use MuGen.posmu() or Mugen.hapchanger() to apply changes "
                  "at those sites!")
            print("Changes made to the haplotype!")

    def posmu(self):
        """Build the haplotype by changing the reference at given positions.

        Only the indel and mutation positions already given to MuGen are
        changed.  When a position is listed more than once the priority is
        1) mutation 2) deletion 3) insertion.  The result is stored on the
        object (see ``get_hap()``); nothing is returned.
        """
        self._reset_history()
        # Later assignments overwrite earlier ones, which yields the
        # documented priority.
        planned = [None] * len(self.ref)
        for site in self.inpos:
            planned[site] = 'ins'
        for site in self.delpos:
            planned[site] = 'del'
        for site in self.mupos:
            planned[site] = 'sub'
        progeny = []
        for site, (base, error) in enumerate(zip(self.ref, planned)):
            if error is None:
                progeny.append(base)
            elif error == 'sub':
                self._substitute(progeny, site, base)
            elif error == 'ins':
                self._insert_after(progeny, site, base)
            else:
                # Deletion: the letter is simply not copied.
                self.occureddel.append(site)
        self._finish(progeny)
        if self.verbose:
            print("WARNING: if there are overlaps betweeen deletion, "
                  "insertion and mutation positions, \n "
                  "just one of the changes takes place with the following "
                  "priority: \n 1)Mutation 2)Deletion 3)Insertion. \n")
            print("Changes made to the haplotype!")

    def hapchanger(self):
        """Build the haplotype from both specified and random changes.

        Sites listed in the mutation, insertion or deletion positions are
        changed accordingly; every other site may change at random according
        to the probabilities given to MuGen.  The result is stored on the
        object (see ``get_hap()``); nothing is returned.
        """
        self._reset_history()
        forced_mu = set(self.mupos)
        forced_ins = set(self.inpos)
        forced_del = set(self.delpos)
        progeny = []
        for site, base in enumerate(self.ref):
            # NOTE: for overlapping positions the priority here is mutation,
            # insertion, deletion -- unlike posmu().
            if site in forced_mu:
                self._substitute(progeny, site, base)
            elif site in forced_ins:
                self._insert_after(progeny, site, base)
            elif site in forced_del:
                self.occureddel.append(site)
            else:
                self._random_change(progeny, site, base)
        self._finish(progeny)
        if self.verbose:
            print("Changes made to the haplotype!")
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(CodonTable.unambiguous_dna_by_id[2].start_codons)
print(CodonTable.unambiguous_dna_by_id[1].forward_table['ACG'])  # which amino acid this codon encodes

# Comparing sequences
seq1 = Seq('ACGT', IUPAC.unambiguous_dna)
seq2 = Seq('ACGT', IUPAC.unambiguous_dna)
seq3 = Seq('ACGT', IUPAC.protein)
print(id(seq1) == id(seq2))    # identity check; original note: "seq1 == seq2 look for the same object"
print(str(seq1) == str(seq2))  # compare the letters by converting to strings
print(str(seq1) == str(seq3))  # as strings, the DNA and protein sequences hold the same letters

# MutableSeq
from Bio.Seq import MutableSeq
mutseq = seq1.tomutable()  # convert to MutableSeq
print("%s %s" % (mutseq, type(mutseq)))
mutSeq = MutableSeq('CGTTTAAGCTGC', IUPAC.unambiguous_dna)
print("%s %s" % (mutSeq, type(mutSeq)))
mutseq[1] = 'T'  # impossible on a plain Seq
print(mutseq)
seq1 = mutseq.toseq()  # convert back to Seq
mutSeq.remove('A')  # remove the first A
mutSeq[2:-5] = 'TTTT'
mutSeq.reverse()  # reverse() and reverse_complement() change the object itself
print(mutSeq)
# A MutableSeq can't be a dictionary key; Seq and string can

# UnknownSeq
# Subclass of Seq for when you know the length but not the characters
# (saves memory)
from Bio.Seq import UnknownSeq
unk = UnknownSeq(25)
print("%s %s %s" % (unk, len(unk), type(unk)))