def test_codons(self):
    """PositionalBaseUsage codons should give expected codon freqs

    Checks a uniform base composition (every codon at 1/64) and a skewed
    composition whose expected codon frequencies are the products of the
    per-position base frequencies.
    """
    # One of each base at every position: all 64 codons come out at 1/64.
    uniform = CodonUsage('UUUCCCAAAGGG').positionalBases().codons()
    self.assertEqual(len(uniform), 64)
    for codon in uniform:
        self.assertFloatEqual(uniform[codon], 1.0 / 64)

    # Two bases at each position, split 3:1 between them.
    major, minor = 0.75, 0.25
    skewed = CodonUsage('UCGAGUUCGUCG').positionalBases().codons()
    expected = {
        'UCG': major * major * major,
        'UCU': major * major * minor,
        'UGG': major * minor * major,
        'UGU': major * minor * minor,
        'ACG': minor * major * major,
        'ACU': minor * major * minor,
        'AGG': minor * minor * major,
        'AGU': minor * minor * minor,
    }
    for codon in skewed:
        if codon not in expected:
            # Any codon using a base absent at some position must be 0.
            self.assertEqual(skewed[codon], 0)
            continue
        self.assertFloatEqual(skewed[codon], expected[codon])
def test_add(self):
    """CodonUsage add should sum two base usages

    Covers both the binary ``+`` form and the in-place ``+=`` form.
    """
    left = CodonUsage('UUU')
    right = CodonUsage('CCC')
    # Binary addition yields the combined usage.
    self.assertEqual(left + right, CodonUsage('UUUCCC'))
    # In-place addition updates the left operand to the same total.
    left += right
    self.assertEqual(left, CodonUsage('UUUCCC'))
def test_aminoAcids(self):
    """PositionalBaseUsage aminoAcids should return correct amino acids

    First compares against hand-calculated values for a skewed sequence,
    then checks that a uniform base composition gives each amino acid a
    frequency of (number of codons for it) / 64 under two genetic codes.
    """
    # Hand-calculated values for a sequence with a 3:1 split per position.
    major, minor = 0.75, 0.25
    skewed = CodonUsage('UCGAGUUCGUCG').positionalBases().aminoAcids()
    expected = {
        'S': major * major * major + major * major * minor
             + minor * minor * minor,
        'W': major * minor * major,
        'C': major * minor * minor,
        'T': minor * major * major + minor * major * minor,
        'R': minor * minor * major,
    }
    for aa in skewed:
        if aa not in expected:
            self.assertEqual(skewed[aa], 0)
            continue
        self.assertFloatEqual(skewed[aa], expected[aa])

    # Unbiased base frequencies, default genetic code.
    uniform = CodonUsage('UUUCCCAAAGGG')
    default_freqs = uniform.positionalBases().aminoAcids()
    default_code = GeneticCodes[1]
    for aa in default_freqs:
        self.assertEqual(default_freqs[aa], len(default_code[aa]) / 64.0)

    # Same composition with an explicitly supplied alternative code.
    alt_code = GeneticCodes[2]
    alt_freqs = uniform.positionalBases().aminoAcids(alt_code)
    self.assertNotEqual(default_freqs, alt_freqs)
    for aa in alt_freqs:
        self.assertEqual(alt_freqs[aa], len(alt_code[aa]) / 64.0)
def test_positionalBases(self):
    """CodonUsage bases should count bases at each position correctly

    Exercises the unpurged per-position counts, the purged counts under
    the initial genetic code, and the purged counts after assigning a
    different genetic code to the same CodonUsage object.
    """
    freqs = {'UUC': 5, 'AUA': 10, 'AUG': 10, 'CGC': 3, 'AGG': 2, 'XYZ': 8,
             'UAA': 2, 'UGA': 1}
    u = CodonUsage(freqs)
    b = u.positionalBases()
    # Use a TestCase assertion rather than a bare ``assert`` so the check
    # is not compiled away when the suite runs under ``python -O``.
    self.assertTrue(isinstance(b, PositionalBaseUsage))
    first, second, third = b
    self.assertEqual(first, BaseUsage({'U': 8, 'C': 3, 'A': 22, 'X': 8}))
    self.assertEqual(second,
                     BaseUsage({'U': 25, 'C': 0, 'A': 2, 'G': 6, 'Y': 8}))
    self.assertEqual(third, BaseUsage({'C': 8, 'A': 13, 'G': 12, 'Z': 8}))

    # check that it also works when we purge
    p = u.positionalBases(purge_unwanted=True)
    first, second, third = p
    self.assertEqual(first, BaseUsage({'U': 5, 'C': 3, 'A': 2}))
    self.assertEqual(second, BaseUsage({'U': 5, 'G': 5}))
    self.assertEqual(third, BaseUsage({'C': 8, 'G': 2}))

    # check that it also works with a different genetic code, and,
    # incidentally, that the purging didn't affect the original object
    u.GeneticCode = GeneticCodes[2]  # mt code: different stop codons
    p = u.positionalBases(purge_unwanted=True)
    first, second, third = p
    self.assertEqual(first, BaseUsage({'U': 6, 'C': 3, 'A': 20}))
    self.assertEqual(second, BaseUsage({'U': 25, 'G': 4}))
    self.assertEqual(third, BaseUsage({'C': 8, 'A': 11, 'G': 10}))
def test_rscu(self):
    """CodonUsage rscu should calculate synonymous usage correctly

    ``rscu()`` is called for its effect on the object itself; the values
    are then read back from the same CodonUsage.
    """
    usage = CodonUsage({'UUU': 3, 'UUC': 1, 'ACA': 1})
    usage.rscu()
    # (codon, expected value after rscu) in the order they are checked.
    expectations = (
        ('UUU', 0.75),
        ('UUC', 0.25),
        ('ACA', 1),
        ('GGG', 0),
    )
    for codon, value in expectations:
        self.assertEqual(usage[codon], value)
def test_rscu(self):
    """CodonUsage rscu should calculate synonymous usage correctly

    The object is transformed by ``rscu()`` and then inspected codon by
    codon, including one codon that was never counted.
    """
    counts = CodonUsage({'UUU': 3, 'UUC': 1, 'ACA': 1})
    counts.rscu()
    # UUU and UUC share the 3:1 split; ACA is alone; GGG was never seen.
    for codon, wanted in (('UUU', 0.75), ('UUC', 0.25),
                          ('ACA', 1), ('GGG', 0)):
        self.assertEqual(counts[codon], wanted)
def test_bases(self):
    """CodonUsage bases should count bases correctly

    Without purging, the base counts match a BaseUsage built from the
    same string; with ``purge_unwanted=True`` the expected result no
    longer includes the ``UAG`` codon or the trailing ``AA``.
    """
    usage = CodonUsage('UUUCCCUAGCCCGGGAA')
    all_bases = usage.bases()
    self.assertEqual(all_bases, BaseUsage('UUUCCCUAGCCCGGGAA'))

    # purge_unwanted should get rid of bad codons
    purged_bases = usage.bases(purge_unwanted=True)
    self.assertEqual(purged_bases, BaseUsage('UUUCCCCCCGGG'))
def test_pr2bias(self):
    """CodonUsage pr2bias should give correct ratios.

    Checks that equal base frequencies give six ratios of 0.5, then
    checks hand-calculated ratios for an ``AC`` block with unequal
    third-position counts.
    """
    c = EqualBases.codons()
    b = c.pr2bias('UU')
    self.assertEqual(len(b), 6)
    self.assertEqual(b, tuple([.5] * 6))

    # Third-position counts within the AC block: U=10, C=5, A=15, G=20.
    c = CodonUsage()
    c['ACU'] = 10
    c['ACC'] = 5
    c['ACA'] = 15
    c['ACG'] = 20
    # Float literals keep the expected values correct even where ``/`` on
    # two ints truncates (Python 2 without ``from __future__ import
    # division``); under true division they equal the int/int forms.
    expected = (
        20.0 / 25,  # G / (G + C)
        15.0 / 25,  # A / (A + U)
        20.0 / 35,  # G / (G + A)
        5.0 / 15,   # C / (C + U)
        20.0 / 30,  # G / (G + U)
        5.0 / 20,   # C / (C + A)
    )
    self.assertEqual(c.pr2bias('AC'), expected)
def test_pr2bias(self):
    """CodonUsage pr2bias should give correct ratios.

    Equal base frequencies must yield six ratios of 0.5; an ``AC`` block
    with unequal third-position counts must yield the hand-calculated
    ratios listed below.
    """
    c = EqualBases.codons()
    b = c.pr2bias('UU')
    self.assertEqual(len(b), 6)
    self.assertEqual(b, tuple([.5] * 6))

    # Third-position counts within the AC block: U=10, C=5, A=15, G=20.
    c = CodonUsage()
    c['ACU'] = 10
    c['ACC'] = 5
    c['ACA'] = 15
    c['ACG'] = 20
    # Written with float numerators so the expected tuple does not collapse
    # to zeros where int / int truncates (Python 2 without
    # ``from __future__ import division``); identical under true division.
    expected = (
        20.0 / 25,  # G / (G + C)
        15.0 / 25,  # A / (A + U)
        20.0 / 35,  # G / (G + A)
        5.0 / 15,   # C / (C + U)
        20.0 / 30,  # G / (G + U)
        5.0 / 20,   # C / (C + A)
    )
    self.assertEqual(c.pr2bias('AC'), expected)
def test_aminoAcids(self):
    """CodonUsage aminoAcids should correctly count amino acids

    Also checks that the Info value given at construction is kept on the
    CodonUsage and carried over to the resulting AminoAcidUsage, and that
    the genetic code can be changed via the attribute or per call.
    """
    freqs = {'UUC': 5, 'AUA': 10, 'AUG': 10, 'CGC': 3, 'AGG': 2, 'XYZ': 8,
             'UAA': 2, 'UGA': 1}
    u = CodonUsage(freqs, "test")
    self.assertEqual(u.Info, 'test')

    # Every supplied codon keeps its count; everything else is zero.
    for codon, count in u.items():
        if codon not in freqs:
            self.assertEqual(count, 0)
            continue
        self.assertEqual(count, freqs[codon])

    observed = u.aminoAcids()
    self.assertEqual(
        observed,
        AminoAcidUsage({'F': 5, 'I': 10, 'M': 10, 'R': 5, '*': 3, 'X': 8}))

    # check that it works with a different genetic code
    u.GeneticCode = GeneticCodes['2']
    observed = u.aminoAcids()
    self.assertEqual(
        observed,
        AminoAcidUsage({'F': 5, 'I': 0, 'M': 20, 'R': 3, '*': 4, 'W': 1,
                        'X': 8}))

    # check that it works if a genetic code is supplied explicitly
    u.GeneticCode = GeneticCodes[1]
    observed = u.aminoAcids()
    self.assertEqual(
        observed,
        AminoAcidUsage({'F': 5, 'I': 10, 'M': 10, 'R': 5, '*': 3, 'X': 8}))
    observed_code_2 = u.aminoAcids(2)
    self.assertEqual(
        observed_code_2,
        AminoAcidUsage({'F': 5, 'I': 0, 'M': 20, 'R': 3, '*': 4, 'W': 1,
                        'X': 8}))

    # check that we held onto the info object through the above
    self.assertEqual(observed_code_2.Info, 'test')
def consolidate(usages):
    """Sums frequencies of a list of usages into one usage.

    usages: iterable of objects that can be added in place (``+=``) to a
    CodonUsage.

    Returns a new CodonUsage holding the accumulated total, on which
    ``normalize()`` has been called before it is returned.
    """
    total = CodonUsage()
    for usage in usages:
        total += usage
    total.normalize()
    return total
def test_getitem(self):
    """CodonUsage should allow lookup as RNA or DNA, case-insensitive

    Every three-letter combination over the RNA, DNA and lower-case
    alphabets must be a valid key on an empty CodonUsage and map to 0.
    """
    u = CodonUsage()
    for alphabet in ('UCAG', 'TCAG', 'ucag'):
        for first in alphabet:
            for second in alphabet:
                for third in alphabet:
                    self.assertEqual(u[first + second + third], 0)
def test_init_empty(self):
    """Empty CodonUsage init should have 64 codons, all 0

    Also verifies that the genetic code of a freshly created CodonUsage
    is the very same object as ``GeneticCodes[1]``.
    """
    u = CodonUsage()
    self.assertEqual(len(u), 64)
    for i in u:
        self.assertEqual(u[i], 0)
    # check that the genetic code is the default; a TestCase assertion is
    # used instead of a bare ``assert`` so the identity check still runs
    # under ``python -O``.
    self.assertTrue(u.GeneticCode is GeneticCodes[1])
def test_aminoAcids(self):
    """PositionalBaseUsage aminoAcids should return correct amino acids

    Part one uses hand-calculated frequencies for a skewed sequence; part
    two checks that uniform base frequencies give each amino acid
    (codons for that amino acid) / 64 under two different genetic codes.
    """
    # check hand-calculated values on a particular sequence
    hi, lo = 0.75, 0.25
    observed = CodonUsage('UCGAGUUCGUCG').positionalBases().aminoAcids()
    by_hand = {
        'S': hi * hi * hi + hi * hi * lo + lo * lo * lo,
        'W': hi * lo * hi,
        'C': hi * lo * lo,
        'T': lo * hi * hi + lo * hi * lo,
        'R': lo * lo * hi,
    }
    for residue in observed:
        if residue in by_hand:
            self.assertFloatEqual(observed[residue], by_hand[residue])
        else:
            self.assertEqual(observed[residue], 0)

    # test for unbiased freqs on a couple of different genetic codes
    unbiased = CodonUsage('UUUCCCAAAGGG')
    with_default = unbiased.positionalBases().aminoAcids()
    code_1 = GeneticCodes[1]
    for residue in with_default:
        self.assertEqual(with_default[residue],
                         len(code_1[residue]) / 64.0)

    code_2 = GeneticCodes[2]
    with_code_2 = unbiased.positionalBases().aminoAcids(code_2)
    self.assertNotEqual(with_default, with_code_2)
    for residue in with_code_2:
        self.assertEqual(with_code_2[residue],
                         len(code_2[residue]) / 64.0)
def test_aminoAcids(self):
    """BaseUsage aminoAcids should give the same results as the codons

    The reference CodonUsage is built from products of 0.6 (for A) and
    0.4 (for U) per codon position; the BaseUsage under test is created
    from counts ``a``=3, ``T``=2 and ``X``=1.
    """
    freq_a, freq_u = .6, .4
    known_data = {
        'AAA': freq_a * freq_a * freq_a,
        'AAU': freq_a * freq_a * freq_u,
        'AUA': freq_a * freq_u * freq_a,
        'AUU': freq_a * freq_u * freq_u,
        'UAA': freq_u * freq_a * freq_a,
        'UAU': freq_u * freq_a * freq_u,
        'UUA': freq_u * freq_u * freq_a,
        'UUU': freq_u * freq_u * freq_u,
    }
    reference = CodonUsage(known_data)
    bases = BaseUsage({'a': 3, 'T': 2, 'X': 1})
    self.assertEqual(bases.aminoAcids(), reference.aminoAcids())

    # check that the genetic code is passed through correctly: with a code
    # that maps every codon to G, all frequency must land on G.
    everything_is_g = GeneticCode('G' * 64)
    self.assertEqual(bases.aminoAcids(everything_is_g),
                     AminoAcidUsage({'G': 1}))
def test_aminoAcids(self):
    """BaseUsage aminoAcids should give the same results as the codons

    Compares a BaseUsage against a CodonUsage whose codon frequencies are
    the per-position products of 0.6 (A) and 0.4 (U), then checks that an
    explicitly supplied genetic code is honoured.
    """
    p_a, p_u = .6, .4
    codon_freqs = {
        'AAA': p_a * p_a * p_a,
        'AAU': p_a * p_a * p_u,
        'AUA': p_a * p_u * p_a,
        'AUU': p_a * p_u * p_u,
        'UAA': p_u * p_a * p_a,
        'UAU': p_u * p_a * p_u,
        'UUA': p_u * p_u * p_a,
        'UUU': p_u * p_u * p_u,
    }
    from_codons = CodonUsage(codon_freqs)
    from_bases = BaseUsage({'a': 3, 'T': 2, 'X': 1})
    self.assertEqual(from_bases.aminoAcids(), from_codons.aminoAcids())

    # check that the genetic code is passed through correctly
    all_g = GeneticCode('G' * 64)
    self.assertEqual(from_bases.aminoAcids(all_g),
                     AminoAcidUsage({'G': 1}))
def test_fingerprint(self):
    """CodonUsage fingerprint should give correct ratios.

    Covers the default call (eight blocks plus a mean entry), omitting
    the mean, the ``'all'`` and ``'split'`` block selections, and an
    empty CodonUsage.
    """
    usage = EqualBases.codons()

    # Default: eight identical block entries followed by the mean entry.
    result = usage.fingerprint()
    self.assertEqual(len(result), 9)
    self.assertEqual(
        result,
        [[.5, .5, .125] for _ in range(8)] + [[.5, .5, 1]])

    # should be able to omit mean...
    result = usage.fingerprint(include_mean=False)
    self.assertEqual(result, [[.5, .5, .125] for _ in range(8)])

    # ...or use all doublets
    result = usage.fingerprint(include_mean=False, which_blocks='all')
    self.assertEqual(len(result), 16)

    # ...or do just the non-quartet ones
    result = usage.fingerprint(include_mean=False, which_blocks='split')
    self.assertEqual(len(result), 6)

    # check that it doesn't fail on an empty codon usage
    usage = CodonUsage('')
    result = usage.fingerprint()
    self.assertEqual(result[0], [0.5, 0.5, 0])
def test_fingerprint(self):
    """CodonUsage fingerprint should give correct ratios.

    With equal base frequencies each of the eight default blocks is
    ``[.5, .5, .125]`` and the trailing mean entry is ``[.5, .5, 1]``;
    the other block selections are checked by length only.
    """
    equal = EqualBases.codons()

    with_mean = equal.fingerprint()
    self.assertEqual(len(with_mean), 9)
    self.assertEqual(
        with_mean,
        [[.5, .5, .125] for _ in range(8)] + [[.5, .5, 1]])

    # should be able to omit mean...
    without_mean = equal.fingerprint(include_mean=False)
    self.assertEqual(without_mean, [[.5, .5, .125] for _ in range(8)])

    # ...or use all doublets
    all_blocks = equal.fingerprint(include_mean=False, which_blocks='all')
    self.assertEqual(len(all_blocks), 16)

    # ...or do just the non-quartet ones
    split_blocks = equal.fingerprint(include_mean=False,
                                     which_blocks='split')
    self.assertEqual(len(split_blocks), 6)

    # check that it doesn't fail on an empty codon usage
    empty = CodonUsage('')
    empty_result = empty.fingerprint()
    self.assertEqual(empty_result[0], [0.5, 0.5, 0])
def test_codons(self):
    """PositionalBaseUsage codons should give expected codon freqs

    A uniform composition must give all 64 codons a frequency of 1/64; a
    composition with two bases per position must give each codon the
    product of its per-position base frequencies and zero elsewhere.
    """
    # one of each base should give freqs of 1/64 for everything
    source = CodonUsage('UUUCCCAAAGGG')
    positional = source.positionalBases()
    observed = positional.codons()
    self.assertEqual(len(observed), 64)
    for codon in observed:
        self.assertFloatEqual(observed[codon], 1.0 / 64)

    # two bases at each position should give correct freqs
    hi, lo = 0.75, 0.25
    source = CodonUsage('UCGAGUUCGUCG')
    observed = source.positionalBases().codons()
    by_hand = {
        'UCG': hi * hi * hi,
        'UCU': hi * hi * lo,
        'UGG': hi * lo * hi,
        'UGU': hi * lo * lo,
        'ACG': lo * hi * hi,
        'ACU': lo * hi * lo,
        'AGG': lo * lo * hi,
        'AGU': lo * lo * lo,
    }
    for codon in observed:
        if codon in by_hand:
            self.assertFloatEqual(observed[codon], by_hand[codon])
        else:
            self.assertEqual(observed[codon], 0)
def adapt_pr2_bias(codon_usages, block='GC', bin_lowbound=0.0, bin_upbound=1.0,
                   binwidth=0.1):
    """Returns the bin midpoint and the PR2 biases for each bin of GC3.

    codon_usages: usages handed to ``bin_by_p3`` for binning.
    block: passed to ``CodonUsage.pr2bias`` for the pooled usage of a bin.
    bin_lowbound, bin_upbound, binwidth: passed to ``bin_by_p3``; the lower
        bound and width are also used to compute each bin's midpoint.

    Returns an array with one row per usable bin: the bin midpoint followed
    by the values returned by ``pr2bias``. Empty bins, and bins for which
    the calculation raises ZeroDivisionError or FloatingPointError, are
    left out.
    """
    rows = []
    binned = bin_by_p3(codon_usages, bin_lowbound, bin_upbound, binwidth)
    for bin_index, members in enumerate(binned):
        if not members:
            continue
        try:
            # Pool every usage in the bin before computing its bias.
            pooled = CodonUsage()
            for usage in members:
                pooled += usage
            biases = pooled.pr2bias(block)
            midpoint = bin_lowbound + (bin_index + 0.5) * binwidth
            rows.append([midpoint] + list(biases))
        except (ZeroDivisionError, FloatingPointError):
            # Deliberately best-effort: a bin whose bias cannot be
            # computed contributes no row.
            continue
    return array(rows)
def kegg_fasta_to_codon_list(lines):
    """Reads list of CodonUsage objects from KEGG-format FASTA file.

    lines: input handed to ``MinimalFastaParser``, which yields
    (label, sequence) pairs.

    The first whitespace-separated word of each label must split on ':'
    into exactly two parts, stored as 'SpeciesAbbreviation' and 'GeneId'.
    If more words follow, a second word ending in ';' is stored (without
    the ';') as 'Gene' and the rest as 'Description'; otherwise all
    remaining words form the 'Description'.

    Returns a list of CodonUsage objects, each built from the upper-cased
    sequence, carrying the collected fields both as ``Info`` and as
    instance attributes.
    """
    codon_usages = []
    for label, seq in MinimalFastaParser(lines):
        seq = seq.upper()
        words = label.split()
        species, gene_id = words[0].split(':')
        info = {'SpeciesAbbreviation': species, 'GeneId': gene_id}

        annotation = words[1:]
        if annotation:  # additional annotation
            leading = annotation[0]
            if leading.endswith(';'):  # gene label
                info['Gene'] = leading[:-1]
                info['Description'] = ' '.join(annotation[1:])
            else:
                info['Description'] = ' '.join(annotation)

        usage = CodonUsage(seq_to_codon_dict(seq), Info=info)
        # Expose the parsed fields directly as attributes as well.
        usage.__dict__.update(info)
        codon_usages.append(usage)
    return codon_usages
def adapt_fingerprint(codon_usages, which_blocks='quartets',
                      include_mean=True, normalize=True):
    """Sums CodonUsage objects and returns the fingerprint of the total.

    Takes a sequence of CodonUsage objects, adds them all into a single
    CodonUsage, and returns the result of calling ``fingerprint`` on it,
    i.e. the data for a fingerprint plot with:
        x: the g3/(g3+c3)
        y: the a3/(a3+u3)
        frequency: total of the base/total of all

    With the default ``which_blocks='quartets'`` the entries are in the
    order: alanine, arginine4, glycine, leucine4, proline, serine4,
    threonine, valine.

    codon_usages: list of CodonUsage objects
    which_blocks: which codon blocks to include; passed unchanged to
        ``CodonUsage.fingerprint`` (default 'quartets')
    include_mean: include a point for the mean in the result (True)
    normalize: ensure the frequencies returned sum to 1 (True)
    """
    tot_codon_usage = CodonUsage()
    # Only the usages themselves are needed, not their positions.
    for usage in codon_usages:
        tot_codon_usage += usage
    return tot_codon_usage.fingerprint(which_blocks=which_blocks,
                                       include_mean=include_mean,
                                       normalize=normalize)
def test_positionalBases(self):
    """CodonUsage bases should count bases at each position correctly

    Checks the per-position base counts without purging, with purging
    under the initial genetic code, and with purging after a different
    genetic code has been assigned to the same object.
    """
    freqs = {
        'UUC': 5,
        'AUA': 10,
        'AUG': 10,
        'CGC': 3,
        'AGG': 2,
        'XYZ': 8,
        'UAA': 2,
        'UGA': 1,
    }
    u = CodonUsage(freqs)
    b = u.positionalBases()
    # A bare ``assert`` is removed under ``python -O``; the TestCase
    # assertion keeps the type check active in every run mode.
    self.assertTrue(isinstance(b, PositionalBaseUsage))
    first, second, third = b
    self.assertEqual(first, BaseUsage({'U': 8, 'C': 3, 'A': 22, 'X': 8}))
    self.assertEqual(second,
                     BaseUsage({'U': 25, 'C': 0, 'A': 2, 'G': 6, 'Y': 8}))
    self.assertEqual(third, BaseUsage({'C': 8, 'A': 13, 'G': 12, 'Z': 8}))

    # check that it also works when we purge
    p = u.positionalBases(purge_unwanted=True)
    first, second, third = p
    self.assertEqual(first, BaseUsage({'U': 5, 'C': 3, 'A': 2}))
    self.assertEqual(second, BaseUsage({'U': 5, 'G': 5}))
    self.assertEqual(third, BaseUsage({'C': 8, 'G': 2}))

    # check that it also works with a different genetic code, and,
    # incidentally, that the purging didn't affect the original object
    u.GeneticCode = GeneticCodes[2]  # mt code: different stop codons
    p = u.positionalBases(purge_unwanted=True)
    first, second, third = p
    self.assertEqual(first, BaseUsage({'U': 6, 'C': 3, 'A': 20}))
    self.assertEqual(second, BaseUsage({'U': 25, 'G': 4}))
    self.assertEqual(third, BaseUsage({'C': 8, 'A': 11, 'G': 10}))
def test_codons(self):
    """AminoAcidUsage codons should return most likely codon freqs

    Covers the default genetic code with equal codon weights, a
    nonstandard genetic code, and a caller-supplied unequal codon usage.
    """
    # (amino acid string, codon string whose normalized usage is expected)
    default_code_cases = (
        ('GGG', 'GGUGGCGGAGGG'),
        ('D', 'GAUGAC'),
        ('GDDFMM',
         'GGUGGCGGAGGG' + 'GAUGAC' * 4 + 'UUUUUC' * 2 + 'AUG' * 8),
        ('II*', 'AUUAUCAUA' * 2 + 'UAAUAGUGA'),
    )
    for amino_acids, codon_seq in default_code_cases:
        aa_usage = AminoAcidUsage(amino_acids)
        expected = CodonUsage(codon_seq)
        expected.normalize()
        self.assertEqual(aa_usage.codons(), expected)

    # check that it works with a nonstandard code
    code = GeneticCode('A' * 4 + 'C' * 28 + 'G' * 32)
    aa_usage = AminoAcidUsage('AAA')
    expected = CodonUsage('UUUUUCUUAUUG')
    expected.normalize()
    self.assertEqual(aa_usage.codons(code), expected)

    # check that it works with unequal codon frequencies
    unequal = CodonUsage({'GGU': 5, 'GGC': 2, 'GGA': 2, 'GGG': 1,
                          'UUU': 3, 'UUC': 1})
    aa_usage = AminoAcidUsage('GFFF')
    expected_freqs = {
        'GGU': 0.5 * 0.25,
        'GGC': 0.2 * 0.25,
        'GGA': 0.2 * 0.25,
        'GGG': 0.1 * 0.25,
        'UUU': 0.75 * 0.75,
        'UUC': 0.25 * 0.75,
    }
    observed = aa_usage.codons(codon_usage=unequal)
    for codon, freq in observed.items():
        self.assertFloatEqual(freq, expected_freqs.get(codon, 0))
def test_init_string(self):
    """CodonUsage should count codons in string

    The trailing two-letter remainder is expected to be counted under its
    own key before normalization and to be absent from the normalized
    result.
    """
    usage = CodonUsage('UUUCCCUUUUUUGA')
    raw_counts = CodonUsage({'UUU': 3, 'CCC': 1, 'GA': 1})
    self.assertEqual(usage, raw_counts)

    usage.normalize()
    normalized = CodonUsage({'UUU': 0.75, 'CCC': 0.25})
    self.assertEqual(usage, normalized)
def test_codons(self):
    """CodonUsage codons should return same object

    ``codons()`` on a CodonUsage must hand back the identical object, not
    an equal copy.
    """
    u = CodonUsage('abc')
    c = u.codons()
    # Identity check through the TestCase API: a bare ``assert`` would be
    # stripped when running under ``python -O``.
    self.assertTrue(u is c)
def test_aminoAcids(self):
    """CodonUsage aminoAcids should correctly count amino acids

    Verifies the stored counts, the amino acid totals under two genetic
    codes (set via the attribute and passed per call), and that the Info
    value survives into the resulting AminoAcidUsage.
    """
    codon_counts = {
        'UUC': 5,
        'AUA': 10,
        'AUG': 10,
        'CGC': 3,
        'AGG': 2,
        'XYZ': 8,
        'UAA': 2,
        'UGA': 1,
    }
    usage = CodonUsage(codon_counts, "test")
    self.assertEqual(usage.Info, 'test')
    for key, val in usage.items():
        wanted = codon_counts[key] if key in codon_counts else 0
        self.assertEqual(val, wanted)

    # Totals under the initial genetic code.
    totals = usage.aminoAcids()
    self.assertEqual(
        totals,
        AminoAcidUsage({'F': 5, 'I': 10, 'M': 10, 'R': 5, '*': 3, 'X': 8}))

    # check that it works with a different genetic code
    usage.GeneticCode = GeneticCodes['2']
    totals = usage.aminoAcids()
    self.assertEqual(
        totals,
        AminoAcidUsage({'F': 5, 'I': 0, 'M': 20, 'R': 3, '*': 4, 'W': 1,
                        'X': 8}))

    # check that it works if a genetic code is supplied explicitly
    usage.GeneticCode = GeneticCodes[1]
    totals = usage.aminoAcids()
    self.assertEqual(
        totals,
        AminoAcidUsage({'F': 5, 'I': 10, 'M': 10, 'R': 5, '*': 3, 'X': 8}))
    totals_code_2 = usage.aminoAcids(2)
    self.assertEqual(
        totals_code_2,
        AminoAcidUsage({'F': 5, 'I': 0, 'M': 20, 'R': 3, '*': 4, 'W': 1,
                        'X': 8}))

    # check that we held onto the info object through the above
    self.assertEqual(totals_code_2.Info, 'test')
def test_codons(self):
    """AminoAcidUsage codons should return most likely codon freqs

    Each case compares ``codons()`` against a normalized CodonUsage built
    from an explicit codon string; the final case supplies an unequal
    codon usage and checks per-codon frequencies.
    """
    # Single amino acid with four codons.
    residues = AminoAcidUsage('GGG')
    wanted = CodonUsage('GGUGGCGGAGGG')
    wanted.normalize()
    self.assertEqual(residues.codons(), wanted)

    # Single amino acid with two codons.
    residues = AminoAcidUsage('D')
    wanted = CodonUsage('GAUGAC')
    wanted.normalize()
    self.assertEqual(residues.codons(), wanted)

    # Mixture of amino acids with different codon counts.
    residues = AminoAcidUsage('GDDFMM')
    wanted = CodonUsage(
        'GGUGGCGGAGGG' + 'GAUGAC' * 4 + 'UUUUUC' * 2 + 'AUG' * 8)
    wanted.normalize()
    self.assertEqual(residues.codons(), wanted)

    # Mixture including the stop symbol.
    residues = AminoAcidUsage('II*')
    wanted = CodonUsage('AUUAUCAUA' * 2 + 'UAAUAGUGA')
    wanted.normalize()
    self.assertEqual(residues.codons(), wanted)

    # check that it works with a nonstandard code
    custom_code = GeneticCode('A' * 4 + 'C' * 28 + 'G' * 32)
    residues = AminoAcidUsage('AAA')
    wanted = CodonUsage('UUUUUCUUAUUG')
    wanted.normalize()
    self.assertEqual(residues.codons(custom_code), wanted)

    # check that it works with unequal codon frequencies
    unequal = CodonUsage({
        'GGU': 5,
        'GGC': 2,
        'GGA': 2,
        'GGG': 1,
        'UUU': 3,
        'UUC': 1,
    })
    residues = AminoAcidUsage('GFFF')
    wanted_freqs = {
        'GGU': 0.5 * 0.25,
        'GGC': 0.2 * 0.25,
        'GGA': 0.2 * 0.25,
        'GGG': 0.1 * 0.25,
        'UUU': 0.75 * 0.75,
        'UUC': 0.25 * 0.75,
    }
    result = residues.codons(codon_usage=unequal)
    for codon, freq in result.items():
        self.assertFloatEqual(freq, wanted_freqs.get(codon, 0))
def test_init_string(self):
    """CodonUsage should count codons in string

    Checks the raw counts obtained from a string (including the leftover
    two-letter fragment) and the frequencies expected after
    ``normalize()``.
    """
    counted = CodonUsage('UUUCCCUUUUUUGA')
    self.assertEqual(counted, CodonUsage({'UUU': 3, 'CCC': 1, 'GA': 1}))

    # After normalizing, only the two complete codons are expected.
    counted.normalize()
    self.assertEqual(counted, CodonUsage({'UUU': 0.75, 'CCC': 0.25}))