def _generateScoresFromProbabilities(self):
    """Fill ``self.codon_syn_scores`` with one score per codon in ``self.codon_prob``.

    For each target codon ``to_codon``, every codon returned by
    ``translate.getSynonyms(to_codon, rna=False)`` is treated as a possible
    source codon.  The coefficient for moving from a source codon to the
    target is::

        ln(codon_prob[to] / codon_prob[from])
            - ln(codon_prob_from_nucleotide[to] / codon_prob_from_nucleotide[from])

    The stored score is the sum of these coefficients, each weighted by
    ``self.codon_prob_given_aa[from_codon]``.

    Reads ``self.codon_prob``, ``self.codon_prob_from_nucleotide`` and
    ``self.codon_prob_given_aa``; replaces ``self.codon_syn_scores`` with a
    new dict.  Returns nothing.

    NOTE(review): a zero probability in either table makes ``math.log`` or
    the division raise; callers are presumably expected to supply strictly
    positive probabilities -- confirm.
    """
    self.codon_syn_scores = {}
    for to_codon in self.codon_prob:
        sum_sc_i = 0.0
        # Accumulate the weighted coefficient for moving from each
        # alternative codon to this one.
        for from_codon in translate.getSynonyms(to_codon, rna=False):
            observed_log_ratio = math.log(
                self.codon_prob[to_codon] / self.codon_prob[from_codon])
            nucleotide_log_ratio = math.log(
                self.codon_prob_from_nucleotide[to_codon]
                / self.codon_prob_from_nucleotide[from_codon])
            s_from_to = observed_log_ratio - nucleotide_log_ratio
            sum_sc_i += self.codon_prob_given_aa[from_codon] * s_from_to
        self.codon_syn_scores[to_codon] = sum_sc_i
min_relad_value = 0.5 * min([v for v in relad_dict.values() if v > 0.0]) for k in relad_dict.keys(): if relad_dict[k] <= 0.0: relad_dict[k] = min_relad_value ln_relad_dict = dict([(k, math.log(v)) for (k, v) in relad_dict.items()]) # Assay the provided sequences for (id, seq) in seqs: line = "{0} Fop = {1:.4f}, CAI = {2:.4f}, GC = {3:.2f}\n".format( id, cai.getFop(seq, opt_codons), cai_fxn(seq), cai.getGC(seq)) info_outs.write(line) # If optimization is desired, do it. if options.optimize: info_outs.write("# Optimizing sequences...\n") gc = translate.geneticCode(rna=False) codons = {} opt_codon_dict = dict([(gc[c], c) for c in opt_codons]) opt_codon_dict['W'] = 'TGG' opt_codon_dict['M'] = 'ATG' opt_headers = [] opt_seqs = [] # optimize the codon sequences for (id, seq) in seqs: orig_codons = [c for c in translate.codons(seq)] prot_seq = translate.translate(seq) if not prot_seq is None: for aa in translate.AAs(): codons[aa] = [ c for c in translate.getCodonsForAA(aa, rna=False)
def test_run(self):
    """Check that Mantel-Haenszel log odds ratios add up across reference codons.

    The identity under test is ln odds X/Y + ln odds Y/Z = ln odds X/Z,
    checked for the alanine codon GCA:
    GCA->GCC plus GCC->GCT must equal GCA->GCT to within ``eps``.

    Two reference-codon maps are built.  In both, the synonymous codons
    of each amino acid are ordered by their reversed spelling; map 1
    points every codon at the first codon in that order and map 2 at the
    last.  GCA is then pinned explicitly to GCC (map 1) and GCT (map 2).

    The sequences are synthetic: ``randomProtein(1000)`` and
    ``mutProtein`` of it, reverse-translated with
    ``translate.randomReverseTranslate``.  ``random.seed(111)`` is set
    first to keep the run reproducible.
    """
    random.seed(111)
    gc = translate.geneticCode(rna=False)

    # Build a dictionary where each codon gets its reference codon.
    reference_codon_dict1 = {}
    reference_codon_dict2 = {}
    for codon in translate.AADNACodons():
        # sorted() builds a new list, so whatever getCodonsForAA returns
        # is never reordered in place.
        aa_codons = sorted(
            translate.getCodonsForAA(gc[codon], rna=False),
            key=lambda c: c[::-1])
        reference_codon_dict1[codon] = aa_codons[0]
        reference_codon_dict2[codon] = aa_codons[-1]
    # Focus on alanine (GCN): GCA -> GCC in map 1, GCA -> GCT in map 2.
    reference_codon_dict1['GCA'] = 'GCC'
    reference_codon_dict2['GCA'] = 'GCT'

    species = ['x', 'y']
    prot1 = randomProtein(1000)
    prot2 = mutProtein(prot1)
    prots = {"x": prot1, "y": prot2}
    algenes = dict(
        (s, translate.randomReverseTranslate(prots[s], bad_codon='---'))
        for s in species)

    gene_codon_tables1 = cai.getAkashi2x2TablesForORFRefCodon(
        cai.conservedAA, reference_codon_dict1,
        algenes['x'], prots['x'], [algenes['y']], [prots['y']],
        pseudocount=0, n_terminal_start=0)
    gene_codon_tables2 = cai.getAkashi2x2TablesForORFRefCodon(
        cai.conservedAA, reference_codon_dict2,
        algenes['x'], prots['x'], [algenes['y']], [prots['y']],
        pseudocount=0, n_terminal_start=0)

    eps = 1e-6
    for codon in ['GCA']:
        ref_codon1 = reference_codon_dict1[codon]
        ref_codon2 = reference_codon_dict2[codon]
        # Both references must encode the same amino acid as the codon.
        self.assertEqual(gc[codon], gc[ref_codon1])
        self.assertEqual(gc[codon], gc[ref_codon2])

        # codon to ref_codon1
        mh_res1 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables1[codon])
        sc_1_to_r1 = -mh_res1.ln_odds_ratio
        # ref_codon1 to ref_codon2
        mh_res2 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables2[ref_codon1])
        sc_r1_to_r2 = -mh_res2.ln_odds_ratio
        # codon to ref_codon2
        mh_res3 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables2[codon])
        sc_1_to_r2 = -mh_res3.ln_odds_ratio

        # Prediction from additivity.  assertLess fails on NaN exactly as
        # the plain `<` comparison would, but reports the actual gap.
        pred_sc_1_to_r2 = sc_1_to_r1 + sc_r1_to_r2
        self.assertLess(abs(sc_1_to_r2 - pred_sc_1_to_r2), eps)
def test_run(self):
    """Verify additivity of Mantel-Haenszel log odds ratios for codon GCA.

    Tests whether ln odds X/Y + ln odds Y/Z = ln odds X/Z, using alanine:
    the GCA->GCC value plus the GCC->GCT value must match the direct
    GCA->GCT value to within ``eps``.

    Each amino acid's synonymous codons are ordered by reversed spelling.
    ``reference_codon_dict1`` maps every codon to the first codon in that
    order and ``reference_codon_dict2`` to the last; GCA is then set
    explicitly to GCC and GCT respectively.

    Input sequences are synthetic (``randomProtein(1000)`` and its
    ``mutProtein`` derivative, reverse-translated by
    ``translate.randomReverseTranslate``), generated after
    ``random.seed(111)`` so the run is reproducible.
    """
    random.seed(111)
    gc = translate.geneticCode(rna=False)

    # Assign reference codons: one entry per codon in each dictionary.
    reference_codon_dict1 = {}
    reference_codon_dict2 = {}
    for codon in translate.AADNACodons():
        # Use sorted() so the list handed back by getCodonsForAA is left
        # untouched rather than sorted in place.
        aa_codons = sorted(
            translate.getCodonsForAA(gc[codon], rna=False),
            key=lambda c: c[::-1])
        reference_codon_dict1[codon] = aa_codons[0]
        reference_codon_dict2[codon] = aa_codons[-1]
    # Alanine (GCN): check GCA->GCC + GCC->GCT = GCA->GCT.
    reference_codon_dict1['GCA'] = 'GCC'
    reference_codon_dict2['GCA'] = 'GCT'

    species = ['x', 'y']
    prot1 = randomProtein(1000)
    prot2 = mutProtein(prot1)
    prots = {"x": prot1, "y": prot2}
    algenes = dict(
        (s, translate.randomReverseTranslate(prots[s], bad_codon='---'))
        for s in species)

    gene_codon_tables1 = cai.getAkashi2x2TablesForORFRefCodon(
        cai.conservedAA, reference_codon_dict1,
        algenes['x'], prots['x'], [algenes['y']], [prots['y']],
        pseudocount=0, n_terminal_start=0)
    gene_codon_tables2 = cai.getAkashi2x2TablesForORFRefCodon(
        cai.conservedAA, reference_codon_dict2,
        algenes['x'], prots['x'], [algenes['y']], [prots['y']],
        pseudocount=0, n_terminal_start=0)

    eps = 1e-6
    for codon in ['GCA']:
        ref_codon1 = reference_codon_dict1[codon]
        ref_codon2 = reference_codon_dict2[codon]
        # The codon and both of its references must be synonymous.
        self.assertEqual(gc[codon], gc[ref_codon1])
        self.assertEqual(gc[codon], gc[ref_codon2])

        # codon to ref_codon1
        mh_res1 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables1[codon])
        sc_1_to_r1 = -mh_res1.ln_odds_ratio
        # ref_codon1 to ref_codon2
        mh_res2 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables2[ref_codon1])
        sc_r1_to_r2 = -mh_res2.ln_odds_ratio
        # codon to ref_codon2
        mh_res3 = stats.MantelHaenszelOddsRatioVariance(
            gene_codon_tables2[codon])
        sc_1_to_r2 = -mh_res3.ln_odds_ratio

        # Prediction from additivity.  assertLess keeps the strict `<`
        # semantics (a NaN difference still fails) and prints the gap.
        pred_sc_1_to_r2 = sc_1_to_r1 + sc_r1_to_r2
        self.assertLess(abs(sc_1_to_r2 - pred_sc_1_to_r2), eps)
# Should be using some sort of better estimator! min_relad_value = 0.5 * min([v for v in relad_dict.values() if v>0.0]) for k in relad_dict.keys(): if relad_dict[k] <= 0.0: relad_dict[k] = min_relad_value ln_relad_dict = dict([(k,math.log(v)) for (k,v) in relad_dict.items()]) # Assay the provided sequences for (id, seq) in seqs: line = "{0} Fop = {1:.4f}, CAI = {2:.4f}, GC = {3:.2f}\n".format(id, cai.getFop(seq, opt_codons), cai_fxn(seq), cai.getGC(seq)) info_outs.write(line) # If optimization is desired, do it. if options.optimize: info_outs.write("# Optimizing sequences...\n") gc = translate.geneticCode(rna=False) codons = {} opt_codon_dict = dict([(gc[c],c) for c in opt_codons]) opt_codon_dict['W'] = 'TGG' opt_codon_dict['M'] = 'ATG' opt_headers = [] opt_seqs = [] # optimize the codon sequences for (id, seq) in seqs: orig_codons = [c for c in translate.codons(seq)] prot_seq = translate.translate(seq) if not prot_seq is None: for aa in translate.AAs(): codons[aa] = [c for c in translate.getCodonsForAA(aa, rna=False) if relad_dict[c] >= options.min_rel_adapt] opt_seq = ''