Example #1
0
        # Lookup tables used while re-coding each sequence.
        gc = translate.geneticCode(rna=False)
        codons = {}
        # Maps amino acid -> one codon from opt_codons (if several codons in
        # opt_codons encode the same amino acid, the last one wins).
        opt_codon_dict = dict([(gc[c], c) for c in opt_codons])
        opt_codon_dict['W'] = 'TGG'
        opt_codon_dict['M'] = 'ATG'

        opt_headers = []
        opt_seqs = []
        # optimize the codon sequences
        # NOTE(review): `id` shadows the builtin; left unchanged because code
        # outside this fragment may refer to it.
        for (id, seq) in seqs:
            orig_codons = list(translate.codons(seq))
            prot_seq = translate.translate(seq)
            # Sequences for which translate.translate() returned None are skipped.
            if prot_seq is not None:
                # Candidate codons per amino acid: those whose relad_dict value
                # is at least options.min_rel_adapt.
                for aa in translate.AAs():
                    codons[aa] = [
                        c for c in translate.getCodonsForAA(aa, rna=False)
                        if relad_dict[c] >= options.min_rel_adapt
                    ]
                chosen_codons = []
                for (aai, aa) in enumerate(prot_seq):
                    codons_to_choose_from = codons[aa]
                    # If avoiding codons and we have a choice, eliminate the avoided codon.
                    if options.avoid_sequence and len(
                            codons_to_choose_from) > 1:
                        # Work on a copy: removing from codons[aa] itself would
                        # permanently shrink the candidate list for every later
                        # position that encodes the same amino acid.
                        codons_to_choose_from = list(codons_to_choose_from)
                        try:
                            codons_to_choose_from.remove(orig_codons[aai])
                        except ValueError:  # codon to be avoided not among codon choices anyway
                            pass
                    chosen_codons.append(random.choice(codons_to_choose_from))
                opt_seq = ''.join(chosen_codons)
                # Sanity check: the re-coded sequence must encode the same protein.
                assert translate.translate(opt_seq) == prot_seq
Example #2
0
File: cai_test.py Project: dad/base
	def test_run(self):
		"""Check additivity of log-odds scores across reference codons.

		Tests whether ln odds X/Y + ln odds Y/Z = ln odds X/Z, focusing on
		alanine (GCN): the score for GCA->GCC plus the score for GCC->GCT
		should equal the score for GCA->GCT to within a small tolerance.
		"""
		# Fixed seed; presumably makes the random proteins/genes below reproducible.
		random.seed(111)

		# Build two dictionaries where each codon gets a reference codon.
		gc = translate.geneticCode(rna=False)
		reference_codon_dict1 = {}
		reference_codon_dict2 = {}
		for codon in translate.AADNACodons():
			aa = gc[codon]
			aa_codons = translate.getCodonsForAA(aa, rna=False)
			# Sort in alphabetical order by reverse.
			# NOTE(review): sorts the returned list in place; if
			# getCodonsForAA hands back a shared list this reorders it for
			# later callers too. Kept as-is so the seeded run is unchanged.
			aa_codons.sort(key=lambda x: x[::-1])
			reference_codon_dict1[codon] = aa_codons[0]
			reference_codon_dict2[codon] = aa_codons[-1]

		# Focus on alanine: GCN. Check if GCA->GCC + GCC->GCT = GCA->GCT
		reference_codon_dict1['GCA'] = 'GCC'
		reference_codon_dict2['GCA'] = 'GCT'

		# Two synthetic "species": a random protein and a mutated copy of it,
		# each reverse-translated into a gene.
		species = ['x','y']
		prot1 = randomProtein(1000)
		prot2 = mutProtein(prot1)
		prots = {"x":prot1, "y":prot2}
		algenes = dict([(s,translate.randomReverseTranslate(prots[s],bad_codon='---')) for s in species])

		# One set of 2x2 tables per reference-codon assignment.
		gene_codon_tables1 = cai.getAkashi2x2TablesForORFRefCodon(cai.conservedAA, reference_codon_dict1, algenes['x'], prots['x'], [algenes['y']], [prots['y']], pseudocount=0, n_terminal_start=0)
		gene_codon_tables2 = cai.getAkashi2x2TablesForORFRefCodon(cai.conservedAA, reference_codon_dict2, algenes['x'], prots['x'], [algenes['y']], [prots['y']], pseudocount=0, n_terminal_start=0)

		eps = 1e-6

		for codon in ['GCA']:
			ref_codon1 = reference_codon_dict1[codon]
			ref_codon2 = reference_codon_dict2[codon]
			# Both reference codons must be synonymous with the codon under test.
			self.assertEqual(gc[codon], gc[ref_codon1])
			self.assertEqual(gc[codon], gc[ref_codon2])
			# codon to ref_codon1
			mh_res1 = stats.MantelHaenszelOddsRatioVariance(gene_codon_tables1[codon])
			sc_1_to_r1 = -mh_res1.ln_odds_ratio
			# ref_codon1 to ref_codon2
			mh_res2 = stats.MantelHaenszelOddsRatioVariance(gene_codon_tables2[ref_codon1])
			sc_r1_to_r2 = -mh_res2.ln_odds_ratio
			# codon to ref_codon2
			mh_res3 = stats.MantelHaenszelOddsRatioVariance(gene_codon_tables2[codon])
			sc_1_to_r2 = -mh_res3.ln_odds_ratio
			# prediction from additivity
			pred_sc_1_to_r2 = sc_1_to_r1 + sc_r1_to_r2
			self.assertTrue(
				abs(sc_1_to_r2 - pred_sc_1_to_r2) < eps,
				"{0}->{2}: direct score {3} differs from {0}->{1}->{2} sum {4}".format(
					codon, ref_codon1, ref_codon2, sc_1_to_r2, pred_sc_1_to_r2))
Example #3
0
	data_outs = util.OutStreams()

	# Start up output: write to the requested file, or to stdout if none given.
	if options.out_fname is not None:
		# open() rather than the Python 2-only file() builtin.
		outf = open(options.out_fname, 'w')
		data_outs.addStream(outf)
	else:
		data_outs.addStream(sys.stdout)
	formatFxn = biofile.getIDFunction(options.format)
	cdna_dict = biofile.readFASTADict(in_fname, formatFxn)
	calc = Calculator()
	calc.initializeFromSequences(cdna_dict.values(), options.pseudocount)
	syn_dict = calc.getCodonSYNScores()
	# For each degenerate amino acid, pick the codon with the highest SYN score
	# (ties are broken by the codon string, via tuple ordering).
	syn_opt_codons = []
	for aa in translate.degenerateAAs():
		codons = translate.getCodonsForAA(aa, rna=False)
		best_syn_codon = sorted([(syn_dict[c],c) for c in codons])[-1][1]
		syn_opt_codons.append(best_syn_codon)
	data_outs.write("# Read {0}\n#{1:d} sequences, {2:d} codons, {3:d} nucleotides\n".format(in_fname, len(cdna_dict), int(sum(calc.codon_freq.values())), int(sum(calc.nucleotide_freq.values()))))
	data_outs.write("# syn_scores = {0!s}\n".format(syn_dict))
	data_outs.write("# SYN opt codons = {0!s}\n".format(sorted(syn_opt_codons)))
	data_outs.write("{0!s}".format(calc))

	if options.score_dict_fname is not None:
		# Close (and flush) the pickle file deterministically instead of
		# relying on garbage collection of an anonymous handle. Binary mode
		# is what pickle expects and is required on Python 3.
		with open(options.score_dict_fname, 'wb') as score_dict_outf:
			pickle.dump(syn_dict, score_dict_outf)

	if options.score_fname is not None:
		outf = open(options.score_fname, 'w')
		outf.write("orf\tsyn\n")
		orfs = cdna_dict.keys()
		n_written = 0
Example #4
0
    def test_run(self):
        """Check additivity of log-odds scores across reference codons.

        Tests whether ln odds X/Y + ln odds Y/Z = ln odds X/Z, focusing on
        alanine (GCN): the score for GCA->GCC plus the score for GCC->GCT
        should equal the score for GCA->GCT to within a small tolerance.
        """
        # Fixed seed; presumably makes the random proteins/genes below reproducible.
        random.seed(111)

        # Build two dictionaries where each codon gets a reference codon.
        gc = translate.geneticCode(rna=False)
        reference_codon_dict1 = {}
        reference_codon_dict2 = {}
        for codon in translate.AADNACodons():
            aa = gc[codon]
            aa_codons = translate.getCodonsForAA(aa, rna=False)
            # Sort in alphabetical order by reverse.
            # NOTE(review): sorts the returned list in place; if
            # getCodonsForAA hands back a shared list this reorders it for
            # later callers too. Kept as-is so the seeded run is unchanged.
            aa_codons.sort(key=lambda x: x[::-1])
            reference_codon_dict1[codon] = aa_codons[0]
            reference_codon_dict2[codon] = aa_codons[-1]

        # Focus on alanine: GCN. Check if GCA->GCC + GCC->GCT = GCA->GCT
        reference_codon_dict1['GCA'] = 'GCC'
        reference_codon_dict2['GCA'] = 'GCT'

        # Two synthetic "species": a random protein and a mutated copy of it,
        # each reverse-translated into a gene.
        species = ['x', 'y']
        prot1 = randomProtein(1000)
        prot2 = mutProtein(prot1)
        prots = {"x": prot1, "y": prot2}
        algenes = dict([(s,
                         translate.randomReverseTranslate(prots[s],
                                                          bad_codon='---'))
                        for s in species])

        # One set of 2x2 tables per reference-codon assignment.
        gene_codon_tables1 = cai.getAkashi2x2TablesForORFRefCodon(
            cai.conservedAA,
            reference_codon_dict1,
            algenes['x'],
            prots['x'], [algenes['y']], [prots['y']],
            pseudocount=0,
            n_terminal_start=0)
        gene_codon_tables2 = cai.getAkashi2x2TablesForORFRefCodon(
            cai.conservedAA,
            reference_codon_dict2,
            algenes['x'],
            prots['x'], [algenes['y']], [prots['y']],
            pseudocount=0,
            n_terminal_start=0)

        eps = 1e-6

        for codon in ['GCA']:
            ref_codon1 = reference_codon_dict1[codon]
            ref_codon2 = reference_codon_dict2[codon]
            # Both reference codons must be synonymous with the codon under test.
            self.assertEqual(gc[codon], gc[ref_codon1])
            self.assertEqual(gc[codon], gc[ref_codon2])
            # codon to ref_codon1
            mh_res1 = stats.MantelHaenszelOddsRatioVariance(
                gene_codon_tables1[codon])
            sc_1_to_r1 = -mh_res1.ln_odds_ratio
            # ref_codon1 to ref_codon2
            mh_res2 = stats.MantelHaenszelOddsRatioVariance(
                gene_codon_tables2[ref_codon1])
            sc_r1_to_r2 = -mh_res2.ln_odds_ratio
            # codon to ref_codon2
            mh_res3 = stats.MantelHaenszelOddsRatioVariance(
                gene_codon_tables2[codon])
            sc_1_to_r2 = -mh_res3.ln_odds_ratio
            # prediction from additivity
            pred_sc_1_to_r2 = sc_1_to_r1 + sc_r1_to_r2
            self.assertTrue(
                abs(sc_1_to_r2 - pred_sc_1_to_r2) < eps,
                "{0}->{2}: direct score {3} differs from {0}->{1}->{2} sum {4}"
                .format(codon, ref_codon1, ref_codon2, sc_1_to_r2,
                        pred_sc_1_to_r2))
Example #5
0
File: codonopt.py Project: dad/base
		info_outs.write("# Optimizing sequences...\n")
		# Lookup tables used while re-coding each sequence.
		gc = translate.geneticCode(rna=False)
		codons = {}
		# Maps amino acid -> one codon from opt_codons (if several codons in
		# opt_codons encode the same amino acid, the last one wins).
		opt_codon_dict = dict([(gc[c],c) for c in opt_codons])
		opt_codon_dict['W'] = 'TGG'
		opt_codon_dict['M'] = 'ATG'

		opt_headers = []
		opt_seqs = []
		# optimize the codon sequences
		# NOTE(review): `id` shadows the builtin; left unchanged because code
		# outside this fragment may refer to it.
		for (id, seq) in seqs:
			orig_codons = list(translate.codons(seq))
			prot_seq = translate.translate(seq)
			# Sequences for which translate.translate() returned None are skipped.
			if prot_seq is not None:
				# Candidate codons per amino acid: those whose relad_dict value
				# is at least options.min_rel_adapt.
				for aa in translate.AAs():
					codons[aa] = [c for c in translate.getCodonsForAA(aa, rna=False) if relad_dict[c] >= options.min_rel_adapt]
				chosen_codons = []
				for (aai, aa) in enumerate(prot_seq):
					codons_to_choose_from = codons[aa]
					# If avoiding codons and we have a choice, eliminate the avoided codon.
					if options.avoid_sequence and len(codons_to_choose_from)>1:
						# Work on a copy: removing from codons[aa] itself would
						# permanently shrink the candidate list for every later
						# position that encodes the same amino acid.
						codons_to_choose_from = list(codons_to_choose_from)
						try:
							codons_to_choose_from.remove(orig_codons[aai])
						except ValueError: # codon to be avoided not among codon choices anyway
							pass
					chosen_codons.append(random.choice(codons_to_choose_from))
				opt_seq = ''.join(chosen_codons)
				# Sanity check: the re-coded sequence must encode the same protein.
				assert translate.translate(opt_seq) == prot_seq
				header_line = "{0} Fop = {1:.4f}, CAI = {2:.4f}, GC = {3:.2f}".format(id, cai.getFop(opt_seq, opt_codons), cai_fxn(opt_seq), cai.getGC(opt_seq))
				info_outs.write("# Optimized {}\n".format(header_line))
				opt_headers.append(header_line)