def test_build(self):
    """Run codonalign.build on each fixture pair; no result is inspected."""
    # The return values are deliberately discarded: the test passes as long
    # as none of the calls raises.
    codonalign.build(self.aln1, self.seqlist1)
    codonalign.build(self.aln2, self.seqlist2)
    # Third fixture is built with its own codon table.
    codonalign.build(self.aln3, self.seqlist3, codon_table=self.codontable3)
    # First fixture again, this time with complete_protein=True.
    codonalign.build(self.aln1, self.seqlist1, complete_protein=True)
def setUp(self):
    """Build one codon alignment per test data set and store them in ``self.alns``.

    Each element of ``self.aln_file`` is indexed as ``dataset[0]`` (file
    paths: nucleotide FASTA, protein Clustal alignment and, for the 'id'
    mode, an id-correspondence file) and ``dataset[1]`` (the loading mode:
    'parse', 'index' or 'id').
    """
    self.aln_file = [
        TEST_ALIGN_FILE1,
        TEST_ALIGN_FILE2,
        TEST_ALIGN_FILE3,
        TEST_ALIGN_FILE4,
        TEST_ALIGN_FILE5,
        TEST_ALIGN_FILE6,
    ]
    codon_alignments = []
    for dataset in self.aln_file:
        mode = dataset[1]
        if mode == 'parse':
            paths = dataset[0]
            nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                # Silence every warning emitted while building.
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet)
        elif mode == 'index':
            paths = dataset[0]
            nucl = SeqIO.index(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
        elif mode == 'id':
            paths = dataset[0]
            nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with open(paths[2]) as handle:
                # First whitespace-separated field maps to the second one.
                corr = dict((fields[0], fields[1])
                            for fields in (line.split() for line in handle))
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, corr_dict=corr,
                    alphabet=codonalign.default_codon_alphabet)
        codon_alignments.append(caln)
        # close() is called for every mode here, not only for the indexed file.
        nucl.close()
    self.alns = codon_alignments
def setUp(self):
    """Build one codon alignment per test data set and store them in ``self.alns``.

    Each element of ``self.aln_file`` is indexed as ``dataset[0]`` (file
    paths: nucleotide FASTA, protein Clustal alignment and, for the "id"
    mode, an id-correspondence file) and ``dataset[1]`` (the loading mode:
    "parse", "index" or "id").
    """
    self.aln_file = [
        TEST_ALIGN_FILE1,
        TEST_ALIGN_FILE2,
        TEST_ALIGN_FILE3,
        TEST_ALIGN_FILE4,
        TEST_ALIGN_FILE5,
        TEST_ALIGN_FILE6,
    ]
    codon_alignments = []
    for dataset in self.aln_file:
        mode = dataset[1]
        if mode == "parse":
            paths = dataset[0]
            nucl = SeqIO.parse(paths[0], "fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                # Silence every warning emitted while building.
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet)
        elif mode == "index":
            paths = dataset[0]
            nucl = SeqIO.index(paths[0], "fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
            # Deliberately using a fancy protein alphabet for testing:
            prot = AlignIO.read(paths[1], "clustal", alphabet=generic_protein)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
            nucl.close()  # Only the indexed FASTA file is closed explicitly
        elif mode == "id":
            paths = dataset[0]
            nucl = SeqIO.parse(paths[0], "fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
            with open(paths[2]) as handle:
                # First whitespace-separated field maps to the second one.
                corr = {fields[0]: fields[1]
                        for fields in (line.split() for line in handle)}
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl, corr_dict=corr,
                    alphabet=codonalign.default_codon_alphabet)
        codon_alignments.append(caln)
    self.alns = codon_alignments
def setUp(self):
    """Build one codon alignment per test data set and store them in ``self.alns``.

    Each element of ``self.aln_file`` is indexed as ``dataset[0]`` (file
    paths: nucleotide FASTA, protein Clustal alignment and, for the 'id'
    mode, an id-correspondence file) and ``dataset[1]`` (the loading mode:
    'parse', 'index' or 'id').
    """
    self.aln_file = [
        TEST_ALIGN_FILE1,
        TEST_ALIGN_FILE2,
        TEST_ALIGN_FILE3,
        TEST_ALIGN_FILE4,
        TEST_ALIGN_FILE5,
        TEST_ALIGN_FILE6,
    ]
    codon_alignments = []
    for dataset in self.aln_file:
        mode = dataset[1]
        if mode == 'parse':
            paths = dataset[0]
            nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                # Silence every warning emitted while building.
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet)
        elif mode == 'index':
            paths = dataset[0]
            nucl = SeqIO.index(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            # Deliberately using a fancy protein alphabet for testing:
            prot = AlignIO.read(paths[1], 'clustal',
                                alphabet=Gapped(IUPAC.ExtendedIUPACProtein()))
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
        elif mode == 'id':
            paths = dataset[0]
            nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with open(paths[2]) as handle:
                # First whitespace-separated field maps to the second one.
                corr = dict((fields[0], fields[1])
                            for fields in (line.split() for line in handle))
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, corr_dict=corr,
                    alphabet=codonalign.default_codon_alphabet)
        codon_alignments.append(caln)
        # close() is called for every mode here, not only for the indexed file.
        nucl.close()
    self.alns = codon_alignments
def setUp(self):
    """Build one codon alignment per test data set and store them in ``self.alns``.

    Each element of ``self.aln_file`` is indexed as ``dataset[0]`` (file
    paths: nucleotide FASTA, protein Clustal alignment and, for the "id"
    mode, an id-correspondence file) and ``dataset[1]`` (the loading mode:
    "parse", "index" or "id").
    """
    self.aln_file = [
        TEST_ALIGN_FILE1,
        TEST_ALIGN_FILE2,
        TEST_ALIGN_FILE3,
        TEST_ALIGN_FILE4,
        TEST_ALIGN_FILE5,
        TEST_ALIGN_FILE6,
    ]
    codon_alignments = []
    for dataset in self.aln_file:
        mode = dataset[1]
        if mode == "parse":
            paths = dataset[0]
            nucl = SeqIO.parse(paths[0], "fasta")
            prot = AlignIO.read(paths[1], "clustal")
            with warnings.catch_warnings():
                # Silence every warning emitted while building.
                warnings.simplefilter("ignore")
                caln = codonalign.build(prot, nucl)
        elif mode == "index":
            paths = dataset[0]
            nucl = SeqIO.index(paths[0], "fasta")
            prot = AlignIO.read(paths[1], "clustal")
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(prot, nucl, max_score=20)
            nucl.close()  # Only the indexed FASTA file is closed explicitly
        elif mode == "id":
            paths = dataset[0]
            nucl = SeqIO.parse(paths[0], "fasta")
            prot = AlignIO.read(paths[1], "clustal")
            with open(paths[2]) as handle:
                # First whitespace-separated field maps to the second one.
                corr = {fields[0]: fields[1]
                        for fields in (line.split() for line in handle)}
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(prot, nucl, corr_dict=corr)
        codon_alignments.append(caln)
    self.alns = codon_alignments
def setUp(self):
    """Build the codon alignment for TEST_ALIGN_FILE6 and store it in ``self.aln``.

    ``TEST_ALIGN_FILE6[0]`` supplies, in order, the nucleotide FASTA file,
    the protein Clustal alignment and the id-correspondence file.
    """
    paths = TEST_ALIGN_FILE6[0]
    nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
    prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
    with open(paths[2]) as handle:
        # First whitespace-separated field of each line maps to the second.
        id_corr = dict((fields[0], fields[1])
                       for fields in (line.split() for line in handle))
    self.aln = codonalign.build(
        prot, nucl, corr_dict=id_corr,
        alphabet=codonalign.default_codon_alphabet)
def test_mk(self):
    """Check that codonalign.mktest returns 0.0021 (4 places) for the FILE7 data.

    The codon alignment is built from the indexed nucleotide FASTA file and
    the protein Clustal alignment in ``TEST_ALIGN_FILE7[0]``, then split
    into three row groups (1-11, 12-15, 16-end) that are passed to mktest.
    """
    p = SeqIO.index(TEST_ALIGN_FILE7[0][0], "fasta")
    try:
        pro_aln = AlignIO.read(TEST_ALIGN_FILE7[0][1], "clustal")
        codon_aln = codonalign.build(pro_aln, p)
    finally:
        # Close the indexed FASTA file even if reading or building fails,
        # so a failing test does not leak the open handle.
        p.close()
    groups = [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]
    self.assertAlmostEqual(codonalign.mktest(groups), 0.0021, places=4)
def setUp(self):
    """Build the codon alignment for TEST_ALIGN_FILE6 and store it in ``self.aln``.

    ``TEST_ALIGN_FILE6[0]`` supplies, in order, the nucleotide FASTA file,
    the protein Clustal alignment and the id-correspondence file.
    """
    paths = TEST_ALIGN_FILE6[0]
    nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
    prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
    with open(paths[2]) as handle:
        # First whitespace-separated field of each line maps to the second.
        id_corr = dict((fields[0], fields[1])
                       for fields in (line.split() for line in handle))
    with warnings.catch_warnings():
        # Only BiopythonWarning (and subclasses) is silenced during the build.
        warnings.simplefilter('ignore', BiopythonWarning)
        codon_alignment = codonalign.build(
            prot, nucl, corr_dict=id_corr,
            alphabet=codonalign.default_codon_alphabet)
    self.aln = codon_alignment
def test_ValueError(self):
    """Check that ValueError is thrown for Alignments of different lengths"""
    # The fixtures hold two rows; a third record is added here so that the
    # codon alignment under test has three rows.
    extra_protein = SeqRecord(Seq('M--', alphabet=generic_protein), id='pro3')
    extra_dna = SeqRecord(Seq('ATG', alphabet=generic_dna), id='pro3')
    three_row_aln = MultipleSeqAlignment([self.pro1, self.pro2, extra_protein])
    triple_codon = codonalign.build(
        three_row_aln, [self.seq1, self.seq2, extra_dna])
    # Adding either two-row alignment must be rejected.
    for two_row_aln in (self.multi_aln, self.codon_aln):
        with self.assertRaises(ValueError):
            triple_codon + two_row_aln
def setUp(self):
    """Build the codon alignment for TEST_ALIGN_FILE6 and store it in ``self.aln``.

    ``TEST_ALIGN_FILE6[0]`` supplies, in order, the nucleotide FASTA file,
    the protein Clustal alignment and the id-correspondence file.
    """
    paths = TEST_ALIGN_FILE6[0]
    nucl = SeqIO.parse(paths[0], "fasta")
    prot = AlignIO.read(paths[1], "clustal")
    with open(paths[2]) as handle:
        # First whitespace-separated field of each line maps to the second.
        id_corr = {fields[0]: fields[1]
                   for fields in (line.split() for line in handle)}
    with warnings.catch_warnings():
        # Only BiopythonWarning (and subclasses) is silenced during the build.
        warnings.simplefilter("ignore", BiopythonWarning)
        codon_alignment = codonalign.build(prot, nucl, corr_dict=id_corr)
    self.aln = codon_alignment
def test_ValueError(self):
    """Check that ValueError is thrown for Alignments of different lengths."""
    # The fixtures hold two rows; a third record is added here so that the
    # codon alignment under test has three rows.
    extra_protein = SeqRecord(Seq("M--", alphabet=generic_protein), id="pro3")
    extra_dna = SeqRecord(Seq("ATG", alphabet=generic_dna), id="pro3")
    three_row_aln = MultipleSeqAlignment([self.pro1, self.pro2, extra_protein])
    triple_codon = codonalign.build(
        three_row_aln, [self.seq1, self.seq2, extra_dna])
    # Adding either two-row alignment must be rejected.
    for two_row_aln in (self.multi_aln, self.codon_aln):
        with self.assertRaises(ValueError):
            triple_codon + two_row_aln
def setUp(self):
    """Create the two-row DNA/protein fixtures used by the addition tests.

    Sets ``seq1``/``seq2`` (DNA), ``pro1``/``pro2`` (protein), ``aln``
    (protein alignment), ``codon_aln`` (built from the former) and
    ``multi_aln`` (a plain two-row DNA alignment of 'AAA').
    """
    def dna_record(letters, name):
        # DNA SeqRecord with the generic DNA alphabet.
        return SeqRecord(Seq(letters, alphabet=generic_dna), id=name)

    def protein_record(letters, name):
        # Protein SeqRecord with the generic protein alphabet.
        return SeqRecord(Seq(letters, alphabet=generic_protein), id=name)

    self.seq1 = dna_record('ATGTCTCGT', 'pro1')
    self.seq2 = dna_record('ATGCGT', 'pro2')
    self.pro1 = protein_record('MSR', 'pro1')
    self.pro2 = protein_record('M-R', 'pro2')
    self.aln = MultipleSeqAlignment([self.pro1, self.pro2])
    self.codon_aln = codonalign.build(self.aln, [self.seq1, self.seq2])
    self.multi_aln = MultipleSeqAlignment(
        [dna_record('AAA', 'pro1'), dna_record('AAA', 'pro2')])
def setUp(self):
    """Create the two-row DNA/protein fixtures used by the addition tests.

    Sets ``seq1``/``seq2`` (DNA), ``pro1``/``pro2`` (protein), ``aln``
    (protein alignment), ``codon_aln`` (built from the former) and
    ``multi_aln`` (a plain two-row alignment of "AAA").
    """
    def record(letters, name):
        # SeqRecord wrapping a plain Seq with the given id.
        return SeqRecord(Seq(letters), id=name)

    self.seq1 = record("ATGTCTCGT", "pro1")
    self.seq2 = record("ATGCGT", "pro2")
    self.pro1 = record("MSR", "pro1")
    self.pro2 = record("M-R", "pro2")
    self.aln = MultipleSeqAlignment([self.pro1, self.pro2])
    self.codon_aln = codonalign.build(self.aln, [self.seq1, self.seq2])
    self.multi_aln = MultipleSeqAlignment(
        [record("AAA", "pro1"), record("AAA", "pro2")])
def test_mk(self):
    """Check that codonalign.mktest returns 0.0021 (4 places) for the FILE7 data.

    The codon alignment is built from the indexed nucleotide FASTA file and
    the protein Clustal alignment in ``TEST_ALIGN_FILE7[0]``, then split
    into three row groups (1-11, 12-15, 16-end) that are passed to mktest.
    """
    p = SeqIO.index(TEST_ALIGN_FILE7[0][0], 'fasta',
                    alphabet=IUPAC.IUPACUnambiguousDNA())
    try:
        pro_aln = AlignIO.read(TEST_ALIGN_FILE7[0][1], 'clustal',
                               alphabet=IUPAC.protein)
        codon_aln = codonalign.build(pro_aln, p)
    finally:
        # Close the indexed FASTA file even if reading or building fails,
        # so a failing test does not leak the open handle.
        p.close()
    groups = [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]
    self.assertAlmostEqual(codonalign.mktest(groups), 0.0021, places=4)
def test_build(self):
    """Run codonalign.build on each fixture pair; no result is inspected."""
    # The return values are deliberately discarded: the test passes as long
    # as none of the calls raises.
    codonalign.build(self.aln1, self.seqlist1)
    codonalign.build(self.aln2, self.seqlist2)
    # Third fixture is built with its own codon table.
    codonalign.build(self.aln3, self.seqlist3, codon_table=self.codontable3)
def test_mk(self):
    """Check that codonalign.mktest returns 0.0021 (4 places) for the FILE7 data.

    The codon alignment is built from the indexed nucleotide FASTA file and
    the protein Clustal alignment in ``TEST_ALIGN_FILE7[0]``, then split
    into three row groups (1-11, 12-15, 16-end) that are passed to mktest.
    """
    p = SeqIO.index(TEST_ALIGN_FILE7[0][0], 'fasta',
                    alphabet=IUPAC.IUPACUnambiguousDNA())
    try:
        pro_aln = AlignIO.read(TEST_ALIGN_FILE7[0][1], 'clustal',
                               alphabet=IUPAC.protein)
        codon_aln = codonalign.build(pro_aln, p)
    finally:
        # Close the indexed FASTA file even if reading or building fails,
        # so a failing test does not leak the open handle.
        p.close()
    groups = [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]
    # The explicit round() is kept so the accepted tolerance is unchanged.
    p_value = round(codonalign.mktest(groups), 4)
    self.assertAlmostEqual(p_value, 0.0021, places=4)
nucl = SeqIO.parse(file,"fasta", alphabet=IUPAC.IUPACUnambiguousDNA()) return nucl #print "Gene\tdN \tdS\tdN/DS" i = 0 n = 1000 print "gene\txylogr\ttrape\tagyri\tgraphi\tlambie\tcladon\txantho" outfile = open("output.txt", "a") outfile.write("gene\txylogr\ttrape\tagyri\tgraphi\tlambie\tcladon\txantho\n") outfile.close() for prot_file, dna_file in zip(protein_file_names, dna_file_names): outfile = open("output.txt", "a") prot = align_next_file(prot_file) dna = import_next_file(dna_file) codon = codonalign.build(prot, dna, alphabet=codonalign.default_codon_alphabet) #dN, dS = cal_dn_ds(codon[0], codon[1], method='NG86') dn_matrix, ds_matrix = codon.get_dn_ds_matrix(method="ML") length = len(sorted(dn_matrix.matrix,key=len,reverse=True)[0]) #although this is always the same depending on the number of species dn_mat = np.array([xi+[0.0]*(length-len(xi)) for xi in dn_matrix.matrix]) ds_mat = np.array([xi+[0.0]*(length-len(xi)) for xi in ds_matrix.matrix]) sum_dn_col = list(np.sum(dn_mat, axis=0)) sum_dn_row = list(np.sum(dn_mat, axis=1)) sum_dn_col = [x/length for x in sum_dn_col] sum_dn_row = [x/length for x in sum_dn_row] sum_ds_col = list(np.sum(ds_mat, axis=0)) sum_ds_row = list(np.sum(ds_mat, axis=1)) sum_ds_col = [x/length for x in sum_ds_col] sum_ds_row = [x/length for x in sum_ds_row] marg_dn = [x+y for x,y in zip(sum_dn_col,sum_dn_row)] marg_ds = [x+y for x,y in zip(sum_ds_col,sum_ds_row)]
def test_build(self):
    """Run codonalign.build on each fixture pair; no result is inspected."""
    # The return values are deliberately discarded: the test passes as long
    # as none of the calls raises.
    codonalign.build(self.aln1, self.seqlist1)
    codonalign.build(self.aln2, self.seqlist2)
#print "Gene\tdN \tdS\tdN/DS" i = 0 n = 1000 print "gene\txylogr\ttrape\tagyri\tgraphi\tlambie\tcladon\txantho" outfile = open("output.txt", "a") outfile.write("gene\txylogr\ttrape\tagyri\tgraphi\tlambie\tcladon\txantho\n") outfile.close() for prot_file, dna_file in zip(protein_file_names, dna_file_names): outfile = open("output.txt", "a") prot = align_next_file(prot_file) dna = import_next_file(dna_file) codon = codonalign.build(prot, dna, alphabet=codonalign.default_codon_alphabet) #dN, dS = cal_dn_ds(codon[0], codon[1], method='NG86') dn_matrix, ds_matrix = codon.get_dn_ds_matrix(method="ML") length = len( sorted(dn_matrix.matrix, key=len, reverse=True)[0] ) #although this is always the same depending on the number of species dn_mat = np.array( [xi + [0.0] * (length - len(xi)) for xi in dn_matrix.matrix]) ds_mat = np.array( [xi + [0.0] * (length - len(xi)) for xi in ds_matrix.matrix]) sum_dn_col = list(np.sum(dn_mat, axis=0)) sum_dn_row = list(np.sum(dn_mat, axis=1)) sum_dn_col = [x / length for x in sum_dn_col] sum_dn_row = [x / length for x in sum_dn_row] sum_ds_col = list(np.sum(ds_mat, axis=0))
def _strip_gapped_columns(row1, row2):
    """Return both rows as strings without the columns where either holds "-"."""
    kept1 = []
    kept2 = []
    for x, z in zip(row1, row2):
        if x == "-" or z == "-":
            continue
        kept1.append(x)
        kept2.append(z)
    return "".join(kept1), "".join(kept2)


def _count_mismatches(first, second):
    """Count the positions (up to the shorter input) where the inputs differ."""
    return sum(1 for a, b in zip(first, second) if a != b)


def _alphabetic_third_positions(seq):
    """Return the alphabetic characters found at indices 2, 5, 8, ... of *seq*."""
    return "".join(c for c in seq[2::3] if c.isalpha())


def _print_row(*fields):
    """Print *fields* on one line separated by " ; "."""
    print(*fields, sep=" ; ")


def divergence():
    """Print divergence statistics for position-paired sequences of two species.

    Command-line arguments (read from ``sys.argv``):
        1. DNA sequences of species 1 (FASTA)
        2. DNA sequences of species 2 (FASTA)
        3. protein sequences of species 1 (FASTA)
        4. protein sequences of species 2 (FASTA)
        5. output file path
        6. method name forwarded to ``cal_dn_ds``
        7. path to the MUSCLE executable

    Records are paired by their position in the four input files. For each
    pair the two proteins are aligned with MUSCLE (through the files
    "outfile_unaligned.fa" and "outfile_aligned.aln" in the working
    directory), a codon alignment is built, and one " ; "-separated row is
    printed to stdout: id, dN, dS, third-position distance, raw distance,
    both DNA lengths, both GC values, their mean and the shorter length.
    When ``cal_dn_ds`` raises ValueError, ZeroDivisionError or KeyError the
    dN and dS columns are printed as 9.999. Any other error for a pair
    prints a traceback and a message, and processing continues with the
    next pair.
    """
    fic1dna = sys.argv[1]  # DNA sequences of species 1
    fic2dna = sys.argv[2]  # DNA sequences of species 2
    fic1prot = sys.argv[3]  # protein sequences of species 1
    fic2prot = sys.argv[4]  # protein sequences of species 2
    outfile_path = sys.argv[5]  # output file
    method = sys.argv[6]  # method passed to cal_dn_ds
    muscle_exe = sys.argv[7]  # path to the MUSCLE executable

    # NOTE(review): the output file is created/truncated and closed at the
    # end, but nothing is ever written to it - every row goes to stdout.
    # That behaviour is preserved here; confirm whether rows were meant to
    # be written to this file instead.
    with open(outfile_path, "w", encoding='utf-8'):
        # Load every record up front so pairs can be addressed by index.
        seq1dna = list(
            SeqIO.parse(fic1dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq2dna = list(
            SeqIO.parse(fic2dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq1prot = list(SeqIO.parse(fic1prot, "fasta", alphabet=IUPAC.protein))
        seq2prot = list(SeqIO.parse(fic2prot, "fasta", alphabet=IUPAC.protein))

        print("Nombre de paires de sequences a analyser: ", len(seq1dna))
        # Header row. NOTE(review): the last column is labelled
        # "Mean_length" but the value printed is the shorter of the two DNA
        # lengths.
        print("seq.id", ";", "dN", ";", "dS", ";", "Dist_third_pos", ";",
              "Dist_brute", ";", "Length_seq_1", ";", "Length_seq2", ";",
              "GC_content_seq1", ";", "GC_content_seq2", ";", "GC", ";",
              "Mean_length")

        for u in range(len(seq1dna)):
            try:
                # ---- Align the two proteins of this pair with MUSCLE ----
                nuc1 = SeqRecord(
                    Seq(str(seq1dna[u].seq),
                        alphabet=IUPAC.IUPACUnambiguousDNA()),
                    id='nuc1')
                nuc2 = SeqRecord(
                    Seq(str(seq2dna[u].seq),
                        alphabet=IUPAC.IUPACUnambiguousDNA()),
                    id='nuc2')
                protein1 = SeqRecord(
                    Seq(str(seq1prot[u].seq), alphabet=IUPAC.protein),
                    id='protein1')
                protein2 = SeqRecord(
                    Seq(str(seq2prot[u].seq), alphabet=IUPAC.protein),
                    id='protein2')
                # Two-record unaligned FASTA file used as MUSCLE input.
                with open("outfile_unaligned.fa", "w",
                          encoding='utf-8') as output_handle:
                    SeqIO.write(protein1, output_handle, "fasta")
                    SeqIO.write(protein2, output_handle, "fasta")
                muscle_cline = MuscleCommandline(muscle_exe,
                                                 input="outfile_unaligned.fa",
                                                 out="outfile_aligned.aln")
                muscle_cline()
                alns = AlignIO.read("outfile_aligned.aln", "fasta")
                # NOTE(review): assumes MUSCLE keeps the input order, i.e.
                # alns[0] is protein1 and alns[1] is protein2 - confirm.
                prot1 = SeqRecord(
                    Seq(str(alns[0].seq), alphabet=IUPAC.protein), id='pro1')
                prot2 = SeqRecord(
                    Seq(str(alns[1].seq), alphabet=IUPAC.protein), id='pro2')
                aln = MultipleSeqAlignment([prot1, prot2])
                codon_aln = codonalign.build(aln, [nuc1, nuc2])

                lengthseq1 = len(nuc1.seq)
                lengthseq2 = len(nuc2.seq)
                GCcontentseq1 = GC(nuc1.seq)
                GCcontentseq2 = GC(nuc2.seq)
                GC_mean = ((GCcontentseq1 + GCcontentseq2) / 2)
                Min_length = min(lengthseq1, lengthseq2)

                # ---- Divergence indices on the gap-free columns ----
                seq1, seq2 = _strip_gapped_columns(codon_aln[0], codon_aln[1])

                # Raw distance: share of differing sites. An empty gap-free
                # sequence raises ZeroDivisionError, handled below like any
                # other per-pair failure.
                distance_brute = round(
                    float(_count_mismatches(seq1, seq2) / len(seq1)), 3)

                # Third-position distance: differing third positions
                # divided by the number of third positions of sequence 2.
                seq1_third_pos = _alphabetic_third_positions(seq1)
                seq2_third_pos = _alphabetic_third_positions(seq2)
                distance_third_pos = round(
                    float(
                        _count_mismatches(seq1_third_pos, seq2_third_pos) /
                        len(seq2_third_pos)), 3)

                row_tail = (distance_third_pos, distance_brute, lengthseq1,
                            lengthseq2, GCcontentseq1, GCcontentseq2, GC_mean,
                            Min_length)

                # ---- dN and dS with the requested method ----
                try:
                    dN, dS = cal_dn_ds(codon_aln[0], codon_aln[1],
                                       method=method)
                except (ValueError, ZeroDivisionError, KeyError):
                    # The indices could not be computed (the original
                    # comment attributes this to excessive saturation);
                    # 9.999 is printed as a placeholder.
                    result = 9.999
                    _print_row(seq1dna[u].id, result, result, *row_tail)
                else:
                    _print_row(seq1dna[u].id, dN, dS, *row_tail)
            except Exception:
                # Best effort: report the failing pair and keep going.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                traceback.print_exc()
                print("Une erreur est survenue pour la sequence: ",
                      seq1dna[u].id, "vs", seq2dna[u].id)
def proteinToCodonAlignment(proteinAlignment, extraDnaSeqs=None):
    """Build a codon alignment matching *proteinAlignment*.

    DNA for the protein ids is fetched through
    ``patric_api.getSequenceOfFeatures`` and parsed as FASTA. Entries of
    *extraDnaSeqs* (a mapping from sequence id to DNA record, optional)
    override or supply DNA for ids present in the protein alignment.

    Returns the result of ``codonalign.build`` (called with
    ``max_score=1000``); each returned record receives a copy of the
    annotations of the protein record with the same id, when that record
    has any.

    Raises Exception when the set of DNA ids differs from the set of
    protein ids.
    """
    protSeqDict = {record.id: record for record in proteinAlignment}

    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')
    parsedDna = SeqIO.parse(StringIO(dnaFasta), "fasta",
                            alphabet=IUPAC.IUPACAmbiguousDNA())
    dnaSeqDict = SeqIO.to_dict(parsedDna)

    # Caller-supplied DNA takes precedence over what was fetched.
    if extraDnaSeqs:
        for seqId in protSeqDict:
            if seqId in extraDnaSeqs:
                dnaSeqDict[seqId] = extraDnaSeqs[seqId]
                if Debug:
                    LOG.write("appending extra DNA seq %s\n" % seqId)

    if set(dnaSeqDict) != set(protSeqDict):
        proteinIds = ", ".join(sorted(protSeqDict))
        dnaIds = ", ".join(sorted(dnaSeqDict))
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (proteinIds, dnaIds))

    # Empty DNA sequences are only reported, not removed.
    for seqId, dnaRecord in dnaSeqDict.items():
        if len(dnaRecord.seq) == 0:
            LOG.write("warning: seqId %s length of dna was zero\n" % seqId)

    # DNA records in the same order as the rows of the protein alignment.
    dnaSeqRecords = [dnaSeqDict[proteinSeq.id] for proteinSeq in proteinAlignment]
    if Debug:
        LOG.write("dna seqs has %d seqs\n" % (len(dnaSeqRecords)))

    returnValue = codonalign.build(pro_align=proteinAlignment,
                                   nucl_seqs=dnaSeqRecords,
                                   max_score=1000)
    for dnaSeq in returnValue:
        sourceAnnotations = protSeqDict[dnaSeq.id].annotations
        if sourceAnnotations:
            dnaSeq.annotations = sourceAnnotations.copy()
    return returnValue