def test_dn_ds(self):
    """Compare cal_dn_ds results for the first two aligned sequences with reference values.

    NG86 and LWL85 are always checked.  YN00 is only checked when scipy can
    be imported, and ML only when ``scipy.optimize.minimize`` can be imported
    as well; otherwise the remaining checks are skipped without any report.
    """
    from Bio.codonalign.codonseq import cal_dn_ds

    first, second = self.aln[0], self.aln[1]

    def check(method, expected_dn, expected_ds):
        # Both estimates are rounded to 4 decimals before being compared.
        estimated_dn, estimated_ds = cal_dn_ds(first, second, method=method)
        self.assertAlmostEqual(round(estimated_dn, 4), expected_dn, places=4)
        self.assertAlmostEqual(round(estimated_ds, 4), expected_ds, places=4)

    check("NG86", 0.0209, 0.0178)
    check("LWL85", 0.0203, 0.0164)

    try:
        import scipy  # noqa: F401 - imported only to probe availability
    except ImportError:
        # Silently skip the rest of the test
        return

    # With scipy installed, this import is expected to succeed.
    from scipy.linalg import expm  # noqa: F401

    check("YN00", 0.0198, 0.0222)

    try:
        # New in scipy v0.11
        from scipy.optimize import minimize  # noqa: F401

        check("ML", 0.0194, 0.0217)
    except ImportError:
        # TODO - Show a warning?
        pass
def calculate_piN_piS(codonseqs, method, codon_table, het=False):
    """Compute weighted piN, piS and their ratio for a list of CodonSeq() objects.

    For every unordered pair (i, j) of sequences, dN and dS are obtained from
    ``cal_dn_ds`` and multiplied by the product of entries i and j of
    ``seqfreqs(codonseqs)`` (presumably the sequence frequencies); the
    weighted values are summed into piN and piS.

    Args:
        codonseqs: list of CodonSeq() objects.
        method: method name handed to ``cal_dn_ds``; echoed in the result.
        codon_table: codon table handed to ``cal_dn_ds``.
        het: when false, an error raised for any pair propagates to the
            caller.  When true, a failing pair is left out of both sums
            (best effort).

    Returns:
        dict with the keys "seqname", "piN", "piS", "piNpiS", "pi" and
        "method".  "piNpiS" is 0 when piS is zero.  "seqname" ("") and
        "pi" (-1) are placeholders that this function never fills in.
    """
    analysis = {
        "seqname": "",
        "piN": -1,
        "piS": -1,
        "piNpiS": -1,
        "pi": -1,
        "method": method,
    }
    weights = seqfreqs(codonseqs)
    n_seqs = len(codonseqs)

    piN = 0
    piS = 0
    for i in range(n_seqs):
        for j in range(i + 1, n_seqs):
            try:
                dN, dS = cal_dn_ds(
                    codonseqs[i],
                    codonseqs[j],
                    codon_table=codon_table,
                    method=method,
                )
                pair_piN = weights[i] * weights[j] * dN
                pair_piS = weights[i] * weights[j] * dS
            except Exception:
                # Only the best-effort (het) mode tolerates a failing pair.
                # KeyboardInterrupt / SystemExit are never swallowed.
                if not het:
                    raise
                continue
            # Both sums are updated together, so a failing pair can never
            # contribute to piN without also contributing to piS.
            piN = piN + pair_piN
            piS = piS + pair_piS

    analysis["piN"] = piN
    analysis["piS"] = piS
    try:
        analysis["piNpiS"] = piN / piS
    except ZeroDivisionError:
        # No synonymous diversity (or no pairs at all): report a ratio of 0.
        analysis["piNpiS"] = 0
    return analysis
def get_dn_ds_matrix(self, method="NG86", codon_table=default_codon_table):
    """Build pairwise dN and dS distance matrices for the records.

    Argument:
     - method - Method handed to cal_dn_ds; available methods include
       NG86, LWL85, YN00 and ML.
     - codon_table - Codon table to use for forward translation.

    Returns a ``(dN, dS)`` tuple of ``_DistanceMatrix`` objects.  Each is
    built from lower-triangular rows: row ``i`` holds the values for records
    ``0 .. i-1`` followed by ``0.0`` for the record against itself.
    """
    from Bio.Phylo.TreeConstruction import _DistanceMatrix as DM

    records = self._records
    names = [record.id for record in records]

    dn_rows = []
    ds_rows = []
    for i in range(len(records)):
        dn_row = []
        ds_row = []
        for j in range(i):
            dn, ds = cal_dn_ds(
                records[i], records[j], method=method, codon_table=codon_table
            )
            dn_row.append(dn)
            ds_row.append(ds)
        # Diagonal entry: a record compared with itself.
        dn_row.append(0.0)
        ds_row.append(0.0)
        dn_rows.append(dn_row)
        ds_rows.append(ds_row)

    return DM(names, matrix=dn_rows), DM(names, matrix=ds_rows)
def get_dn_ds_matrix(self, method="NG86"):
    """Build pairwise dN and dS distance matrices for the records.

    ``method`` is handed to cal_dn_ds; available methods include NG86,
    LWL85, YN00 and ML.

    Returns a ``(dN, dS)`` tuple of ``_DistanceMatrix`` objects.  Each is
    built from lower-triangular rows that end with ``0.0`` on the diagonal.
    """
    from Bio.Phylo.TreeConstruction import _DistanceMatrix as DM

    records = self._records
    names = [record.id for record in records]

    dn_rows, ds_rows = [], []
    for i, current in enumerate(records):
        # Compare the current record with every record that precedes it.
        pairs = [
            cal_dn_ds(current, records[j], method=method) for j in range(i)
        ]
        # The trailing 0.0 is the record compared with itself.
        dn_rows.append([dn for dn, _ in pairs] + [0.0])
        ds_rows.append([ds for _, ds in pairs] + [0.0])

    return DM(names, matrix=dn_rows), DM(names, matrix=ds_rows)
def _strip_gapped_columns(aligned1, aligned2):
    """Return both aligned sequences as strings without any column that holds a gap."""
    kept1 = []
    kept2 = []
    for char1, char2 in zip(aligned1, aligned2):
        if char1 == "-" or char2 == "-":
            continue
        kept1.append(char1)
        kept2.append(char2)
    return "".join(kept1), "".join(kept2)


def _count_mismatches(left, right):
    """Count the positions at which two strings differ (over the shorter length)."""
    return sum(1 for a, b in zip(left, right) if a != b)


def _alphabetic_third_positions(sequence):
    """Return the letters found at every third position (indices 2, 5, 8, ...)."""
    return "".join(char for char in sequence[2::3] if char.isalpha())


def _analyse_pair(dna1, dna2, prot1, prot2, method, muscle_exe):
    """Align one protein pair with MUSCLE and return its row of divergence values.

    The row is ``(id of dna1, dN, dS, third-position distance, raw distance,
    length of sequence 1, length of sequence 2, GC of sequence 1,
    GC of sequence 2, mean GC, shorter length)``.
    """
    saturated = 9.999  # reported when dN/dS cannot be computed

    dna_alphabet = IUPAC.IUPACUnambiguousDNA()
    nuc1 = SeqRecord(Seq(str(dna1.seq), alphabet=dna_alphabet), id="nuc1")
    nuc2 = SeqRecord(Seq(str(dna2.seq), alphabet=dna_alphabet), id="nuc2")
    protein1 = SeqRecord(Seq(str(prot1.seq), alphabet=IUPAC.protein), id="protein1")
    protein2 = SeqRecord(Seq(str(prot2.seq), alphabet=IUPAC.protein), id="protein2")

    # Write the two unaligned proteins to a FASTA file for MUSCLE.
    with open("outfile_unaligned.fa", "w", encoding="utf-8") as output_handle:
        SeqIO.write(protein1, output_handle, "fasta")
        SeqIO.write(protein2, output_handle, "fasta")

    # MUSCLE reads the unaligned file and writes the aligned one.
    muscle_cline = MuscleCommandline(
        muscle_exe, input="outfile_unaligned.fa", out="outfile_aligned.aln"
    )
    muscle_cline()
    alns = AlignIO.read("outfile_aligned.aln", "fasta")

    # MUSCLE does not promise to keep the input order in its output, so the
    # aligned proteins are matched by id; the positional rows are the fallback.
    by_id = {record.id: record for record in alns}
    aligned1 = by_id.get("protein1", alns[0])
    aligned2 = by_id.get("protein2", alns[1])

    pro1 = SeqRecord(Seq(str(aligned1.seq), alphabet=IUPAC.protein), id="pro1")
    pro2 = SeqRecord(Seq(str(aligned2.seq), alphabet=IUPAC.protein), id="pro2")
    # Codon alignment built from the aligned proteins and their nucleotide sequences.
    codon_aln = codonalign.build(MultipleSeqAlignment([pro1, pro2]), [nuc1, nuc2])

    length1 = len(nuc1.seq)
    length2 = len(nuc2.seq)
    gc1 = GC(nuc1.seq)
    gc2 = GC(nuc2.seq)
    gc_mean = (gc1 + gc2) / 2
    min_length = min(length1, length2)

    # Raw distance: share of differing sites once gapped columns are dropped.
    seq1, seq2 = _strip_gapped_columns(codon_aln[0], codon_aln[1])
    distance_brute = round(_count_mismatches(seq1, seq2) / len(seq1), 3)

    # Same measure restricted to the third position of each codon.
    third1 = _alphabetic_third_positions(seq1)
    third2 = _alphabetic_third_positions(seq2)
    distance_third_pos = round(_count_mismatches(third1, third2) / len(third2), 3)

    try:
        dN, dS = cal_dn_ds(codon_aln[0], codon_aln[1], method=method)
    except (ValueError, ZeroDivisionError, KeyError):
        # Saturation too high for the indices to be computed.
        dN = dS = saturated

    return (
        dna1.id,
        dN,
        dS,
        distance_third_pos,
        distance_brute,
        length1,
        length2,
        gc1,
        gc2,
        gc_mean,
        min_length,
    )


def divergence():
    """Print a ';'-separated table of divergence values for paired sequences.

    Command-line arguments (``sys.argv``):
        1. FASTA file with the DNA sequences of species 1
        2. FASTA file with the DNA sequences of species 2
        3. FASTA file with the protein sequences of species 1
        4. FASTA file with the protein sequences of species 2
        5. output file path
        6. method handed to ``cal_dn_ds``
        7. path to the MUSCLE executable

    The u-th records of the four input files are analysed together.  One row
    per pair is printed to standard output; a pair that fails prints its
    traceback and an error line instead, and processing continues.

    Side effects: ``outfile_unaligned.fa`` and ``outfile_aligned.aln`` are
    (re)written in the working directory for every pair.

    NOTE(review): the file named by argument 5 is created/truncated but
    nothing is written to it -- all rows go to standard output.  Confirm
    whether the table was meant to be written to that file.
    """
    fic1dna = sys.argv[1]
    fic2dna = sys.argv[2]
    fic1prot = sys.argv[3]
    fic2prot = sys.argv[4]
    outfile_path = sys.argv[5]
    method = sys.argv[6]
    muscle_exe = sys.argv[7]

    # NOTE(review): the last column is labelled "Mean_length" but holds the
    # length of the shorter sequence; label kept for downstream consumers.
    columns = (
        "seq.id",
        "dN",
        "dS",
        "Dist_third_pos",
        "Dist_brute",
        "Length_seq_1",
        "Length_seq2",
        "GC_content_seq1",
        "GC_content_seq2",
        "GC",
        "Mean_length",
    )

    # The context manager guarantees the handle is closed even if the run aborts.
    with open(outfile_path, "w", encoding="utf-8"):
        seq1dna = list(
            SeqIO.parse(fic1dna, "fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
        )
        seq2dna = list(
            SeqIO.parse(fic2dna, "fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
        )
        seq1prot = list(SeqIO.parse(fic1prot, "fasta", alphabet=IUPAC.protein))
        seq2prot = list(SeqIO.parse(fic2prot, "fasta", alphabet=IUPAC.protein))

        print("Nombre de paires de sequences a analyser: ", len(seq1dna))
        print(" ; ".join(columns))

        for u in range(len(seq1dna)):
            try:
                row = _analyse_pair(
                    seq1dna[u],
                    seq2dna[u],
                    seq1prot[u],
                    seq2prot[u],
                    method,
                    muscle_exe,
                )
                print(" ; ".join(str(value) for value in row))
            except Exception:
                # A failing pair must not stop the run.  Ctrl-C and
                # SystemExit are deliberately not caught here.
                traceback.print_exc()
                print("Une erreur est survenue pour la sequence: ",
                      seq1dna[u].id, "vs", seq2dna[u].id)
# Fragment of a Python 2 script (print statement, list-returning .values()).
# NOTE(review): the enclosing loop header and the definitions of names, aln,
# aln1, sequences, s, Cap_seqs, Cpor_seqs and CGP are outside this excerpt;
# the nesting below was reconstructed from the statement order -- confirm
# against the original file.

# Position in names of the first entry containing 'Capybara' / 'Cavia'.
index_Cap = int([names.index(i) for i in names if 'Capybara' in i][0])
index_Cpor = int([names.index(i) for i in names if 'Cavia' in i][0])
# Take each species' sequence out of the alignment and record the second
# '|'-separated field of its name.
CapCDS = aln.takeSeqs([names[index_Cap]])
Cap_seqs.append(CapCDS.getSeqNames()[0].split('|')[1])
CporCDS = aln.takeSeqs([names[index_Cpor]])
Cpor_seqs.append(CporCDS.getSeqNames()[0].split('|')[1])
# NOTE(review): under Python 2, len(aln) / len(aln1) is integer division
# unless `from __future__ import division` is active, so the ratio would be
# 0 or 1 rather than a fraction -- verify that the 50% threshold works.
if ((len(aln) / len(aln1)) > 0.5):  # Only analyse alignments whose length, after gap removal, is more than 50% of the original length
    CapCDSstr = str(CapCDS.todict().values()[0])
    # NOTE(review): the next line repeats the previous assignment.
    CapCDSstr = str(CapCDS.todict().values()[0])
    CapCDSstr = CodonSeq(CapCDSstr)
    CporCDSstr = str(CporCDS.todict().values()[0])
    CporCDSstr = CodonSeq(CporCDSstr)
    try:
        # dN/dS ratio for the pair, defined as 0.0 when dS is 0.
        # NOTE(review): cal_dn_ds is evaluated up to three times with the
        # same arguments; a single call would give both values.
        if cal_dn_ds(CapCDSstr, CporCDSstr)[1] == 0.0:
            val1 = 0.0
        else:
            val1 = cal_dn_ds(CapCDSstr, CporCDSstr)[0] / cal_dn_ds(
                CapCDSstr, CporCDSstr)[1]
        CGP.append(val1)
    except:
        # NOTE(review): on error nothing is appended to CGP, while the name
        # lists above already received an entry -- the lists may end up with
        # different lengths.
        print 'Error with: ' + os.path.split(
            sequences[s])[-1].split('_')[0]
else:
    # Alignment too short after gap removal: record a placeholder.
    CGP.append('NA')
#s = s + 1
# Presumably executed once, after the loop: convert the lists to arrays.
Cpor_seqs = np.array(Cpor_seqs)
Cap_seqs = np.array(Cap_seqs)
CGP = np.array(CGP)
# Fragment of a Python 2 script; it starts and ends in the middle of a longer
# body.
# NOTE(review): CapCDS, names, index_Cpor, index_Rat, aln, aln1, sequences, s
# and the result lists are defined outside this excerpt, and the nesting
# below was reconstructed from the statement order -- confirm against the
# original file.

# Record the name field of each species' sequence (second '|'-separated field
# for Capybara and Cavia, the whole name for Rat).
Cap_seqs.append(CapCDS.getSeqNames()[0].split('|')[1])
CporCDS = aln.takeSeqs([names[index_Cpor]])
Cpor_seqs.append(CporCDS.getSeqNames()[0].split('|')[1])
RatCDS = aln.takeSeqs([names[index_Rat]])
Rat_seqs.append(RatCDS.getSeqNames()[0])
# NOTE(review): under Python 2, len(aln) / len(aln1) is integer division
# unless `from __future__ import division` is active -- verify that the 50%
# threshold works as intended.
if ((len(aln) / len(aln1)) > 0.5):  # Only analyse alignments whose length, after gap removal, is more than 50% of the original length
    CapCDSstr = str(CapCDS.todict().values()[0])
    CapCDSstr = CodonSeq(CapCDSstr)
    CporCDSstr = str(CporCDS.todict().values()[0])
    CporCDSstr = CodonSeq(CporCDSstr)
    RatCDSstr = str(RatCDS.todict().values()[0])
    RatCDSstr = CodonSeq(RatCDSstr)
    try:
        # NOTE(review): only one branch of this chain runs.  When the first
        # condition holds, val2 is not assigned in this pass; when the second
        # holds, val1 is not.  The appends below then either fail with a
        # NameError (caught by the bare except) or reuse a value left over
        # from an earlier pass.
        if cal_dn_ds(CapCDSstr, CporCDSstr)[1] == 0.0:
            val1 = 0.0
        elif cal_dn_ds(CporCDSstr, RatCDSstr)[1] == 0.0:
            val2 = 0.0
        else:
            # dN/dS ratio for Capybara vs Cavia and for Cavia vs Rat.
            val1 = cal_dn_ds(CapCDSstr, CporCDSstr)[0] / cal_dn_ds(
                CapCDSstr, CporCDSstr)[1]
            val2 = cal_dn_ds(CporCDSstr, RatCDSstr)[0] / cal_dn_ds(
                CporCDSstr, RatCDSstr)[1]
        CGP.append(val1)
        GPR.append(val2)
    except:
        # NOTE(review): on error nothing is appended to CGP or GPR, while the
        # name lists above already received an entry.
        print 'Error with: ' + os.path.split(
            sequences[s])[-1].split('_')[0]
else:
    # NOTE(review): only CGP receives the placeholder in the visible code;
    # whether GPR is padded too cannot be seen from this excerpt.
    CGP.append('NA')