Example #1
0
 def test_build(self):
     """Smoke test: codonalign.build must run on each fixture without raising."""
     # Each scenario: (protein alignment, nucleotide records, extra options).
     scenarios = [
         (self.aln1, self.seqlist1, {}),
         (self.aln2, self.seqlist2, {}),
         (self.aln3, self.seqlist3, {"codon_table": self.codontable3}),
         (self.aln1, self.seqlist1, {"complete_protein": True}),
     ]
     # Only successful completion is checked; the results are discarded.
     for protein_aln, nucleotides, options in scenarios:
         codonalign.build(protein_aln, nucleotides, **options)
Example #2
0
 def setUp(self):
     """Build a codon alignment for every fixture and keep them in self.alns.

     Each fixture is read as ``entry[0]`` (a sequence of file paths) and
     ``entry[1]`` (the loading mode: 'parse', 'index' or 'id').
     """
     self.aln_file = [
         TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6,
     ]
     alns = []
     for entry in self.aln_file:
         mode = entry[1]
         if mode == 'parse':
             # Nucleotide records come from SeqIO.parse.
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet)
         elif mode == 'index':
             # Nucleotide records come from SeqIO.index.
             paths = entry[0]
             nucl = SeqIO.index(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet,
                     max_score=20)
         elif mode == 'id':
             # A third file supplies the id mapping passed as corr_dict.
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             id_map = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     # First whitespace-separated field maps to the second.
                     fields = line.split()
                     id_map[fields[0]] = fields[1]
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl, corr_dict=id_map,
                     alphabet=codonalign.default_codon_alphabet)
         alns.append(caln)
         # Executed for every mode, not only for the indexed FASTA file.
         nucl.close()
     self.alns = alns
Example #3
0
 def setUp(self):
     """Build a codon alignment for every fixture and keep them in self.alns.

     Each fixture is read as ``entry[0]`` (a sequence of file paths) and
     ``entry[1]`` (the loading mode: "parse", "index" or "id").
     """
     self.aln_file = [
         TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6,
     ]
     alns = []
     for entry in self.aln_file:
         mode = entry[1]
         if mode == "parse":
             # Nucleotide records come from SeqIO.parse.
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], "fasta",
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet)
         elif mode == "index":
             # Deliberately using a fancy protein alphabet for testing:
             paths = entry[0]
             nucl = SeqIO.index(paths[0], "fasta",
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=generic_protein)
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet,
                     max_score=20)
             # Close the indexed FASTA file
             nucl.close()
         elif mode == "id":
             # A third file supplies the id mapping passed as corr_dict.
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], "fasta",
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
             id_map = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     # First whitespace-separated field maps to the second.
                     fields = line.split()
                     id_map[fields[0]] = fields[1]
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(
                     prot, nucl, corr_dict=id_map,
                     alphabet=codonalign.default_codon_alphabet)
         alns.append(caln)
     self.alns = alns
Example #4
0
 def setUp(self):
     """Build a codon alignment for every fixture and keep them in self.alns.

     Each fixture is read as ``entry[0]`` (a sequence of file paths) and
     ``entry[1]`` (the loading mode: 'parse', 'index' or 'id').
     """
     self.aln_file = [
         TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6,
     ]
     alns = []
     for entry in self.aln_file:
         mode = entry[1]
         if mode == 'parse':
             # Nucleotide records come from SeqIO.parse.
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet)
         elif mode == 'index':
             # Deliberately using a fancy protein alphabet for testing:
             paths = entry[0]
             nucl = SeqIO.index(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal',
                                 alphabet=Gapped(IUPAC.ExtendedIUPACProtein()))
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet,
                     max_score=20)
         elif mode == 'id':
             # A third file supplies the id mapping passed as corr_dict.
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             id_map = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     # First whitespace-separated field maps to the second.
                     fields = line.split()
                     id_map[fields[0]] = fields[1]
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl, corr_dict=id_map,
                     alphabet=codonalign.default_codon_alphabet)
         alns.append(caln)
         # Executed for every mode, not only for the indexed FASTA file.
         nucl.close()
     self.alns = alns
Example #5
0
 def setUp(self):
     """Build a codon alignment for every fixture and keep them in self.alns.

     Each fixture is read as ``entry[0]`` (a sequence of file paths) and
     ``entry[1]`` (the loading mode: "parse", "index" or "id").
     """
     self.aln_file = [
         TEST_ALIGN_FILE1,
         TEST_ALIGN_FILE2,
         TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4,
         TEST_ALIGN_FILE5,
         TEST_ALIGN_FILE6,
     ]
     alns = []
     for entry in self.aln_file:
         mode = entry[1]
         if mode == "parse":
             # Nucleotide records come from SeqIO.parse.
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], "fasta")
             prot = AlignIO.read(paths[1], "clustal")
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(prot, nucl)
         elif mode == "index":
             # Nucleotide records come from SeqIO.index.
             paths = entry[0]
             nucl = SeqIO.index(paths[0], "fasta")
             prot = AlignIO.read(paths[1], "clustal")
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(prot, nucl, max_score=20)
             # Close the indexed FASTA file
             nucl.close()
         elif mode == "id":
             # A third file supplies the id mapping passed as corr_dict.
             paths = entry[0]
             nucl = SeqIO.parse(paths[0], "fasta")
             prot = AlignIO.read(paths[1], "clustal")
             id_map = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     # First whitespace-separated field maps to the second.
                     fields = line.split()
                     id_map[fields[0]] = fields[1]
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(prot, nucl, corr_dict=id_map)
         alns.append(caln)
     self.alns = alns
Example #6
0
 def setUp(self):
     """Build the codon alignment for TEST_ALIGN_FILE6 and store it as self.aln."""
     # paths[0]: nucleotide FASTA, paths[1]: protein clustal alignment,
     # paths[2]: two-column id correspondence file.
     paths = TEST_ALIGN_FILE6[0]
     nucl = SeqIO.parse(paths[0], 'fasta',
                        alphabet=IUPAC.IUPACUnambiguousDNA())
     prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
     id_corr = {}
     with open(paths[2]) as handle:
         for line in handle:
             # First whitespace-separated field maps to the second.
             fields = line.split()
             id_corr[fields[0]] = fields[1]
     self.aln = codonalign.build(
         prot, nucl, corr_dict=id_corr,
         alphabet=codonalign.default_codon_alphabet)
Example #7
0
 def test_mk(self):
     """Check that codonalign.mktest gives 0.0021 (4 places) on TEST_ALIGN_FILE7."""
     p = SeqIO.index(TEST_ALIGN_FILE7[0][0], "fasta")
     try:
         pro_aln = AlignIO.read(TEST_ALIGN_FILE7[0][1], "clustal")
         codon_aln = codonalign.build(pro_aln, p)
     finally:
         # Close the indexed FASTA file even if reading or building fails,
         # so a failing test does not leak the open handle.
         p.close()
     # The codon alignment rows are split into three groups for the test.
     groups = [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]
     self.assertAlmostEqual(codonalign.mktest(groups), 0.0021, places=4)
Example #8
0
 def setUp(self):
     """Build the codon alignment for TEST_ALIGN_FILE6 and store it as self.aln."""
     # paths[0]: nucleotide FASTA, paths[1]: protein clustal alignment,
     # paths[2]: two-column id correspondence file.
     paths = TEST_ALIGN_FILE6[0]
     nucl = SeqIO.parse(paths[0], 'fasta',
                        alphabet=IUPAC.IUPACUnambiguousDNA())
     prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
     id_corr = {}
     with open(paths[2]) as handle:
         for line in handle:
             # First whitespace-separated field maps to the second.
             fields = line.split()
             id_corr[fields[0]] = fields[1]
     # Silence BiopythonWarning raised while building the alignment.
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', BiopythonWarning)
         self.aln = codonalign.build(
             prot, nucl, corr_dict=id_corr,
             alphabet=codonalign.default_codon_alphabet)
Example #9
0
 def test_ValueError(self):
     """Check that ValueError is thrown for Alignments of different lengths"""
     # The fixture alignments hold 2 records (per the original note);
     # the alignment built here holds 3.
     third_protein = SeqRecord(Seq('M--', alphabet=generic_protein), id='pro3')
     aln = MultipleSeqAlignment([self.pro1, self.pro2, third_protein])
     third_dna = SeqRecord(Seq('ATG', alphabet=generic_dna), id='pro3')
     triple_codon = codonalign.build(aln, [self.seq1, self.seq2, third_dna])
     # Adding either 2-record alignment to the 3-record one must fail.
     for two_record_aln in (self.multi_aln, self.codon_aln):
         with self.assertRaises(ValueError):
             triple_codon + two_record_aln
Example #10
0
 def setUp(self):
     """Build the codon alignment for TEST_ALIGN_FILE6 and store it as self.aln."""
     # paths[0]: nucleotide FASTA, paths[1]: protein clustal alignment,
     # paths[2]: two-column id correspondence file.
     paths = TEST_ALIGN_FILE6[0]
     nucl = SeqIO.parse(paths[0], "fasta")
     prot = AlignIO.read(paths[1], "clustal")
     id_corr = {}
     with open(paths[2]) as handle:
         for line in handle:
             # First whitespace-separated field maps to the second.
             fields = line.split()
             id_corr[fields[0]] = fields[1]
     # Silence BiopythonWarning raised while building the alignment.
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", BiopythonWarning)
         self.aln = codonalign.build(prot, nucl, corr_dict=id_corr)
Example #11
0
 def test_ValueError(self):
     """Check that ValueError is thrown for Alignments of different lengths."""
     # The fixture alignments hold 2 records (per the original note);
     # the alignment built here holds 3.
     third_protein = SeqRecord(Seq("M--", alphabet=generic_protein), id="pro3")
     aln = MultipleSeqAlignment([self.pro1, self.pro2, third_protein])
     third_dna = SeqRecord(Seq("ATG", alphabet=generic_dna), id="pro3")
     triple_codon = codonalign.build(aln, [self.seq1, self.seq2, third_dna])
     # Adding either 2-record alignment to the 3-record one must fail.
     for two_record_aln in (self.multi_aln, self.codon_aln):
         with self.assertRaises(ValueError):
             triple_codon + two_record_aln
Example #12
0
    def setUp(self):
        """Create the DNA/protein records and the alignments used by the tests."""
        def dna_record(letters, name):
            return SeqRecord(Seq(letters, alphabet=generic_dna), id=name)

        def protein_record(letters, name):
            return SeqRecord(Seq(letters, alphabet=generic_protein), id=name)

        # Unaligned DNA and the matching (gapped) protein rows.
        self.seq1 = dna_record('ATGTCTCGT', 'pro1')
        self.seq2 = dna_record('ATGCGT', 'pro2')
        self.pro1 = protein_record('MSR', 'pro1')
        self.pro2 = protein_record('M-R', 'pro2')
        self.aln = MultipleSeqAlignment([self.pro1, self.pro2])
        self.codon_aln = codonalign.build(self.aln, [self.seq1, self.seq2])

        # Plain DNA alignment with one 'AAA' row per record id.
        self.multi_aln = MultipleSeqAlignment(
            [dna_record('AAA', name) for name in ('pro1', 'pro2')])
Example #13
0
    def setUp(self):
        """Create the DNA/protein records and the alignments used by the tests."""
        def record(letters, name):
            return SeqRecord(Seq(letters), id=name)

        # Unaligned DNA and the matching (gapped) protein rows.
        self.seq1 = record("ATGTCTCGT", "pro1")
        self.seq2 = record("ATGCGT", "pro2")
        self.pro1 = record("MSR", "pro1")
        self.pro2 = record("M-R", "pro2")
        self.aln = MultipleSeqAlignment([self.pro1, self.pro2])
        self.codon_aln = codonalign.build(self.aln, [self.seq1, self.seq2])

        # Plain DNA alignment with one "AAA" row per record id.
        self.multi_aln = MultipleSeqAlignment(
            [record("AAA", name) for name in ("pro1", "pro2")])
Example #14
0
    def setUp(self):
        """Create the DNA/protein records and the alignments used by the tests."""
        def dna_record(letters, name):
            return SeqRecord(Seq(letters, alphabet=generic_dna), id=name)

        def protein_record(letters, name):
            return SeqRecord(Seq(letters, alphabet=generic_protein), id=name)

        # Unaligned DNA and the matching (gapped) protein rows.
        self.seq1 = dna_record('ATGTCTCGT', 'pro1')
        self.seq2 = dna_record('ATGCGT', 'pro2')
        self.pro1 = protein_record('MSR', 'pro1')
        self.pro2 = protein_record('M-R', 'pro2')
        self.aln = MultipleSeqAlignment([self.pro1, self.pro2])
        self.codon_aln = codonalign.build(self.aln, [self.seq1, self.seq2])

        # Plain DNA alignment with one 'AAA' row per record id.
        self.multi_aln = MultipleSeqAlignment(
            [dna_record('AAA', name) for name in ('pro1', 'pro2')])
Example #15
0
 def test_mk(self):
     """Check that codonalign.mktest gives 0.0021 (4 places) on TEST_ALIGN_FILE7."""
     p = SeqIO.index(TEST_ALIGN_FILE7[0][0],
                     'fasta',
                     alphabet=IUPAC.IUPACUnambiguousDNA())
     try:
         pro_aln = AlignIO.read(TEST_ALIGN_FILE7[0][1],
                                'clustal',
                                alphabet=IUPAC.protein)
         codon_aln = codonalign.build(pro_aln, p)
     finally:
         # Close the indexed FASTA file even if reading or building fails,
         # so a failing test does not leak the open handle.
         p.close()
     # The codon alignment rows are split into three groups for the test.
     groups = [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]
     self.assertAlmostEqual(codonalign.mktest(groups), 0.0021, places=4)
Example #16
0
 def test_build(self):
     """Smoke test: codonalign.build must run on each fixture without raising."""
     # Each scenario: (protein alignment, nucleotide records, extra options).
     scenarios = [
         (self.aln1, self.seqlist1, {}),
         (self.aln2, self.seqlist2, {}),
         (self.aln3, self.seqlist3, {"codon_table": self.codontable3}),
     ]
     # Only successful completion is checked; the results are discarded.
     for protein_aln, nucleotides, options in scenarios:
         codonalign.build(protein_aln, nucleotides, **options)
Example #17
0
 def test_build(self):
     """Smoke test: codonalign.build must run on each fixture without raising."""
     # Each scenario: (protein alignment, nucleotide records, extra options).
     scenarios = [
         (self.aln1, self.seqlist1, {}),
         (self.aln2, self.seqlist2, {}),
         (self.aln3, self.seqlist3, {"codon_table": self.codontable3}),
     ]
     # Only successful completion is checked; the results are discarded.
     for protein_aln, nucleotides, options in scenarios:
         codonalign.build(protein_aln, nucleotides, **options)
Example #18
0
 def test_mk(self):
     """Check that codonalign.mktest gives 0.0021 (4 places) on TEST_ALIGN_FILE7."""
     p = SeqIO.index(TEST_ALIGN_FILE7[0][0], 'fasta',
                     alphabet=IUPAC.IUPACUnambiguousDNA())
     try:
         pro_aln = AlignIO.read(TEST_ALIGN_FILE7[0][1], 'clustal',
                                alphabet=IUPAC.protein)
         codon_aln = codonalign.build(pro_aln, p)
     finally:
         # Close the indexed FASTA file even if reading or building fails,
         # so a failing test does not leak the open handle.
         p.close()
     # The codon alignment rows are split into three groups for the test.
     groups = [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]
     result = codonalign.mktest(groups)
     # The value is rounded before the comparison, as in the original check.
     self.assertAlmostEqual(round(result, 4), 0.0021, places=4)
Example #19
0
	nucl = SeqIO.parse(file,"fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
	return nucl

# Top-level script section written for Python 2 (print statement syntax).
# Disabled alternative header:
#print "Gene\tdN \tdS\tdN/DS"
# NOTE(review): i and n are not used in the visible part of the script.
i = 0
n = 1000
# Emit the tab-separated column header on stdout ...
print "gene\txylogr\ttrape\tagyri\tgraphi\tlambie\tcladon\txantho"
# ... and append the same header line to output.txt.
outfile = open("output.txt", "a")
outfile.write("gene\txylogr\ttrape\tagyri\tgraphi\tlambie\tcladon\txantho\n")
outfile.close()

# Process the protein and DNA input files pairwise, in list order.
for prot_file, dna_file in zip(protein_file_names, dna_file_names):
	# Re-open output.txt in append mode for this pair.
	# NOTE(review): no write/close on this handle appears in the visible lines - confirm it happens further down.
	outfile = open("output.txt", "a")
	prot = align_next_file(prot_file)
	dna = import_next_file(dna_file)
	# Build the codon alignment from the protein alignment and the DNA records.
	codon = codonalign.build(prot, dna, alphabet=codonalign.default_codon_alphabet)
	# Disabled pairwise alternative:
	#dN, dS = cal_dn_ds(codon[0], codon[1], method='NG86')
	# dN and dS matrices for the whole alignment, computed with method "ML".
	dn_matrix, ds_matrix = codon.get_dn_ds_matrix(method="ML")
	length = len(sorted(dn_matrix.matrix,key=len,reverse=True)[0]) # length of the longest row; per the original author it only depends on the number of species
	# Pad every row with 0.0 up to `length` so the rows form a rectangular array.
	dn_mat = np.array([xi+[0.0]*(length-len(xi)) for xi in dn_matrix.matrix])
	ds_mat = np.array([xi+[0.0]*(length-len(xi)) for xi in ds_matrix.matrix])
	# Column sums (axis=0) and row sums (axis=1) of the padded dN matrix, each divided by `length`.
	sum_dn_col = list(np.sum(dn_mat, axis=0))
	sum_dn_row = list(np.sum(dn_mat, axis=1))
	sum_dn_col = [x/length for x in sum_dn_col]
	sum_dn_row = [x/length for x in sum_dn_row]
	# Same computation for the dS matrix.
	sum_ds_col = list(np.sum(ds_mat, axis=0))
	sum_ds_row = list(np.sum(ds_mat, axis=1))
	sum_ds_col = [x/length for x in sum_ds_col]
	sum_ds_row = [x/length for x in sum_ds_row]
	# Per-index value: scaled column sum plus scaled row sum.
	marg_dn = [x+y for x,y in zip(sum_dn_col,sum_dn_row)]
	marg_ds = [x+y for x,y in zip(sum_ds_col,sum_ds_row)]
Example #20
0
 def test_build(self):
     """Smoke test: codonalign.build must run on both fixtures without raising."""
     fixtures = (
         (self.aln1, self.seqlist1),
         (self.aln2, self.seqlist2),
     )
     # Only successful completion is checked; the results are discarded.
     for protein_aln, nucleotides in fixtures:
         codonalign.build(protein_aln, nucleotides)
Example #21
0

# Top-level script section written for Python 2 (print statement syntax).
# Disabled alternative header:
#print "Gene\tdN \tdS\tdN/DS"
# NOTE(review): i and n are not used in the visible part of the script.
i = 0
n = 1000
# Emit the tab-separated column header on stdout ...
print "gene\txylogr\ttrape\tagyri\tgraphi\tlambie\tcladon\txantho"
# ... and append the same header line to output.txt.
outfile = open("output.txt", "a")
outfile.write("gene\txylogr\ttrape\tagyri\tgraphi\tlambie\tcladon\txantho\n")
outfile.close()

# Process the protein and DNA input files pairwise, in list order.
for prot_file, dna_file in zip(protein_file_names, dna_file_names):
    # Re-open output.txt in append mode for this pair.
    # NOTE(review): no write/close on this handle appears in the visible
    # lines - confirm it happens further down.
    outfile = open("output.txt", "a")
    prot = align_next_file(prot_file)
    dna = import_next_file(dna_file)
    # Build the codon alignment from the protein alignment and DNA records.
    codon = codonalign.build(prot,
                             dna,
                             alphabet=codonalign.default_codon_alphabet)
    # Disabled pairwise alternative:
    #dN, dS = cal_dn_ds(codon[0], codon[1], method='NG86')
    # dN and dS matrices for the whole alignment, computed with method "ML".
    dn_matrix, ds_matrix = codon.get_dn_ds_matrix(method="ML")
    length = len(
        sorted(dn_matrix.matrix, key=len, reverse=True)[0]
    )  # length of the longest row; per the original author it only depends on the number of species
    # Pad every row with 0.0 up to `length` so the rows form a rectangular
    # array.
    dn_mat = np.array(
        [xi + [0.0] * (length - len(xi)) for xi in dn_matrix.matrix])
    ds_mat = np.array(
        [xi + [0.0] * (length - len(xi)) for xi in ds_matrix.matrix])
    # Column sums (axis=0) and row sums (axis=1) of the padded dN matrix,
    # each divided by `length`.
    sum_dn_col = list(np.sum(dn_mat, axis=0))
    sum_dn_row = list(np.sum(dn_mat, axis=1))
    sum_dn_col = [x / length for x in sum_dn_col]
    sum_dn_row = [x / length for x in sum_dn_row]
    # Column sums of the padded dS matrix.
    sum_ds_col = list(np.sum(ds_mat, axis=0))
def _strip_gap_columns(row_a, row_b):
    """Return both rows as strings, keeping only columns where neither is "-"."""
    kept_a = []
    kept_b = []
    for char_a, char_b in zip(row_a, row_b):
        if char_a == "-" or char_b == "-":
            continue
        kept_a.append(char_a)
        kept_b.append(char_b)
    return "".join(kept_a), "".join(kept_b)


def _count_mismatches(left, right):
    """Count positions (up to the shorter input) where the characters differ."""
    return sum(1 for a, b in zip(left, right) if a != b)


def _third_positions(seq):
    """Return the alphabetic characters found at indices 2, 5, 8, ... of seq."""
    return "".join(char for char in seq[2::3] if char.isalpha())


def _report_pair(dna1, dna2, prot1, prot2, method, muscle_exe):
    """Align one protein pair, build its codon alignment and print one row.

    dna1/dna2 and prot1/prot2 are the DNA and protein records of the two
    species; method is forwarded to cal_dn_ds and muscle_exe is the MUSCLE
    executable. Any failure propagates to the caller.
    """
    # Write the two unaligned proteins to a FASTA file for MUSCLE.
    protein1 = SeqRecord(Seq(str(prot1.seq), alphabet=IUPAC.protein),
                         id='protein1')
    protein2 = SeqRecord(Seq(str(prot2.seq), alphabet=IUPAC.protein),
                         id='protein2')
    with open("outfile_unaligned.fa", "w", encoding='utf-8') as output_handle:
        SeqIO.write(protein1, output_handle, "fasta")
        SeqIO.write(protein2, output_handle, "fasta")

    # MUSCLE reads the unaligned file and writes the aligned one.
    muscle_cline = MuscleCommandline(muscle_exe,
                                     input="outfile_unaligned.fa",
                                     out="outfile_aligned.aln")
    muscle_cline()

    # Look the aligned proteins up by id instead of by position: MUSCLE may
    # not keep the input order, and codonalign.build is given the proteins
    # and the DNA records in matching order.
    aligned = {record.id: record
               for record in AlignIO.read("outfile_aligned.aln", "fasta")}
    aligned1 = SeqRecord(Seq(str(aligned['protein1'].seq),
                             alphabet=IUPAC.protein), id='pro1')
    aligned2 = SeqRecord(Seq(str(aligned['protein2'].seq),
                             alphabet=IUPAC.protein), id='pro2')

    nuc1 = SeqRecord(Seq(str(dna1.seq), alphabet=IUPAC.IUPACUnambiguousDNA()),
                     id='nuc1')
    nuc2 = SeqRecord(Seq(str(dna2.seq), alphabet=IUPAC.IUPACUnambiguousDNA()),
                     id='nuc2')

    codon_aln = codonalign.build(MultipleSeqAlignment([aligned1, aligned2]),
                                 [nuc1, nuc2])

    lengthseq1 = len(nuc1.seq)
    lengthseq2 = len(nuc2.seq)
    GCcontentseq1 = GC(nuc1.seq)
    GCcontentseq2 = GC(nuc2.seq)
    GC_mean = (GCcontentseq1 + GCcontentseq2) / 2
    # Printed under the "Mean_length" header, but it is the shorter length.
    Min_length = min(lengthseq1, lengthseq2)

    # Raw distance: share of differing sites once gapped columns are removed.
    seq1, seq2 = _strip_gap_columns(codon_aln[0], codon_aln[1])
    distance_brute = round(float(_count_mismatches(seq1, seq2) / len(seq1)), 3)

    # Same measure restricted to every third position.
    seq1_third_pos = _third_positions(seq1)
    seq2_third_pos = _third_positions(seq2)
    distance_third_pos = round(
        float(_count_mismatches(seq1_third_pos, seq2_third_pos) /
              len(seq2_third_pos)), 3)

    # dN and dS with the requested method; 9.999 is the placeholder used when
    # they cannot be computed (original author's note: saturation too high).
    try:
        dN, dS = cal_dn_ds(codon_aln[0], codon_aln[1], method=method)
    except (ValueError, ZeroDivisionError, KeyError):
        dN = dS = 9.999

    print(dna1.id, ";", dN, ";", dS, ";", distance_third_pos,
          ";", distance_brute, ";", lengthseq1, ";", lengthseq2,
          ";", GCcontentseq1, ";", GCcontentseq2, ";", GC_mean,
          ";", Min_length)


def divergence():
    """Print divergence statistics for each pair of sequences of two species.

    Command-line arguments (sys.argv):
        1, 2: DNA FASTA files of species 1 and 2
        3, 4: protein FASTA files of species 1 and 2
        5:    output file path (opened for writing, see note below)
        6:    method forwarded to cal_dn_ds
        7:    path to the MUSCLE executable

    The records are paired by position. For each pair one ";"-separated row
    is printed to stdout; a pair that fails is reported with its traceback
    and the loop continues with the next pair.
    """
    fic1dna = sys.argv[1]
    fic2dna = sys.argv[2]
    fic1prot = sys.argv[3]
    fic2prot = sys.argv[4]
    outfile_path = sys.argv[5]
    method = sys.argv[6]
    muscle_exe = sys.argv[7]

    # NOTE(review): the output file is created/truncated but nothing is
    # written to it; every row goes to stdout. Confirm this is intended.
    with open(outfile_path, "w", encoding='utf-8'):
        seq1dna = list(
            SeqIO.parse(fic1dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq2dna = list(
            SeqIO.parse(fic2dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq1prot = list(SeqIO.parse(fic1prot, "fasta", alphabet=IUPAC.protein))
        seq2prot = list(SeqIO.parse(fic2prot, "fasta", alphabet=IUPAC.protein))

        print("Nombre de paires de sequences a analyser: ", len(seq1dna))

        # Header row of the table.
        print("seq.id", ";", "dN", ";", "dS", ";", "Dist_third_pos", ";",
              "Dist_brute", ";", "Length_seq_1", ";", "Length_seq2", ";",
              "GC_content_seq1", ";", "GC_content_seq2", ";", "GC", ";",
              "Mean_length")

        # Indexing (rather than zip) keeps the original behaviour when the
        # four lists differ in length: the lookup fails inside the try block.
        for u in range(len(seq1dna)):
            try:
                _report_pair(seq1dna[u], seq2dna[u], seq1prot[u], seq2prot[u],
                             method, muscle_exe)
            except Exception:
                # Exception (not a bare except) so that Ctrl-C and SystemExit
                # stop the run instead of being reported as a failed pair.
                traceback.print_exc()
                print("Une erreur est survenue pour la sequence: ",
                      seq1dna[u].id, "vs", seq2dna[u].id)
Example #23
0
 def test_build(self):
     """Smoke test: codonalign.build must run on both fixtures without raising."""
     fixtures = (
         (self.aln1, self.seqlist1),
         (self.aln2, self.seqlist2),
     )
     # Only successful completion is checked; the results are discarded.
     for protein_aln, nucleotides in fixtures:
         codonalign.build(protein_aln, nucleotides)
Example #24
0
def proteinToCodonAlignment(proteinAlignment, extraDnaSeqs=None):
    """Build the codon alignment that corresponds to a protein alignment.

    The DNA for every protein id is requested from patric_api. When
    extraDnaSeqs is given, its entries replace (or supply) the DNA record of
    any protein id it contains.

    Returns the result of codonalign.build; non-empty annotations of each
    protein record are copied onto the codon record carrying the same id.

    Raises Exception when the protein ids and the DNA ids are not the same set.
    """
    proteinById = {record.id: record for record in proteinAlignment}
    dnaFasta = patric_api.getSequenceOfFeatures(proteinById.keys(), 'dna')

    fastaRecords = SeqIO.parse(StringIO(dnaFasta),
                               "fasta",
                               alphabet=IUPAC.IUPACAmbiguousDNA())
    dnaById = SeqIO.to_dict(fastaRecords)

    # Caller-supplied DNA takes precedence over what was fetched.
    if extraDnaSeqs:
        for seqId in proteinById:
            if seqId in extraDnaSeqs:
                dnaById[seqId] = extraDnaSeqs[seqId]
                if Debug:
                    LOG.write("appending extra DNA seq %s\n" % seqId)

    # Every protein needs exactly one DNA record, and vice versa.
    if set(dnaById) != set(proteinById):
        proteinNames = ", ".join(sorted(proteinById))
        dnaNames = ", ".join(sorted(dnaById))
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (proteinNames, dnaNames))

    # Empty DNA sequences are only reported, not removed.
    for seqId, dnaRecord in dnaById.items():
        if len(dnaRecord.seq) == 0:
            LOG.write("warning: seqId %s length of dna was zero\n" % seqId)

    # DNA records in the same order as the rows of the protein alignment.
    dnaSeqRecords = [dnaById[record.id] for record in proteinAlignment]

    if Debug:
        LOG.write("dna seqs has %d seqs\n" % (len(dnaSeqRecords)))

    # NOTE: an earlier, disabled step padded DNA that was too short for its
    # protein with "N", and a variant passed an ambiguous codon table to
    # codonalign.build; neither is active here.
    codonAlignment = codonalign.build(pro_align=proteinAlignment,
                                      nucl_seqs=dnaSeqRecords,
                                      max_score=1000)
    for codonRecord in codonAlignment:
        annotations = proteinById[codonRecord.id].annotations
        if annotations:
            codonRecord.annotations = annotations.copy()
    return codonAlignment