def test_paralinear_distance(self):
        """calculate paralinear variance consistent with hand calculation"""
        data = [
            ('seq1',
             "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"
             ),
            ('seq2',
             "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"
             )
        ]
        aln = LoadSeqs(data=data, moltype=DNA)
        paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
        paralinear_calc.run(show_progress=False)

        index = dict(zip('ACGT', range(4)))
        J = numpy.zeros((4, 4))
        for p in zip(data[0][1], data[1][1]):
            J[index[p[0]], index[p[1]]] += 1
        for i in range(4):
            if J[i, i] == 0:
                J[i, i] += 0.5
        J /= J.sum()
        M = numpy.linalg.inv(J)
        f = J.sum(1), J.sum(0)
        dist = -0.25 * numpy.log( numpy.linalg.det(J) / \
                numpy.sqrt(f[0].prod() * f[1].prod()) )

        self.assertFloatEqual(paralinear_calc.Dists[1, 1], dist, eps=1e-3)
    def test_paralinear_variance(self):
        """calculate paralinear variance consistent with hand calculation"""
        data = [
            ('seq1',
             "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"
             ),
            ('seq2',
             "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"
             )
        ]
        aln = LoadSeqs(data=data, moltype=DNA)
        paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
        paralinear_calc.run(show_progress=False)

        index = dict(zip('ACGT', range(4)))
        J = numpy.zeros((4, 4))
        for p in zip(data[0][1], data[1][1]):
            J[index[p[0]], index[p[1]]] += 1
        for i in range(4):
            if J[i, i] == 0:
                J[i, i] += 0.5
        J /= J.sum()
        M = numpy.linalg.inv(J)
        f = J.sum(1), J.sum(0)
        var = 0.
        for i in range(4):
            for j in range(4):
                var += M[j, i]**2 * J[i, j]
            var -= 1 / numpy.sqrt(f[0][i] * f[1][i])
        var /= 16 * len(data[0][1])

        self.assertFloatEqual(paralinear_calc.Variances[1, 1], var, eps=1e-3)
    def test_paralinear_variance(self):
        """calculate paralinear variance consistent with hand calculation"""
        data = [('seq1', "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
                ('seq2', "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC")]
        aln = LoadSeqs(data=data, moltype=DNA)
        paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
        paralinear_calc.run(show_progress=False)
       
        index = dict(zip('ACGT', range(4)))
        J = numpy.zeros((4, 4))
        for p in zip(data[0][1], data[1][1]):
            J[index[p[0]], index[p[1]]] += 1
        for i in range(4):
            if J[i, i] == 0:
                J[i, i] += 0.5
        J /= J.sum()
        M = numpy.linalg.inv(J)
        f = J.sum(1), J.sum(0)
        var = 0.
        for i in range(4):
            for j in range(4):
                var += M[j, i]**2 * J[i, j]
            var -= 1 / numpy.sqrt(f[0][i] * f[1][i])
        var /= 16 * len(data[0][1])

        self.assertFloatEqual(paralinear_calc.Variances[1,1], var, eps=1e-3)
 def test_paralinear_pair_aa(self):
     """paralinear shouldn't fail to produce distances for aa seqs"""
     aln = LoadSeqs('data/brca1_5.paml', moltype=DNA)
     aln = aln.getTranslation()
     paralinear_calc = ParalinearPair(moltype=PROTEIN, alignment=aln)
     paralinear_calc.run(show_progress=False)
     dists = paralinear_calc.getPairwiseDistances()
 def test_paralinear_pair_aa(self):
     """paralinear shouldn't fail to produce distances for aa seqs"""
     aln = LoadSeqs('data/brca1_5.paml', moltype=DNA)
     aln = aln.getTranslation()
     paralinear_calc = ParalinearPair(moltype=PROTEIN, alignment=aln)
     paralinear_calc.run(show_progress=False)
     dists = paralinear_calc.getPairwiseDistances()
Example #6
0
def get_paralinear_distances(gene, data_directory=None, third_position=False, **kw):
    filenames = glob.glob(os.path.join(data_directory, gene+'.fasta*'))
    assert len(filenames) == 1, 'Wrong number of alignment files for ' + gene
    filename = filenames[0]
    if filename.endswith('.fasta'):
        with open(filename) as fastafile:
            fastadata = fastafile.read()
    elif filename.endswith('.fasta.gz'):
        with GzipFile(filename) as fastafile:
            fastadata = fastafile.read()
    else:
        raise RuntimeError(gene + ' file could not be read')

    sequences = LoadSeqs(data=fastadata)
    if third_position:
        indices = [(i, i+1) for i in range(len(sequences))[2::3]]
        pos3 = sequences.addFeature('pos3', 'pos3', indices)
        sequences = pos3.getSlice()
    sequences = sequences.filtered(lambda x: set(''.join(x)) <= set(DNA))

    paralinear_calc = ParalinearPair(moltype=DNA, alignment=sequences)
    paralinear_calc.run(show_progress=False)
    dists = paralinear_calc.getPairwiseDistances()

    return {frozenset(k):v for k, v in dists.items()}
    def test_paralinear_pair_dna(self):
        """calculate paralinear distance consistent with logdet distance"""
        data = [('seq1', 'TAATTCATTGGGACGTCGAATCCGGCAGTCCTGCCGCAAAAGCTTCCGGAATCGAATTTTGGCA'),
                ('seq2', 'AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGG')]
        aln = LoadSeqs(data=data, moltype=DNA)
        paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
        paralinear_calc.run(show_progress=False)
        logdet_calc = LogDetPair(moltype=DNA, alignment=aln)
        logdet_calc.run(show_progress=False)

        self.assertFloatEqual(logdet_calc.Dists[1,1],
                paralinear_calc.Dists[1,1], eps=1e-3)
        self.assertFloatEqual(paralinear_calc.Variances[1,1], 
                logdet_calc.Variances[1,1], eps=1e-3)
    def test_paralinear_pair_dna(self):
        """calculate paralinear distance consistent with logdet distance"""
        data = [
            ('seq1',
             'TAATTCATTGGGACGTCGAATCCGGCAGTCCTGCCGCAAAAGCTTCCGGAATCGAATTTTGGCA'
             ),
            ('seq2',
             'AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGG'
             )
        ]
        aln = LoadSeqs(data=data, moltype=DNA)
        paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
        paralinear_calc.run(show_progress=False)
        logdet_calc = LogDetPair(moltype=DNA, alignment=aln)
        logdet_calc.run(show_progress=False)

        self.assertFloatEqual(logdet_calc.Dists[1, 1],
                              paralinear_calc.Dists[1, 1],
                              eps=1e-3)
        self.assertFloatEqual(paralinear_calc.Variances[1, 1],
                              logdet_calc.Variances[1, 1],
                              eps=1e-3)
    def test_paralinear_distance(self):
        """calculate paralinear variance consistent with hand calculation"""
        data = [('seq1', "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
                ('seq2', "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC")]
        aln = LoadSeqs(data=data, moltype=DNA)
        paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
        paralinear_calc.run(show_progress=False)
       
        index = dict(zip('ACGT', range(4)))
        J = numpy.zeros((4, 4))
        for p in zip(data[0][1], data[1][1]):
            J[index[p[0]], index[p[1]]] += 1
        for i in range(4):
            if J[i, i] == 0:
                J[i, i] += 0.5
        J /= J.sum()
        M = numpy.linalg.inv(J)
        f = J.sum(1), J.sum(0)
        dist = -0.25 * numpy.log( numpy.linalg.det(J) / \
                numpy.sqrt(f[0].prod() * f[1].prod()) )

        self.assertFloatEqual(paralinear_calc.Dists[1,1], dist, eps=1e-3)
    def test_paralinear_for_determinant_lte_zero(self):
        """returns distance of None if the determinant is <= 0"""
        data = dict(
            seq1=
            "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
            seq2=
            "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC")
        aln = LoadSeqs(data=data, moltype=DNA)

        paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
        paralinear_calc.run(show_progress=False)
        dists = paralinear_calc.getPairwiseDistances()
        self.assertTrue(dists.values()[0] is None)
        paralinear_calc.run(show_progress=False)
        dists = paralinear_calc.getPairwiseDistances()
        self.assertTrue(dists.values()[0] is None)
 def test_paralinear_for_determinant_lte_zero(self):
     """returns distance of None if the determinant is <= 0"""
     data = dict(seq1="AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
                 seq2="TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC")
     aln = LoadSeqs(data=data, moltype=DNA)
     
     paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
     paralinear_calc.run(show_progress=False)
     dists = paralinear_calc.getPairwiseDistances()
     self.assertTrue(dists.values()[0] is None)
     paralinear_calc.run(show_progress=False)
     dists = paralinear_calc.getPairwiseDistances()
     self.assertTrue(dists.values()[0] is None)