def test_paralinear_variance(self):
    """Paralinear variance agrees with a by-hand calculation.

    The expected value is rebuilt from the joint base-pair frequency
    matrix of the two sequences and compared, with an absolute tolerance
    of 1e-3, against the value stored by the calculator.
    """
    seq1 = "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"
    seq2 = "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"
    data = [("seq1", seq1), ("seq2", seq2)]
    aln = make_aligned_seqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
    paralinear_calc.run(show_progress=False)

    # Count aligned base pairs: rows follow seq1, columns follow seq2.
    base_index = {base: pos for pos, base in enumerate("ACGT")}
    joint = numpy.zeros((4, 4))
    for base1, base2 in zip(seq1, seq2):
        joint[base_index[base1], base_index[base2]] += 1

    # Empty diagonal cells receive a pseudocount of 0.5 before the counts
    # are normalised to frequencies.
    for k in range(4):
        if joint[k, k] == 0:
            joint[k, k] += 0.5
    joint /= joint.sum()

    joint_inv = numpy.linalg.inv(joint)
    row_freqs, col_freqs = joint.sum(1), joint.sum(0)

    # Accumulate term by term, in a fixed order, so the floating point
    # result is reproducible.
    expected = 0.0
    for i in range(4):
        for j in range(4):
            expected += joint_inv[j, i] ** 2 * joint[i, j]
        expected -= 1 / numpy.sqrt(row_freqs[i] * col_freqs[i])
    expected /= 16 * len(seq1)

    assert_allclose(paralinear_calc.variances[1, 1], expected, atol=1e-3)
def test_paralinear_distance(self):
    """Paralinear distance agrees with a by-hand calculation.

    The expected distance is ``-0.25 * log(det(J) / sqrt(prod(f1) * prod(f2)))``
    where ``J`` is the joint base-pair frequency matrix of the two
    sequences and ``f1``/``f2`` are its row and column sums.
    """
    seq1 = "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"
    seq2 = "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"
    data = [("seq1", seq1), ("seq2", seq2)]
    aln = make_aligned_seqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
    paralinear_calc.run(show_progress=False)

    # Count aligned base pairs: rows follow seq1, columns follow seq2.
    index = {base: pos for pos, base in enumerate("ACGT")}
    J = numpy.zeros((4, 4))
    for base1, base2 in zip(seq1, seq2):
        J[index[base1], index[base2]] += 1

    # Empty diagonal cells receive a pseudocount of 0.5 before the counts
    # are normalised to frequencies.
    for i in range(4):
        if J[i, i] == 0:
            J[i, i] += 0.5
    J /= J.sum()

    # The distance only needs the determinant of J, so no inverse is
    # computed here.
    row_freqs, col_freqs = J.sum(1), J.sum(0)
    dist = -0.25 * numpy.log(
        numpy.linalg.det(J) / numpy.sqrt(row_freqs.prod() * col_freqs.prod())
    )

    assert_allclose(paralinear_calc.dists["seq1", "seq2"], dist)
def test_paralinear_pair_aa(self):
    """Paralinear distances can be produced for amino acid sequences.

    No values are checked; the test passes as long as nothing raises.
    """
    dna_aln = load_aligned_seqs("data/brca1_5.paml", moltype=DNA)
    protein_aln = dna_aln.get_translation()
    calc = ParalinearPair(moltype=PROTEIN, alignment=protein_aln)
    calc.run(show_progress=False)
    # Only the ability to retrieve the distances is exercised.
    calc.get_pairwise_distances()
def test_paralinear_pair_dna(self):
    """Paralinear and LogDet calculators agree on the same DNA alignment.

    Both calculators are run on one alignment and the entries at index
    ``[1, 1]`` of their distances and variances are required to be equal.
    """
    seq1 = "TAATTCATTGGGACGTCGAATCCGGCAGTCCTGCCGCAAAAGCTTCCGGAATCGAATTTTGGCA"
    seq2 = "AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGG"
    aln = make_aligned_seqs(data=[("seq1", seq1), ("seq2", seq2)], moltype=DNA)

    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
    paralinear_calc.run(show_progress=False)
    logdet_calc = LogDetPair(moltype=DNA, alignment=aln)
    logdet_calc.run(show_progress=False)

    # NOTE(review): [1, 1] may address a diagonal (self-vs-self) entry of
    # ``dists``, which would make the first check trivially true. Confirm
    # it really compares the seq1/seq2 pair.
    logdet_dist = logdet_calc.dists[1, 1]
    paralinear_dist = paralinear_calc.dists[1, 1]
    self.assertEqual(logdet_dist, paralinear_dist)

    paralinear_var = paralinear_calc.variances[1, 1]
    logdet_var = logdet_calc.variances[1, 1]
    self.assertEqual(paralinear_var, logdet_var)
def test_paralinear_for_determinant_lte_zero(self):
    """Distance is NaN for this sequence pair, on a repeated run as well.

    Per the test name, the sequences are intended to give a determinant
    <= 0. The assertions require the first reported distance to be NaN,
    both after the initial run and after running the calculator again.
    """
    data = {
        "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
    }
    aln = make_aligned_seqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)

    # Run twice: the second pass checks that re-running the same calculator
    # still reports NaN.
    for _ in range(2):
        paralinear_calc.run(show_progress=False)
        dists = paralinear_calc.get_pairwise_distances().to_dict()
        first_dist = list(dists.values())[0]
        self.assertTrue(numpy.isnan(first_dist))
def get_calc(data):
    """Return a ParalinearPair calculator that has been invoked on ``data``.

    ``data`` is turned into a DNA alignment with ``make_aligned_seqs``; the
    calculator is then called with the progress display switched off
    before being handed back.
    """
    calc = ParalinearPair(
        moltype=DNA,
        alignment=make_aligned_seqs(data=data, moltype=DNA),
    )
    calc(show_progress=False)
    return calc