Esempio n. 1
0
 def test_logdet_pair_aa(self):
     """LogDet distances can be computed for a translated (amino acid) alignment.

     Smoke test: it passes as long as no step raises. Nothing is asserted
     about the distance values themselves.
     """
     dna_aln = load_aligned_seqs("data/brca1_5.paml", moltype=DNA)
     protein_aln = dna_aln.get_translation()
     calc = LogDetPair(moltype=PROTEIN, alignment=protein_aln)
     calc.run(use_tk_adjustment=True, show_progress=False)
     # the returned distances are not inspected, only successful completion matters
     calc.get_pairwise_distances()
Esempio n. 2
0
    def test_logdet_variance(self):
        """LogDet variance agrees with a value computed by hand from the pair counts."""
        data = [
            ("seq1", "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
            ("seq2", "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"),
        ]
        aln = make_aligned_seqs(data=data, moltype=DNA)
        calc = LogDetPair(moltype=DNA, alignment=aln)

        # after a run with the tk adjustment, variances[1, 1] must equal None
        calc.run(use_tk_adjustment=True, show_progress=False)
        self.assertEqual(calc.variances[1, 1], None)

        # hand calculation: tabulate the aligned base pairs into a 4x4 matrix,
        # rows indexed by the seq1 base and columns by the seq2 base
        seq1, seq2 = data[0][1], data[1][1]
        base_to_pos = {base: pos for pos, base in enumerate("ACGT")}
        counts = numpy.zeros((4, 4))
        for base1, base2 in zip(seq1, seq2):
            counts[base_to_pos[base1], base_to_pos[base2]] += 1
        # empty diagonal cells get a pseudo-count of 0.5
        # (presumably to keep the matrix invertible -- confirm against LogDetPair)
        for k in range(4):
            if counts[k, k] == 0:
                counts[k, k] += 0.5
        # convert counts to proportions before inverting
        counts /= counts.sum()
        inverse = numpy.linalg.inv(counts)
        expected = 0.0
        # ndindex walks (row, col) in row-major order, same as nested range loops
        for row, col in numpy.ndindex(4, 4):
            expected += inverse[col, row] ** 2 * counts[row, col] - 1
        expected /= 16 * len(seq1)

        # without the adjustment the calculator's variance should match
        calc.run(use_tk_adjustment=False, show_progress=False)
        calc.get_pairwise_distances()
        assert_allclose(calc.variances[1, 1], expected, atol=1e-3)
Esempio n. 3
0
 def test_logdet_pair_dna(self):
     """LogDet DNA distances agree with the reference values (from MEGA)."""
     aln = load_aligned_seqs("data/brca1_5.paml", moltype=DNA)
     calc = LogDetPair(moltype=DNA, alignment=aln)
     calc.run(use_tk_adjustment=True, show_progress=False)
     observed = calc.get_pairwise_distances().to_dict()

     # one reference distance per unordered pair of sequence names
     reference = [
         ("Human", "NineBande", 0.075336929999999996),
         ("NineBande", "DogFaced", 0.0898575452),
         ("DogFaced", "Human", 0.1061747919),
         ("HowlerMon", "DogFaced", 0.0934480008),
         ("Mouse", "HowlerMon", 0.26422862920000001),
         ("HowlerMon", "NineBande", 0.062202897899999998),
         ("Mouse", "Human", 0.26539976700000001),
         ("HowlerMon", "Human", 0.036571181899999999),
         ("DogFaced", "Mouse", 0.2652555144),
         ("NineBande", "Mouse", 0.22754789210000001),
     ]
     # the lookup table holds both orderings of every pair
     all_expected = {}
     for name1, name2, dist in reference:
         all_expected[(name1, name2)] = dist
         all_expected[(name2, name1)] = dist

     for pair, got in observed.items():
         assert_allclose(got, all_expected[pair])
Esempio n. 4
0
 def test_logdet_tk_adjustment(self):
     """Distances with the Tamura-Kumar adjustment differ from those without it."""
     calc = LogDetPair(
         moltype=DNA,
         alignment=load_aligned_seqs("data/brca1_5.paml", moltype=DNA),
     )
     calc.run(use_tk_adjustment=True, show_progress=False)
     adjusted = calc.get_pairwise_distances()
     # re-run the same calculator with the adjustment switched off
     calc.run(use_tk_adjustment=False, show_progress=False)
     unadjusted = calc.get_pairwise_distances()
     self.assertNotEqual(adjusted, unadjusted)
Esempio n. 5
0
 def test_paralinear_pair_dna(self):
     """Paralinear and LogDet calculators agree on the same pair of sequences."""
     seq1 = "TAATTCATTGGGACGTCGAATCCGGCAGTCCTGCCGCAAAAGCTTCCGGAATCGAATTTTGGCA"
     seq2 = "AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGG"
     aln = make_aligned_seqs(data=[("seq1", seq1), ("seq2", seq2)], moltype=DNA)

     paralinear = ParalinearPair(moltype=DNA, alignment=aln)
     paralinear.run(show_progress=False)
     logdet = LogDetPair(moltype=DNA, alignment=aln)
     logdet.run(show_progress=False)

     # the [1, 1] entries of both distances and variances must be equal
     self.assertEqual(logdet.dists[1, 1], paralinear.dists[1, 1])
     self.assertEqual(paralinear.variances[1, 1], logdet.variances[1, 1])
Esempio n. 6
0
    def test_logdet_missing_states(self):
        """A LogDet distance is still produced when states are missing.

        Checked both with and without the tk adjustment.
        """
        # note that seq2 contains an ``N`` character
        data = [
            ("seq1", "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
            ("seq2", "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTNTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"),
        ]
        aln = make_aligned_seqs(data=data, moltype=DNA)
        calc = LogDetPair(moltype=DNA, alignment=aln)

        # the same calculator is run twice: adjustment on first, then off
        for adjust in (True, False):
            calc.run(use_tk_adjustment=adjust, show_progress=False)
            pairwise = calc.get_pairwise_distances().to_dict()
            first_dist = list(pairwise.values())[0]
            self.assertIsNotNone(first_dist)
Esempio n. 7
0
    def test_logdet_for_determinant_lte_zero(self):
        """Distance is NaN if the determinant is <= 0, unless told to raise."""
        data = {
            "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
            "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
        }
        aln = make_aligned_seqs(data=data, moltype=DNA)

        # default behaviour: the first reported distance is NaN, whether or not
        # the tk adjustment is applied
        calc = LogDetPair(moltype=DNA, alignment=aln)
        for adjust in (True, False):
            calc.run(use_tk_adjustment=adjust, show_progress=False)
            pairwise = calc.get_pairwise_distances().to_dict()
            first_dist = list(pairwise.values())[0]
            self.assertTrue(numpy.isnan(first_dist))

        # with invalid_raises=True the run raises ArithmeticError instead
        strict_calc = LogDetPair(moltype=DNA, alignment=aln, invalid_raises=True)
        with self.assertRaises(ArithmeticError):
            strict_calc.run(use_tk_adjustment=True, show_progress=False)