Esempio n. 1
0
 def test_model_result_alignment_split_pos_model(self):
     """alignment for each codon position matches every third column of the input"""
     seqs = dict(
         Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     )
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     fit_opts = {"max_evaluations": 5, "limit_action": "ignore"}
     app = evo_app.model(
         "F81", split_codons=True, show_progress=False, opt_args=fit_opts
     )
     result = app(aln)
     # result.alignment is indexed by codon position 1..3, while slicing the
     # input alignment uses a 0-based start
     for start, codon_pos in enumerate((1, 2, 3)):
         got = result.alignment[codon_pos]
         expect = aln[start::3]
         self.assertEqual(got.to_dict(), expect.to_dict())
Esempio n. 2
0
 def test_model_result_tree_split_pos_model(self):
     """returns one tree per codon position from lf with split codon positions"""
     _data = {
         "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     }
     aln = make_aligned_seqs(data=_data, moltype="dna")
     mod = evo_app.model(
         "F81",
         split_codons=True,
         show_progress=False,
         opt_args=dict(max_evaluations=55, limit_action="ignore"),
     )
     result = mod(aln)
     # assertTrue(x, 3) would treat 3 as the failure message and pass for any
     # non-empty result; assertEqual actually checks the number of trees
     self.assertEqual(len(result.tree), 3)
     # check the trees are different by summing lengths
     lengths = {t.total_length() for _, t in result.tree.items()}
     self.assertGreater(len(lengths), 1)
    def test_set_motif_probs(self):
        """motif probs can be set directly or derived from sequence data"""

        def assert_mprobs_match(observed, expected):
            # compare with a tolerance, to handle the min val adjustment
            for motif in observed:
                self.assertFloatEqual(observed[motif], expected[motif], eps=3e-6)

        sub_model = cogent3.evolve.substitution_model.TimeReversibleNucleotide(
            model_gaps=True, motif_probs=None
        )
        lf = sub_model.make_likelihood_function(
            self.tree, motif_probs_from_align=False
        )

        supplied = {"A": 0.1, "C": 0.2, "G": 0.2, "T": 0.5, "-": 0.0}
        lf.set_motif_probs(supplied)
        # note the LF adjusts motif probs so they are all >= 1e-6
        assert_mprobs_match(lf.get_motif_probs().to_dict(), supplied)

        # derived from the first sequence only, held constant
        lf.set_motif_probs_from_data(self.al[:1], is_constant=True)
        g_prob = lf.get_motif_probs()["G"]
        self.assertFloatEqual(g_prob, 0.6, eps=3e-6)

        # a pseudocount shifts the estimate away from the raw frequency
        lf.set_motif_probs_from_data(self.al[:1], pseudocount=1)
        g_prob = lf.get_motif_probs()["G"]
        self.assertNotEqual(g_prob, 0.6)

        # test with consideration of ambiguous states
        ambig_aln = make_aligned_seqs(
            data=dict(seq1="ACGTAAGNA", seq2="ACGTANGTC", seq3="ACGTACGTG")
        )
        lf.set_motif_probs_from_data(
            ambig_aln, include_ambiguity=True, is_constant=True
        )
        estimated = dict(lf.get_motif_probs())
        expected = {
            "A": 8.5 / 27,
            "C": 5.5 / 27,
            "-": 0.0,
            "T": 5.5 / 27,
            "G": 7.5 / 27,
        }
        assert_mprobs_match(estimated, expected)
        self.assertFloatEqual(sum(estimated.values()), 1.0)
Esempio n. 4
0
 def test_aln_to_ref_known(self):
     """aligning the degapped seqs to the reference recovers the known alignment"""
     known = {
         "Ref": "CAG---GAGAACAGAAACCCAT--TACTCACT",
         "Qu1": "CAG---GAGAACAG---CCCGTGTTACTCACT",
         "Qu2": "CAGCATGAGAACAGAAACCCGT--TA---ACT",
         "Qu3": "CAGCATGAGAACAGAAACCCGT----CTCACT",
         "Qu4": "CAGCATGAGAACAGAAACCCGTGTTACTCACT",
         "Qu5": "CAG---GAGAACAG---CCCAT--TACTCACT",
         "Qu6": "CAG---GA-AACAG---CCCAT--TACTCACT",
         "Qu7": "CAG---GA--ACAGA--CCCGT--TA---ACT",
     }
     orig = make_aligned_seqs(known, moltype="dna")
     expect = orig.to_dict()
     app = align_app.align_to_ref(ref_seq="Ref")
     # strip the gaps, then let the app rebuild the alignment
     got = app(orig.degap())
     self.assertEqual(got.to_dict(), expect)
Esempio n. 5
0
def BestLogLikelihood(
    aln,
    alphabet=None,
    exclude_chars=None,
    allowed_chars="ACGT",
    motif_length=None,
    return_length=False,
):
    """returns the best log-likelihood according to Goldman 1993.

    Parameters
    ----------
    aln
        an alignment; it is rebuilt from ``aln.to_dict()`` before use
    alphabet
        a sequence alphabet object. If provided, its moltype is used to
        rebuild the alignment and its motif length overrides motif_length
    exclude_chars
        a series of characters used to exclude motifs
    allowed_chars
        only motifs that contain a subset of these are
        allowed
    motif_length
        1 for nucleotide, 2 for dinucleotide, etc ..
    return_length
        whether to also return the number of alignment columns

    Returns
    -------
    the log-likelihood, or the tuple (log-likelihood, number of columns)
    if return_length is True
    """
    assert (
        alphabet or motif_length
    ), "Must provide either an alphabet or a motif_length"

    # need to use the alphabet, so we can enforce character compliance
    moltype_kwargs = {}
    if alphabet:
        moltype_kwargs["moltype"] = alphabet.moltype
        motif_length = alphabet.get_motif_len()

    rebuilt = make_aligned_seqs(aln.to_dict(), **moltype_kwargs)
    columns = aligned_columns_to_rows(
        rebuilt, motif_length, exclude_chars, allowed_chars
    )
    num_cols = len(columns)
    lnL = get_G93_lnL_from_array(columns)
    return (lnL, num_cols) if return_length else lnL
Esempio n. 6
0
 def test_roundtrip_likelihood_function(self):
     """likelihood function survives a to_json / deserialise round trip"""
     seqs = dict(
         Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     )
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     tree = make_tree(tip_names=aln.names)
     lf = get_model("HKY85").make_likelihood_function(tree)
     lf.set_alignment(aln)
     # give each named edge its own initial kappa
     for kappa, edge in zip((2, 3, 4), aln.names):
         lf.set_param_rule("kappa", edge=edge, init=kappa)
     expect = lf.get_log_likelihood()
     got = deserialise_object(lf.to_json())
     self.assertFloatEqual(got.get_log_likelihood(), expect)
Esempio n. 7
0
 def test_roundtrip_discrete_time_likelihood_function(self):
     """discrete time likelihood function survives a to_json round trip"""
     seqs = dict(
         Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     )
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     tree = make_tree(tip_names=aln.names)
     lf = get_model("BH").make_likelihood_function(tree)
     lf.set_alignment(aln)
     # a short optimisation so the serialised state differs from the defaults
     optimiser_settings = {
         "max_evaluations": 25,
         "limit_action": "ignore",
         "show_progress": False,
     }
     lf.optimise(**optimiser_settings)
     expect = lf.get_log_likelihood()
     got = deserialise_object(lf.to_json())
     self.assertFloatEqual(got.get_log_likelihood(), expect)
Esempio n. 8
0
    def test_user_function_multiple(self):
        """two user_function apps give independent results"""
        from cogent3 import make_aligned_seqs
        from cogent3.core.alignment import Alignment

        aligned_app = user_function(self.foo, "aligned", "aligned")
        dists_app = user_function(self.bar, "aligned", "pairwise_distances")

        first_aln = make_aligned_seqs(
            data=[("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")]
        )
        second_aln = Alignment(
            data={"s1": "ACGTACGTA", "s2": "GTGTACGTA"}, moltype="dna"
        )

        # apply both apps before checking either result
        got_aligned = aligned_app(first_aln)
        got_dists = dists_app(second_aln)

        self.assertEqual(got_aligned.to_dict(), {"a": "GCAA", "b": "GCTT"})
        self.assertEqual(got_dists, {("s1", "s2"): 2.0, ("s2", "s1"): 2.0})
Esempio n. 9
0
    def test_roundtrip_rc_annotated_align(self):
        """json round trip works for a reverse complemented, annotated alignment"""
        # the key that exposed the bug was a gap in the middle of the sequence
        aln = make_aligned_seqs(
            data=[["x", "-AAAGGGGGAAC-CT"], ["y", "TTTT--TTTTAGGGA"]],
            array_align=False,
            moltype="dna",
        )
        # two exon annotations on sequence x
        for exon_name, spans in (("E1", [(3, 8)]), ("E2", [(10, 13)])):
            aln.get_seq("x").add_annotation(Feature, "exon", exon_name, spans)

        def slices_by_name(obj):
            return {
                annot.name: annot.get_slice()
                for annot in obj.get_annotations_from_any_seq()
            }

        rc_aln = aln.rc()
        got = deserialise_object(rc_aln.to_json())
        self.assertEqual(got.to_dict(), rc_aln.to_dict())
        expect_annots = slices_by_name(rc_aln)
        got_annots = slices_by_name(got)
        self.assertEqual(got_annots, expect_annots)
Esempio n. 10
0
    def test_tabulate(self):
        """tabulate_stats produces a Table for each of the three stat groups"""
        from cogent3.util.table import Table

        seqs = dict(
            Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
            Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
            Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
        )
        aln = make_aligned_seqs(data=seqs, moltype="dna")
        fit_opts = {"max_evaluations": 25, "limit_action": "ignore"}
        model_app = evo_app.model("GN", opt_args=fit_opts)
        fitted = model_app(aln)
        tabulated = evo_app.tabulate_stats()(fitted)
        self.assertEqual(len(tabulated), 3)
        expected_titles = ("motif params", "global params", "edge params")
        for title in expected_titles:
            self.assertIn(title, tabulated)
            self.assertIsInstance(tabulated[title], Table)
Esempio n. 11
0
    def test_logdet_for_determinant_lte_zero(self):
        """distance is nan if the determinant is <= 0, unless told to raise"""
        seqs = {
            "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
            "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
        }
        aln = make_aligned_seqs(data=seqs, moltype=DNA)

        calc = LogDetPair(moltype=DNA, alignment=aln)
        # same outcome with and without the tk adjustment
        for tk_adjust in (True, False):
            calc.run(use_tk_adjustment=tk_adjust, show_progress=False)
            dists = calc.get_pairwise_distances().to_dict()
            first_dist = list(dists.values())[0]
            self.assertTrue(numpy.isnan(first_dist))

        # but raises ArithmeticError if told to
        strict_calc = LogDetPair(moltype=DNA, alignment=aln, invalid_raises=True)
        with self.assertRaises(ArithmeticError):
            strict_calc.run(use_tk_adjustment=True, show_progress=False)
Esempio n. 12
0
 def test_paralinear_pair_dna(self):
     """paralinear distance and variance agree with the logdet values"""
     seq1 = "TAATTCATTGGGACGTCGAATCCGGCAGTCCTGCCGCAAAAGCTTCCGGAATCGAATTTTGGCA"
     seq2 = "AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGG"
     aln = make_aligned_seqs(data=[("seq1", seq1), ("seq2", seq2)], moltype=DNA)

     paralinear = ParalinearPair(moltype=DNA, alignment=aln)
     paralinear.run(show_progress=False)
     logdet = LogDetPair(moltype=DNA, alignment=aln)
     logdet.run(show_progress=False)

     # the same entry is compared between the two calculators
     index = (1, 1)
     self.assertEqual(logdet.dists[index], paralinear.dists[index])
     self.assertEqual(paralinear.variances[index], logdet.variances[index])
Esempio n. 13
0
 def test_split_codon_model_result_json(self):
     """split_codon model result survives a json round trip"""
     seqs = dict(
         Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     )
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     fit_opts = {"max_evaluations": 5, "limit_action": "ignore"}
     app = evo_app.model(
         "F81",
         tree=make_tree(tip_names=aln.names),
         split_codons=True,
         opt_args=fit_opts,
     )
     result = app(aln)
     # the likelihood function stored under key 1 is the reference
     first_lf = result.lf[1]
     restored = deserialise_object(result.to_json())
     assert_allclose(restored.lf[1].lnL, first_lf.lnL)
Esempio n. 14
0
 def test_model_hypothesis_result_repr(self):
     """__repr__ and _repr_html_ of hypothesis and null results return strings"""
     seqs = dict(
         Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     )
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     settings = {"max_evaluations": 25, "limit_action": "ignore"}
     # each model gets its own copy of the optimiser settings
     null_model = evo_app.model("F81", opt_args=dict(settings))
     alt_model = evo_app.model("HKY85", opt_args=dict(settings))
     result = evo_app.hypothesis(null_model, alt_model)(aln)

     text = result.__repr__()
     self.assertIsInstance(text, str)
     html = result._repr_html_()
     self.assertIsInstance(html, str)

     text = result.null.__repr__()
     self.assertIsInstance(text, str)
     html = result.null._repr_html_()
     self.assertIsInstance(html, str)
Esempio n. 15
0
    def test_hyp_init(self):
        """hypothesis works with the default or a user specified init_alt"""
        opt_args = {"max_evaluations": 25, "limit_action": "ignore"}
        null_model = evo_app.model("F81", opt_args=opt_args)
        alt_model = evo_app.model("HKY85", opt_args=opt_args)
        seqs = dict(
            Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
            Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
            Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
        )
        aln = make_aligned_seqs(data=seqs, moltype="dna")

        # first: defaults to using null for init
        # second: user specified function
        for extra in ({}, {"init_alt": lambda x, y: x}):
            hyp = evo_app.hypothesis(null_model, alt_model, **extra)
            result = hyp(aln)
            self.assertEqual(result.df, 1)
Esempio n. 16
0
    def setUp(self):
        """Set up abbreviation strings, alphabets and two alignments for the tests"""
        canonical = "ACDEFGHIKLMNPQRSTVWY"
        ambiguous = "BXZ"
        self.canonical_abbrevs = canonical
        self.ambiguous_abbrevs = ambiguous

        # every canonical and ambiguous abbreviation is paired with "A"
        self.all_to_a = [("A", canonical + ambiguous)]
        self.charge_2 = alphabets["charge_2"]
        self.hydropathy_3 = alphabets["hydropathy_3"]
        self.orig = alphabets["orig"]

        # the same sequences, built via two different constructors
        raw = {"1": "CDDFBXZ", "2": "CDD-BXZ", "3": "AAAASS-"}
        self.aln = ArrayAlignment(data=dict(raw))
        self.aln2 = make_aligned_seqs(data=dict(raw))
Esempio n. 17
0
    def test_trim_stop_codons(self):
        """trims stop codons using the specified genetic code"""
        ends_taa = {"seq1": "AAATTTCCC", "seq2": "AAATTTTAA"}
        ends_aga = {"seq1": "AAATTTCCC", "seq2": "AAATTTAGA"}
        trimmed = {"seq1": "AAATTTCCC", "seq2": "AAATTT"}

        # defaults to standard code
        app = sample.trim_stop_codons()
        seqs = make_unaligned_seqs(data=dict(ends_taa), moltype="dna")
        self.assertEqual(app(seqs).to_dict(), trimmed)

        # standard code, given explicitly
        app = sample.trim_stop_codons(gc=1)
        seqs = make_unaligned_seqs(data=dict(ends_taa), moltype="dna")
        self.assertEqual(app(seqs).to_dict(), trimmed)

        # standard code on an alignment: expected result has gaps in place
        # of the trimmed codon
        app = sample.trim_stop_codons(gc=1)
        aln = make_aligned_seqs(data=dict(ends_taa), moltype="dna")
        self.assertEqual(
            app(aln).to_dict(), {"seq1": "AAATTTCCC", "seq2": "AAATTT---"}
        )

        # different genetic code
        app = sample.trim_stop_codons(gc=2)  # mt code
        seqs = make_unaligned_seqs(data=dict(ends_aga), moltype="dna")
        self.assertEqual(app(seqs).to_dict(), trimmed)
Esempio n. 18
0
    def test_feature_residue(self):
        """seq features on alignment operate in sequence coordinates"""
        # In this case, only those residues included within the feature are
        # covered - note the omission of the T in y opposite the gap in x.

        aln = make_aligned_seqs(
            data=[["x", "C-CCCAAAAA"], ["y", "-T----TTTT"]],
            moltype=DNA,
            array_align=False,
        )
        self.assertEqual(str(aln), ">x\nC-CCCAAAAA\n>y\n-T----TTTT\n")
        exon = aln.get_seq("x").add_feature("exon", "ex1", [(0, 4)])
        self.assertEqual(str(exon), 'exon "ex1" at [0:4]/9')
        self.assertEqual(str(exon.get_slice()), "CCCC")
        aln_exons = list(aln.get_annotations_from_seq("x", "exon"))
        self.assertEqual(str(aln_exons), '[exon "ex1" at [0:1, 2:5]/10]')
        self.assertEqual(str(aln_exons[0].get_slice()), ">x\nCCCC\n>y\n----\n")

        # Feature.as_one_span(), is applied to the exon that
        # straddles the gap in x. The result is we preserve that feature.

        self.assertEqual(
            str(aln_exons[0].as_one_span().get_slice()), ">x\nC-CCC\n>y\n-T---\n"
        )

        # These properties also are consistently replicated with reverse
        # complemented sequences.

        aln_rc = aln.rc()
        rc_exons = list(aln_rc.get_annotations_from_any_seq("exon"))
        # not using as_one_span, so gap removed from x
        self.assertEqual(str(aln_rc[rc_exons]), ">x\nCCCC\n>y\n----\n")
        self.assertEqual(
            str(aln_rc[rc_exons[0].as_one_span()]), ">x\nC-CCC\n>y\n-T---\n"
        )

        # Features can provide their coordinates, useful for custom analyses.

        all_exons = aln.get_region_covering_all(aln_exons)
        coords = all_exons.get_coordinates()
        # use the TestCase assertion rather than a bare assert, so the check
        # is not stripped under ``python -O`` and failures show both values
        self.assertEqual(coords, [(0, 1), (2, 5)])
Esempio n. 19
0
    def test_concat(self):
        """concat joins alignments, with or without intersecting names"""
        raw = [
            {"seq1": "AAA", "seq2": "AAA", "seq3": "AAA"},
            {"seq1": "TTT", "seq2": "TTT", "seq3": "TTT", "seq4": "TTT"},
            {"seq1": "CC", "seq2": "CC", "seq3": "CC"},
        ]
        alns = [make_aligned_seqs(data=d, moltype=DNA) for d in raw]
        # names present in every input alignment
        shared = {name: "AAATTTCC" for name in ("seq1", "seq2", "seq3")}

        got = sample.concat(intersect=True)(alns)
        self.assertEqual(got.to_dict(), shared)

        # seq4 is in only one input; "?" is expected where it is absent
        got = sample.concat(intersect=False)(alns)
        self.assertEqual(got.to_dict(), {**shared, "seq4": "???TTT??"})
Esempio n. 20
0
 def test_usage(self):
     """Alignment.counts_per_pos tallies motifs of length 3 at each position"""
     seqs = dict(
         DogFaced="TCATTA",
         FalseVamp="TCATTA",
         FlyingFox="TCTTTA",
         FreeTaile="TCATTA",
         Horse="TCATTG",
         LeafNose="TCTTTA",
         LittleBro="TCATTA",
         Rhino="TCATTG",
         RoundEare="TCATTA",
         TombBat="TCAGTA",
     )
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     counts = aln.counts_per_pos(motif_length=3)
     # (position, motif, expected count)
     expected = [
         (0, "TCA", 8),
         (0, "TCT", 2),
         (1, "TTA", 7),
         (1, "GTA", 1),
         (1, "TTG", 2),
     ]
     for position, motif, count in expected:
         self.assertEqual(counts[position, motif], count)
Esempio n. 21
0
    def test_model_result_total_length_split_codon(self):
        """total_length equals the sum over per-position trees when split_codons True"""
        seqs = dict(
            Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
            Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
            Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
        )
        aln = make_aligned_seqs(data=seqs, moltype="dna")
        fit_opts = {"max_evaluations": 25, "limit_action": "ignore"}
        app = evo_app.model("GN", split_codons=True, opt_args=fit_opts)
        result = app(aln)

        # add up the ENS tree length from each likelihood function
        per_lf_lengths = (
            lf.get_annotated_tree(length_as="ENS").total_length()
            for lf in result.lf.values()
        )
        expect = sum(per_lf_lengths, 0.0)

        got = result.total_length(length_as="ENS")
        assert_allclose(got, expect)
Esempio n. 22
0
    def setUp(self):
        """populates _model_results with F81 and HKY85 results, once"""
        if self._model_results:
            # already populated, nothing to do
            return

        seqs = dict(
            Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
            Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
            Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
        )
        aln = make_aligned_seqs(data=seqs, moltype="dna")
        apps = [
            evo_app.model(
                name, opt_args=dict(max_evaluations=25, limit_action="ignore")
            )
            for name in ("F81", "HKY85")
        ]
        # fit every model before storing anything
        fitted = [app(aln) for app in apps]
        for model_result in fitted:
            self._model_results[model_result.name] = model_result
Esempio n. 23
0
    def test_model_collection_result(self):
        """model collection survives a json round trip"""
        from cogent3.app import evo as evo_app
        from cogent3.evolve.parameter_controller import (
            AlignmentLikelihoodFunction,
        )

        seqs = dict(
            Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
            Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
            Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
        )
        aln = make_aligned_seqs(data=seqs, moltype="dna")
        opt_args = {"max_evaluations": 10, "limit_action": "ignore"}
        models = tuple(
            evo_app.model(name, split_codons=True, opt_args=opt_args)
            for name in ("F81", "GTR")
        )

        def assert_member_type(collection, expected_type):
            # every model has an entry for each of the keys 1..3
            for model in models:
                for position in (1, 2, 3):
                    self.assertIsInstance(
                        collection[model.name][position], expected_type
                    )

        mc_result = model_collection_result(name="collection", source="blah")
        for model in models:
            mc_result[model.name] = model(aln)

        assert_member_type(mc_result, AlignmentLikelihoodFunction)

        # straight after deserialising, the members are still dicts
        got_obj = deserialise_object(mc_result.to_json())
        assert_member_type(got_obj, dict)

        # but after invoking deserialised_values
        got_obj.deserialised_values()
        assert_member_type(got_obj, AlignmentLikelihoodFunction)
Esempio n. 24
0
def CigarParser(
    seqs, cigars, sliced=False, ref_seqname=None, start=None, end=None, moltype=DNA
):
    """return an alignment from raw sequences and cigar strings
    if sliced, will return an alignment correspondent to ref sequence start to end

    Parameters
    ----------
    seqs
        raw sequences as {seqname: seq}
    cigars
        corresponding cigar text as {seqname: cigar_text}.
        cigars and seqs should have the same seqnames
    sliced
        if True, only the region of the reference sequence from start to
        end is returned
    ref_seqname
        name of the reference sequence, used only when sliced is True
    start, end
        passed to slice_cigar (with by_align=False) for the reference
        cigar, used only when sliced is True
    moltype
        optional, defaults to DNA

    Returns
    -------
    the result of make_aligned_seqs on the gapped sequences
    """
    data = {}
    if not sliced:
        for seqname, seq in seqs.items():
            data[seqname] = aligned_from_cigar(cigars[seqname], seq, moltype=moltype)
        return make_aligned_seqs(data)

    ref_aln_seq = aligned_from_cigar(
        cigars[ref_seqname], seqs[ref_seqname], moltype=moltype
    )
    # only the alignment coordinates of the reference slice are needed here
    _, aln_loc = slice_cigar(cigars[ref_seqname], start, end, by_align=False)
    aln_start, aln_end = aln_loc[0], aln_loc[1]
    data[ref_seqname] = ref_aln_seq[aln_start:aln_end]

    for seqname, seq in seqs.items():
        if seqname == ref_seqname:
            continue
        m, seq_loc = slice_cigar(cigars[seqname], aln_start, aln_end)
        if seq_loc:
            if isinstance(seq, str):
                seq = moltype.make_seq(seq)
            data[seqname] = seq[seq_loc[0] : seq_loc[1]].gapped_by_map(m)
        else:
            # nothing from this sequence falls in the region, so fill with
            # gaps; use the requested moltype rather than hard-coded DNA so
            # all sequences in the alignment share one moltype
            data[seqname] = moltype.make_seq("-" * (aln_end - aln_start))
    return make_aligned_seqs(data)
Esempio n. 25
0
 def test_degap(self):
     """degap strips gaps from both alignments and sequence collections"""
     gapped = {
         "seq1": "--ACGT--GT---",
         "seq2": "--ACGTA-GT---",
         "seq3": "--ACGTA-GT---",
     }
     expect = {"seq1": "ACGTGT", "seq2": "ACGTAGT", "seq3": "ACGTAGT"}

     # alignment, no moltype specified
     aln = make_aligned_seqs(data=dict(gapped))
     self.assertEqual(aln.degap().to_dict(), expect)

     # unaligned collection; the moltype is expected to be preserved
     collection = make_unaligned_seqs(data=dict(gapped), moltype=DNA)
     degapped = collection.degap()
     self.assertEqual(degapped.to_dict(), expect)
     self.assertEqual(degapped.moltype, DNA)
Esempio n. 26
0
    def test_logdet_missing_states(self):
        """logdet gives a non-None distance when states are missing"""
        seq1 = "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"
        seq2 = "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTNTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"
        aln = make_aligned_seqs(data=[("seq1", seq1), ("seq2", seq2)], moltype=DNA)
        calc = LogDetPair(moltype=DNA, alignment=aln)

        # same expectation with and without the tk adjustment
        for tk_adjust in (True, False):
            calc.run(use_tk_adjustment=tk_adjust, show_progress=False)
            dists = calc.get_pairwise_distances().to_dict()
            first_dist = list(dists.values())[0]
            self.assertIsNotNone(first_dist)
Esempio n. 27
0
 def test_model_result_total_length(self):
     """result.tree total length matches the ENS annotated tree"""
     seqs = dict(
         Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     )
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     fit_opts = {"max_evaluations": 25, "limit_action": "ignore"}
     result = evo_app.model("GN", opt_args=fit_opts)(aln)

     ens_tree = result.lf.get_annotated_tree(length_as="ENS")
     assert_allclose(result.tree.total_length(), ens_tree.total_length())

     # it will be different to the standard length values
     default_tree = result.lf.get_annotated_tree()
     got_length = result.tree.total_length()
     default_length = default_tree.total_length()
     assert_raises(AssertionError, assert_allclose, got_length, default_length)
Esempio n. 28
0
    def test_composable_apps(self):
        """fast_slow_dist and quick_tree can be composed into one app"""
        base_dir = os.path.abspath(__file__).split("test_app")[0]
        aln_path = os.path.join(base_dir, "data/brca1_5.paml")
        aln1 = load_aligned_seqs(aln_path, moltype=DNA)

        dist_app = dist.fast_slow_dist(fast_calc="hamming", moltype="dna")
        tree_builder = tree_app.quick_tree(drop_invalid=False)
        composed = dist_app + tree_builder

        expect_repr = (
            "fast_slow_dist(type='distance', distance=None, moltype='dna', fast_calc='hamming', slow_calc=None) + quick_tree(type='tree', drop_invalid=False)"
        )
        self.assertEqual(str(composed), expect_repr)

        # the composed app is the last app, with the first as its input
        self.assertIsInstance(composed, tree_app.quick_tree)
        self.assertEqual(composed._type, "tree")
        self.assertIsInstance(composed.input, dist.fast_slow_dist)
        self.assertIs(composed.output, None)
        self.assertIsInstance(composed._input_types, frozenset)
        self.assertIsInstance(composed._output_types, frozenset)
        self.assertIsInstance(composed._in, dist.fast_slow_dist)
        self.assertIs(composed._out, None)

        tree1 = composed(aln1)
        self.assertIsInstance(tree1, PhyloNode)
        self.assertIsNotNone(tree1.children)
        self.assertEqual(set(tree1.get_tip_names()), set(aln1.names))

        # tests when distances contain None
        seqs = {
            "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
            "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
        }
        aln2 = make_aligned_seqs(data=seqs, moltype=DNA)
        tree2 = composed(aln2)
        self.assertIsInstance(tree2, NotCompleted)
Esempio n. 29
0
    def load(self, data):
        """returns sequences

        Parameters
        ----------
        data
            file path or cogent3 sequence collection / alignment. Any other
            value is handed to make_aligned_seqs / make_unaligned_seqs,
            depending on self.aligned.

        Returns
        -------
        the sequences, degapped if this app's output types do not include
        "aligned"
        """
        if isinstance(data, str):
            # keep the path separate from the parsed records, so that
            # info.path records the file location and not the parsed dict
            path = data
            with open_(path) as infile:
                records = dict(self._parser(infile))
            seqs = self.klass(data=records, moltype=self.moltype)
            seqs.info.path = path
        elif isinstance(data, SequenceCollection):
            # already a collection: use it directly (previously this case
            # left ``seqs`` unassigned and raised UnboundLocalError)
            seqs = data
        elif self.aligned:
            seqs = make_aligned_seqs(data, moltype=self.moltype)
        else:
            seqs = make_unaligned_seqs(data, moltype=self.moltype)

        if not (self._output_types & {"aligned"}):
            seqs = seqs.degap()

        return seqs
Esempio n. 30
0
    def test_hyp_init_sequential(self):
        """sequential hypothesis initialises each model from the preceding one"""
        opt_args = {"max_evaluations": 15, "limit_action": "ignore"}
        f81 = evo_app.model("F81", opt_args=opt_args)
        hky85 = evo_app.model("HKY85", opt_args=opt_args)
        gtr = evo_app.model("GTR", opt_args=opt_args)

        def lnL_strictly_increases(result):
            # F81 < HKY85 < GTR
            return (
                result["F81"].lf.lnL
                < result["HKY85"].lf.lnL
                < result["GTR"].lf.lnL
            )

        # defaults to initialise model3 from model 2 from model1
        hyp = evo_app.hypothesis(f81, hky85, gtr, sequential=True)
        seqs = dict(
            Human="ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
            Mouse="ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
            Opossum="ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
        )
        aln = make_aligned_seqs(data=seqs, moltype="dna")
        self.assertTrue(lnL_strictly_increases(hyp(aln)))

        # can be set to False, in which case all models start at defaults
        hyp = evo_app.hypothesis(f81, hky85, gtr, sequential=False)
        self.assertFalse(lnL_strictly_increases(hyp(aln)))