Ejemplo n.º 1
0
 def test_deserialise_likelihood_function(self):
     """likelihood functions are rebuilt correctly from their rich dict"""

     def check_roundtrip(original):
         # the rebuilt function must describe the same motif probs and the
         # same alignment assignments as the function it was built from
         rebuilt = deserialise_likelihood_function(original.to_rich_dict())
         self.assertEqual(
             str(original.defn_for["mprobs"]), str(rebuilt.defn_for["mprobs"])
         )
         self.assertEqual(
             str(original.defn_for["alignment"].assignments),
             str(rebuilt.defn_for["alignment"].assignments),
         )

     path = os.path.join(os.getcwd(), "data", "brca1_5.paml")
     data = load_aligned_seqs(filename=path)
     midpoint = len(data) // 2
     first_half, second_half = data[:midpoint], data[midpoint:]
     tree = make_tree(tip_names=data.names)

     # multiple alignments, one per named locus
     multi_locus = get_model("HKY85").make_likelihood_function(
         tree, loci=["1st-half", "2nd-half"]
     )
     multi_locus.set_alignment([first_half, second_half])
     check_roundtrip(multi_locus)

     # single alignment
     single_locus = get_model("HKY85").make_likelihood_function(tree)
     single_locus.set_alignment(first_half)
     check_roundtrip(single_locus)
Ejemplo n.º 2
0
    def test_get_motif_probs_by_node_mg94(self):
        """handles different statespace dimensions from process and stationary distribution"""
        from cogent3.evolve.models import get_model

        aln = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
        aln = aln.no_degenerates(motif_length=3)

        tree = load_tree("data/primates_brca1.tree")

        # root mprobs are constant
        sm = get_model("MG94HKY")
        lf = sm.make_likelihood_function(tree)
        lf.set_alignment(aln)
        # the value is not inspected, the call only has to succeed
        _ = lf.get_motif_probs()

        mprobs = lf.get_motif_probs_by_node()
        self.assertEqual(mprobs.shape, (len(tree.get_edge_vector()), 61))

        # root mprobs are variable
        # NOTE: this model was previously replaced straight away by a plain
        # get_model("MG94HKY"), so the variable case was never exercised
        sm = get_model("MG94HKY", optimise_motif_probs=True)
        lf = sm.make_likelihood_function(tree)
        lf.set_alignment(aln)
        mprobs = lf.get_motif_probs_by_node()
        self.assertEqual(mprobs.shape, (len(tree.get_edge_vector()), 61))

        # not implemented for monomers variant
        sm = TimeReversibleCodon(mprob_model="monomers",
                                 model_gaps=False,
                                 recode_gaps=True)
        lf = sm.make_likelihood_function(tree)
        lf.set_alignment(aln)
        with self.assertRaises(NotImplementedError):
            _ = lf.get_motif_probs_by_node()
Ejemplo n.º 3
0
    def test_get_model(self):
        """get_model passes instances through and rejects unknown names"""
        # a substitution model instance must come back as the same object
        instances = (CNFGTR(), WG01(), GN())
        for instance in instances:
            self.assertIs(get_model(instance), instance)

        # a name that is not a known model raises
        with self.assertRaises(ValueError):
            _ = get_model("blah")
Ejemplo n.º 4
0
    def test_roundtrip_discrete_time_submod(self):
        """discrete time substitution models survive a to_json round trip"""
        # default construction, then a dinucleotide variant
        for kwargs in ({}, {"motif_length": 2}):
            original = get_model("DT", **kwargs)
            rebuilt = deserialise_object(original.to_json())
            self.assertEqual(rebuilt.to_rich_dict(), original.to_rich_dict())
Ejemplo n.º 5
0
 def test_roundtrip_submod(self):
     """substitution models survive a to_json round trip"""
     for model_name in ("HKY85", "GN", "CNFGTR"):
         original = get_model(model_name)
         rebuilt = deserialise_object(original.to_json())
         self.assertEqual(rebuilt.to_rich_dict(), original.to_rich_dict())
Ejemplo n.º 6
0
def is_codon_model(sm):
    """True if sm, or get_model(sm), is a Codon substitution model

    Parameters
    ----------
    sm : str or instance
        a model name, which is resolved with get_model(), or an object to be
        checked directly
    """
    from cogent3.evolve.substitution_model import _Codon

    # isinstance also accepts str subclasses, which type(sm) == str rejected
    if isinstance(sm, str):
        sm = get_model(sm)
    return isinstance(sm, _Codon)
Ejemplo n.º 7
0
    def test_roundtrip_model_result(self):
        """model_result.to_json enables roundtrip and lazy evaluation"""
        seqs = {
            "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
            "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
            "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
        }
        aln = make_aligned_seqs(data=seqs, moltype="dna")
        tree = make_tree(tip_names=aln.names)
        lf = get_model("HKY85").make_likelihood_function(tree)
        lf.set_alignment(aln)
        # give every tip edge its own starting kappa
        for edge, kappa in zip(aln.names, (2, 3, 4)):
            lf.set_param_rule("kappa", edge=edge, init=kappa)

        result = model_result(name="test")
        result[1] = lf
        self.assertIs(result[1], lf)
        self.assertEqual(result.nfp, lf.nfp)
        self.assertEqual(result.lnL, lf.lnL)

        restored = deserialise_object(result.to_json())
        # lazy evaluation: the stored value starts out as a plain dict
        self.assertIsInstance(restored[1], dict)
        # while the summary properties already match the original
        for attr in ("lnL", "nfp", "DLC"):
            self.assertEqual(getattr(restored, attr), getattr(result, attr))
        # asking for the lf attribute gives something that is not a dict
        self.assertNotIsInstance(restored.lf, dict)
        self.assertEqual(restored.lf.nfp, restored.nfp)
Ejemplo n.º 8
0
 def test_model_names(self):
     """each model's name attribute begins with the name used to fetch it"""
     for key in models:
         reported = get_model(key).name
         self.assertTrue(
             reported.startswith(key),
             msg=f"{reported} does not start with {key}",
         )
Ejemplo n.º 9
0
    def _make_model_cache(self):
        """builds every model in `models` once and keeps them on the instance"""
        # nothing to do when an earlier call already populated the cache
        if not hasattr(self, "_cached_models"):
            self._cached_models = {name: get_model(name) for name in models}
Ejemplo n.º 10
0
    def __init__(self,
                 distance=None,
                 moltype=None,
                 fast_calc=None,
                 slow_calc=None):
        """
        Parameters
        ----------
        distance
            name used for both fast_calc and slow_calc, cannot be combined
            with either of them
        moltype
            converted with get_moltype() unless it is None
        fast_calc
            name resolved with get_distance_calculator()
        slow_calc
            name resolved with get_model()

        Raises
        ------
        ValueError
            if distance is combined with fast_calc / slow_calc, if hamming,
            paralinear or logdet is requested without a moltype, if neither
            calculator can be resolved, or if a resolved calculator has a
            moltype that differs from the one provided.
        """
        super(fast_slow_dist, self).__init__(
            input_types=ALIGNED_TYPE,
            output_types=(PAIRWISE_DISTANCE_TYPE, SERIALISABLE_TYPE),
            data_types=("ArrayAlignment", "Alignment"),
        )
        self._formatted_params()
        self._moltype = moltype if moltype is None else get_moltype(moltype)
        self._sm = None

        if (fast_calc or slow_calc) and distance:
            raise ValueError("cannot combine distance and fast/slow")

        if distance:
            fast_calc = distance
            slow_calc = distance

        # keep what was asked for, the names below are rebound to the
        # resolved calculators (or None when resolution fails)
        requested_fast, requested_slow = fast_calc, slow_calc

        d = {"hamming", "paralinear", "logdet"} & {slow_calc, fast_calc}
        if d and not self._moltype:
            raise ValueError(f"you must provide a moltype for {d}")

        try:
            fast_calc = get_distance_calculator(fast_calc,
                                                moltype=self._moltype)
        except (ValueError, AttributeError):
            fast_calc = None

        try:
            slow_calc = get_model(slow_calc)
        except ValueError:
            slow_calc = None

        if not (fast_calc or slow_calc):
            # report the requested values, both resolved names are None here
            raise ValueError(
                f"invalid values for {requested_slow} or {requested_fast}")

        self.fast_calc = fast_calc
        if fast_calc and self._moltype and fast_calc.moltype != self._moltype:
            raise ValueError(
                f"{self._moltype} incompatible moltype with fast calculator {fast_calc.moltype}"
            )
        elif fast_calc:
            self._moltype = fast_calc.moltype

        if slow_calc and self._moltype and slow_calc.moltype != self._moltype:
            raise ValueError("incompatible moltype with slow calculator")
        elif slow_calc:
            self._moltype = slow_calc.moltype
        self._sm = slow_calc
Ejemplo n.º 11
0
    def test_roundtrip_het_lf(self):
        """a site-het likelihood function keeps its lnL across a json round trip"""
        with open("data/site-het-param-rules.json") as infile:
            all_rules = json.load(infile)

        aln = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
        tree = load_tree("data/primates_brca1.tree")
        hmm_entry = all_rules.pop("phylohmm-gamma-kappa")
        submod = get_model("HKY85", ordered_param="rate", distribution="gamma")
        original = submod.make_likelihood_function(
            tree, bins=4, sites_independent=False
        )
        original.set_alignment(aln)
        original.apply_param_rules(hmm_entry["rules"])
        restored = deserialise_object(original.to_json())
        assert_allclose(original.lnL, restored.lnL)
Ejemplo n.º 12
0
    def _test_aln(self, seqs, model=dna_model, param_vals=None, **kw):
        """aligns the degapped seqs and checks the result reproduces seqs"""
        degapped = {}
        for name, seq in seqs.items():
            degapped[name] = seq.replace("-", "")

        aln = self._make_aln(degapped, model=model, param_vals=param_vals, **kw)
        observed = {name: seq.lower() for name, seq in aln.to_dict().items()}
        # the alignment must reproduce the expected gapped sequences
        self.assertEqual(seqs, observed)
        # and carry the moltype of the model that was used
        self.assertIs(aln.moltype, get_model(model).moltype)

        # any provided parameter values must be recorded on the
        # alignment's info.align_params
        if not param_vals:
            return

        for param, val in param_vals:
            self.assertEqual(aln.info.align_params[param], val)
Ejemplo n.º 13
0
    def test_zhang(self):
        """natsel_zhang correctly configured and should not fail"""
        opt = dict(max_evaluations=20, limit_action="ignore")
        tree_path = "data/primate_brca1.tree"
        tips = dict(tip1="Human", tip2="Chimpanzee")
        aln = load_aligned_seqs("data/primate_brca1.fasta", moltype="dna")

        # model given by name
        natsel = evo_app.natsel_zhang("CNFGTR", tree=tree_path, opt_args=opt, **tips)
        result = natsel(aln)
        self.assertEqual(result.df, 3)
        self.assertEqual(result.alt.nfp, 21)
        # keys are the model name followed by null/alt
        for key in ("CNFGTR-null", "CNFGTR-alt"):
            self.assertIn(key, result)

        # model given as an instance, keys still follow the model name
        y98_model = get_model("Y98")
        natsel = evo_app.natsel_zhang(y98_model, tree=tree_path, opt_args=opt, **tips)
        result = natsel(aln)
        self.assertEqual(result.df, 3)
        for key in ("Y98-null", "Y98-alt"):
            self.assertIn(key, result)

        # a non-codon model is rejected
        with self.assertRaises(ValueError):
            _ = evo_app.natsel_zhang("F81", tree=tree_path, opt_args=opt, **tips)

        # so is a call without any tip names
        with self.assertRaises(ValueError):
            _ = evo_app.natsel_zhang("Y98", tree=tree_path, opt_args=opt)
Ejemplo n.º 14
0
 def test_roundtrip_discrete_time_likelihood_function(self):
     """a discrete time likelihood function keeps its lnL across to_json"""
     seqs = {
         "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     }
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     tree = make_tree(tip_names=aln.names)
     lf = get_model("BH").make_likelihood_function(tree)
     lf.set_alignment(aln)
     lf.optimise(max_evaluations=25, limit_action="ignore", show_progress=False)
     expected = lf.get_log_likelihood()
     restored = deserialise_object(lf.to_json())
     assert_allclose(restored.get_log_likelihood(), expected)
Ejemplo n.º 15
0
 def test_roundtrip_likelihood_function(self):
     """a likelihood function keeps its lnL across a to_json round trip"""
     seqs = {
         "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
         "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
         "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
     }
     aln = make_aligned_seqs(data=seqs, moltype="dna")
     tree = make_tree(tip_names=aln.names)
     lf = get_model("HKY85").make_likelihood_function(tree)
     lf.set_alignment(aln)
     # give every tip edge its own starting kappa
     for edge, kappa in zip(aln.names, (2, 3, 4)):
         lf.set_param_rule("kappa", edge=edge, init=kappa)
     expected = lf.get_log_likelihood()
     restored = deserialise_object(lf.to_json())
     self.assertFloatEqual(restored.get_log_likelihood(), expected)
Ejemplo n.º 16
0
def deserialise_substitution_model(data):
    """returns a cogent3 substitution model instance

    Parameters
    ----------
    data : dict
        serialised model. A "version" entry is ignored. If a "kw" entry
        holds a "name", the model is requested from get_model() using the
        remaining kw entries as arguments. Otherwise, or if get_model()
        raises ValueError, "alphabet" and "type" are required and all other
        entries are passed to the constructor of that type.

    Notes
    -----
    data is not modified, so the same dict can be deserialised again.
    """
    from cogent3.evolve.models import get_model

    # work on copies, popping from the caller's dicts would leave them
    # unusable for a second deserialisation
    data = dict(data)
    data.pop("version", None)
    kw = dict(data.pop("kw", None) or {})

    sm = None
    if "name" in kw:
        name = kw.pop("name")
        try:
            sm = get_model(name, **kw)
        except ValueError:  # user defined sm?
            # fall through to building the model from its class below
            sm = None

    if sm is None:
        alphabet = deserialise_alphabet(data.pop("alphabet"))
        klass = _get_class(data.pop("type"))
        sm = klass(alphabet, **data)

    return sm
Ejemplo n.º 17
0
    def test_roundtrip_from_file(self):
        """a likelihood function written to a json file is restored from its path"""
        seqs = {
            "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
            "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
            "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
        }
        aln = make_aligned_seqs(data=seqs, moltype="dna")
        tree = make_tree(tip_names=aln.names)
        lf = get_model("HKY85").make_likelihood_function(tree)
        lf.set_alignment(aln)
        # give every tip edge its own starting kappa
        for edge, kappa in zip(aln.names, (2, 3, 4)):
            lf.set_param_rule("kappa", edge=edge, init=kappa)
        expected = lf.get_log_likelihood()
        serialised = lf.to_json()
        with TemporaryDirectory(dir=".") as dirname:
            json_path = f"{dirname}/delme.json"
            with open(json_path, "w") as outfile:
                outfile.write(serialised)

            # deserialise_object is handed the file path, not the json text
            restored = deserialise_object(json_path)
            self.assertFloatEqual(restored.get_log_likelihood(), expected)
Ejemplo n.º 18
0
    def __init__(
        self,
        sm,
        tree=None,
        sm_args=None,
        gc=1,
        optimise_motif_probs=False,
        lf_args=None,
        opt_args=None,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model, if string must be available via get_model()
            (see cogent3.available_models).
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True). The provided dict is not modified.
        gc
            genetic code, either name or number (see cogent3.available_codes)
        optimise_motif_probs : bool
            If True, motif probabilities are free parameters. If False (default)
            they are estimated from the alignment.
        lf_args
            arguments to be passed to the likelihood function constructor
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore')
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Raises
        ------
        ValueError
            if sm is not a codon model
        TypeError
            if tree cannot be resolved to a TreeNode
        """
        super(natsel_neutral, self).__init__(
            input_types=(ALIGNED_TYPE, SERIALISABLE_TYPE),
            output_types=(RESULT_TYPE, HYPOTHESIS_RESULT_TYPE, SERIALISABLE_TYPE),
            data_types=("ArrayAlignment", "Alignment"),
        )
        self._formatted_params()
        if not is_codon_model(sm):
            raise ValueError(f"{sm} is not a codon model")

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        # instantiate model, ensuring genetic code setting passed on
        # a copy is used so the caller's dict is left untouched
        sm_args = dict(sm_args or {})
        sm_args.setdefault("gc", gc)
        sm_args["optimise_motif_probs"] = optimise_motif_probs
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)

        model_name = sm.name
        # defining the null model, omega is held constant at 1
        lf_args = lf_args or {}
        null = model(
            sm,
            tree,
            name=f"{model_name}-null",
            sm_args=sm_args,
            opt_args=opt_args,
            show_progress=show_progress,
            param_rules=[dict(par_name="omega", is_constant=True, value=1.0)],
            lf_args=lf_args,
            verbose=verbose,
        )

        # defining the alternate model
        alt = model(
            sm,
            tree,
            name=f"{model_name}-alt",
            sm_args=sm_args,
            opt_args=opt_args,
            show_progress=show_progress,
            lf_args=lf_args,
            verbose=verbose,
        )
        hyp = hypothesis(null, alt)

        self.func = hyp
Ejemplo n.º 19
0
    def __init__(self,
                 distance=None,
                 moltype=None,
                 fast_calc=None,
                 slow_calc=None):
        """
        Parameters
        ----------
        moltype : str
            cogent3 moltype
        distance : str
            Name of a distance method available as both fast and slow calculator.
        fast_calc
            Name of a fast distance calculator. See cogent3.available_distances().
        slow_calc
            Name of a slow distance calculator. See cogent3.available_models().

        Raises
        ------
        ValueError
            if distance is combined with fast_calc / slow_calc, if hamming,
            percent, paralinear or logdet is requested without a moltype, if
            neither calculator can be resolved, or if a resolved calculator
            has a moltype that differs from the one provided.

        Notes
        -----
        If you provide fast_calc or slow_calc, you must specify the moltype.
        """
        super(fast_slow_dist, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()
        self._moltype = moltype if moltype is None else get_moltype(moltype)
        self._sm = None

        if (fast_calc or slow_calc) and distance:
            raise ValueError("cannot combine distance and fast/slow")

        if distance:
            fast_calc = distance
            slow_calc = distance

        # keep what was asked for, the names below are rebound to the
        # resolved calculators (or None when resolution fails)
        requested_fast, requested_slow = fast_calc, slow_calc

        d = {"hamming", "percent", "paralinear", "logdet"
             } & {slow_calc, fast_calc}
        if d and not self._moltype:
            raise ValueError(f"you must provide a moltype for {d}")

        try:
            fast_calc = get_distance_calculator(fast_calc,
                                                moltype=self._moltype)
        except (ValueError, AttributeError):
            fast_calc = None

        try:
            slow_calc = get_model(slow_calc)
        except ValueError:
            slow_calc = None

        if not (fast_calc or slow_calc):
            # report the requested values, both resolved names are None here
            raise ValueError(
                f"invalid values for {requested_slow} or {requested_fast}")

        self.fast_calc = fast_calc
        if fast_calc and self._moltype and fast_calc.moltype != self._moltype:
            raise ValueError(
                f"{self._moltype} incompatible moltype with fast calculator {fast_calc.moltype}"
            )
        elif fast_calc:
            self._moltype = fast_calc.moltype

        if slow_calc and self._moltype and slow_calc.moltype != self._moltype:
            raise ValueError("incompatible moltype with slow calculator")
        elif slow_calc:
            self._moltype = slow_calc.moltype
        self._sm = slow_calc
        self.func = self.calc_distance
Ejemplo n.º 20
0
    def __init__(
        self,
        sm,
        tree=None,
        name=None,
        sm_args=None,
        lf_args=None,
        time_het=None,
        param_rules=None,
        opt_args=None,
        split_codons=False,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model if string must be available via get_model()
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        name
            name of the model
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True)
        lf_args
            arguments to be passed to the likelihood function constructor
        time_het
            'max' or a list of dicts corresponding to edge_sets, e.g.
            [dict(edges=['Human', 'Chimp'], is_independent=False, upper=10)].
            Passed to the likelihood function .set_time_heterogeneity()
            method.
        param_rules
            other parameter rules, passed to the likelihood function
            set_param_rule() method. Rules that are not constant and have no
            "upper" entry get an upper bound of 50. The provided rules are
            copied, not modified.
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore'). The provided dict is copied, not modified.
        split_codons : bool
            if True, incoming alignments are split into the 3 frames and each
            frame is fit separately
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Returns
        -------
        Calling an instance with an alignment returns a model_result instance
        with the optimised likelihood function. In the case of split_codons,
        the result object has a separate entry for each.
        """
        super(model, self).__init__(
            input_types=("aligned", "serialisable"),
            output_types=("result", "model_result", "serialisable"),
            data_types=("ArrayAlignment", "Alignment"),
        )
        self._verbose = verbose
        self._formatted_params()
        sm_args = sm_args or {}
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)
        self._sm = sm
        # splitting into frames is switched off when motifs are longer
        # than a single character
        if len(sm.get_motifs()[0]) > 1:
            split_codons = False

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        self._tree = tree
        self._lf_args = lf_args or {}
        if not name:
            name = sm.name or "unnamed model"
        self.name = name

        # copy so that adding show_progress does not leak into a dict the
        # caller may reuse for other instances
        if opt_args:
            opt_args = dict(opt_args)
        else:
            opt_args = dict(max_restarts=5, show_progress=show_progress)
        opt_args.setdefault("show_progress", show_progress)
        self._opt_args = opt_args

        if param_rules:
            # copy each rule, the default upper bound is written into it
            param_rules = [dict(rule) for rule in param_rules]
            for rule in param_rules:
                if rule.get("is_constant"):
                    continue
                rule.setdefault("upper", 50)  # default upper bound
        else:
            param_rules = {}
        self._param_rules = param_rules
        self._time_het = time_het
        self._split_codons = split_codons
        self.func = self.fit
Ejemplo n.º 21
0
    def __init__(
        self,
        sm,
        tree=None,
        sm_args=None,
        gc=1,
        optimise_motif_probs=False,
        upper_omega=20.0,
        lf_args=None,
        opt_args=None,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model, if string must be available via get_model()
            (see cogent3.available_models).
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True). The provided dict is not modified.
        gc
            genetic code, either name or number (see cogent3.available_codes)
        optimise_motif_probs : bool
            If True, motif probabilities are free parameters. If False (default)
            they are estimated from the alignment.
        upper_omega : float
            upper bound for positive selection omega
        lf_args
            arguments to be passed to the likelihood function constructor
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore')
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Raises
        ------
        ValueError
            if sm is not a codon model
        TypeError
            if tree cannot be resolved to a TreeNode
        """
        super(natsel_sitehet, self).__init__(
            input_types=("aligned", "serialisable"),
            output_types=("result", "hypothesis_result", "serialisable"),
            data_types=("ArrayAlignment", "Alignment"),
        )
        self._formatted_params()
        if not is_codon_model(sm):
            raise ValueError(f"{sm} is not a codon model")

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        # instantiate model, ensuring genetic code setting passed on
        # a copy is used so the caller's dict is left untouched
        sm_args = dict(sm_args or {})
        sm_args.setdefault("gc", gc)
        sm_args["optimise_motif_probs"] = optimise_motif_probs
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)

        model_name = sm.name
        # defining the null model: a "-ve" bin with omega bounded just
        # below 1 and a "neutral" bin with omega fixed at 1
        epsilon = 1e-6
        null_param_rules = [
            dict(par_name="omega",
                 bins="-ve",
                 upper=1 - epsilon,
                 init=1 - epsilon),
            dict(par_name="omega", bins="neutral", is_constant=True,
                 value=1.0),
        ]
        lf_args = lf_args or {}
        null_lf_args = lf_args.copy()
        null_lf_args.update(dict(bins=("-ve", "neutral")))
        self.null = model(
            sm,
            tree,
            name=f"{model_name}-null",
            sm_args=sm_args,
            param_rules=null_param_rules,
            lf_args=null_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
        )

        # defining the alternate model, param rules to be completed each call
        alt_lf_args = lf_args.copy()
        alt_lf_args.update(dict(bins=("-ve", "neutral", "+ve")))
        self.alt_args = dict(
            sm=sm,
            tree=tree,
            name=f"{model_name}-alt",
            sm_args=sm_args,
            lf_args=alt_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
            upper_omega=upper_omega,
        )

        self.func = self.test_hypothesis
Ejemplo n.º 22
0
    def __init__(
        self,
        sm,
        tree=None,
        sm_args=None,
        gc=1,
        optimise_motif_probs=False,
        tip1=None,
        tip2=None,
        outgroup=None,
        stem=False,
        clade=True,
        is_independent=False,
        lf_args=None,
        upper_omega=20,
        opt_args=None,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model, if string must be available via get_model()
            (see cogent3.available_models).
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True). The provided dict is not modified.
        gc
            genetic code, either name or number (see cogent3.available_codes)
        optimise_motif_probs : bool
            If True, motif probabilities are free parameters. If False (default)
            they are estimated from the alignment.
        tip1 : str
            name of tip 1
        tip2 : str
            name of tip 2
        outgroup : str
            name of tip outside clade of interest
        stem : bool
            include name of stem to clade defined by tip1, tip2, outgroup
        clade : bool
            include names of edges within clade defined by tip1, tip2, outgroup
        is_independent : bool
            if True, all edges specified by the scoping info get their own
            value of omega, if False, only a single omega
        lf_args
            arguments to be passed to the likelihood function constructor
        upper_omega : float
            upper bound for omega
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore')
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Raises
        ------
        ValueError
            if sm is not a codon model, or neither tip1 nor tip2 is given
        TypeError
            if tree cannot be resolved to a TreeNode
        """
        super(natsel_timehet, self).__init__(
            input_types=("aligned", "serialisable"),
            output_types=("result", "hypothesis_result", "serialisable"),
            data_types=("ArrayAlignment", "Alignment"),
        )
        self._formatted_params()
        if not is_codon_model(sm):
            raise ValueError(f"{sm} is not a codon model")

        if not any([tip1, tip2]):
            raise ValueError("must provide at least a single tip name")

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        # edges that get their own omega in the alternate model: looked up
        # on the tree when both tips and a tree are available, otherwise
        # just the tip names that were given
        if all([tip1, tip2]) and tree:
            edges = tree.get_edge_names(tip1,
                                        tip2,
                                        stem=stem,
                                        clade=clade,
                                        outgroup_name=outgroup)
        elif all([tip1, tip2]):
            edges = [tip1, tip2]
        elif tip1:
            edges = [tip1]
        elif tip2:
            edges = [tip2]

        assert edges, "No edges"

        # instantiate model, ensuring genetic code setting passed on
        # a copy is used so the caller's dict is left untouched
        sm_args = dict(sm_args or {})
        sm_args.setdefault("gc", gc)
        sm_args["optimise_motif_probs"] = optimise_motif_probs
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)

        model_name = sm.name
        # defining the null model
        lf_args = lf_args or {}
        null_lf_args = lf_args.copy()
        null = model(
            sm,
            tree,
            name=f"{model_name}-null",
            sm_args=sm_args,
            lf_args=null_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
        )

        # defining the alternate model
        param_rules = [
            dict(
                par_name="omega",
                edges=edges,
                upper=upper_omega,
                is_independent=is_independent,
            )
        ]
        alt = model(
            sm,
            tree,
            name=f"{model_name}-alt",
            sm_args=sm_args,
            opt_args=opt_args,
            show_progress=show_progress,
            param_rules=param_rules,
            lf_args=lf_args,
            verbose=verbose,
        )
        hyp = hypothesis(null, alt)

        self.func = hyp
Ejemplo n.º 23
0
    def __init__(
        self,
        model,
        gc=None,
        param_vals=None,
        guide_tree=None,
        unique_guides=False,
        indel_length=1e-1,
        indel_rate=1e-10,
        distance="percent",
    ):
        """Configure the progressive aligner.

        Parameters
        ----------
        model
            substitution model instance or name. If 'codon'
            (uses MG94HKY), 'nucleotide' (uses HKY85), 'protein'
            (uses JTT92). The 'codon' and 'nucleotide' choices also provide
            default settings for param_vals.
        gc : int or string
            the genetic code for a codon alignment, defaults to the standard
            genetic code
        param_vals : dict
            param name, values for parameters in model. Entries given here
            take precedence over the defaults implied by model.
        guide_tree
            newick string, tree instance (must have branch lengths), or a
            callable that will build a tree from unaligned collection. If not
            provided, estimated ONCE via constructing a crude alignment. In the
            case of callable, or not provided, the computed guide tree is stored
            in the returned alignment.info['guide_tree'].
        unique_guides : bool
            whether each alignment requires a new guide tree
        indel_rate : float
            probability of gap insertion
        indel_length : float
            probability of gap extension
        distance : string
            the distance measure for building a guide tree. Default is 'percent',
            the proportion of differences. This is applicable for any moltype,
            and sequences with very high percent identity. For more diverged
            sequences we recommend 'paralinear'.

        Raises
        ------
        ValueError
            if guide_tree is a newick string or tree instance whose first
            child has no branch length
        """
        super(progressive_align, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )

        # translate the convenience names into actual model names; anything
        # else (a model name or instance) is passed through unchanged
        sm = {"codon": "MG94HKY", "nucleotide": "HKY85", "protein": "JTT92"}.get(
            model, model
        )
        # NOTE: called before any argument is rebound and before further
        # locals are created, matching the original call position
        self._formatted_params()

        # default parameter values are only defined for the convenience names
        default_vals = {
            "codon": dict(omega=0.4, kappa=3),
            "nucleotide": dict(kappa=3),
        }.get(model)
        if default_vals is None:
            self._param_vals = param_vals
        elif param_vals:
            # user supplied values override the defaults, as documented
            self._param_vals = {**default_vals, **param_vals}
        else:
            self._param_vals = default_vals

        kwargs = {} if gc is None else dict(gc=gc)
        sm = get_model(sm, **kwargs)
        moltype = sm.alphabet.moltype
        self._model = sm
        self._scalar = sm.word_length
        self._indel_length = indel_length
        self._indel_rate = indel_rate
        self._moltype = moltype
        self._unique_guides = unique_guides
        self._distance = distance
        if callable(guide_tree):
            self._make_tree = guide_tree
            guide_tree = None  # callback takes precedence
        else:
            # compose the default guide tree builder: crude alignment,
            # then pairwise distances, then a quick tree
            al_to_ref = align_to_ref(moltype=self._moltype)
            dist_calc = dist.fast_slow_dist(
                distance=self._distance, moltype=self._moltype
            )
            est_tree = quick_tree()
            self._make_tree = al_to_ref + dist_calc + est_tree

        if guide_tree is not None:
            if isinstance(guide_tree, str):
                guide_tree = make_tree(treestring=guide_tree, underscore_unmunge=True)
            # applies to tree instances too, not just parsed newick strings
            if guide_tree.children[0].length is None:
                raise ValueError("Guide tree must have branch lengths")
            # make sure no zero lengths
            guide_tree = scale_branches()(guide_tree)

        self._guide_tree = guide_tree
        self._kwargs = dict(
            indel_length=self._indel_length,
            indel_rate=self._indel_rate,
            tree=self._guide_tree,
            param_vals=self._param_vals,
            show_progress=False,
        )

        self.func = self.multiple_align
Ejemplo n.º 24
0
    def __init__(
        self,
        sm,
        tree=None,
        sm_args=None,
        gc=1,
        optimise_motif_probs=False,
        tip1=None,
        tip2=None,
        outgroup=None,
        stem=False,
        clade=True,
        lf_args=None,
        upper_omega=20,
        opt_args=None,
        show_progress=False,
        verbose=False,
    ):
        """Configure the null model and the arguments for the alternate model.

        Parameters
        ----------
        sm : str or instance
            substitution model, if string must be available via get_model()
            (see cogent3.available_models).
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True). The provided dict is copied, not
            modified.
        gc
            genetic code, either name or number (see cogent3.available_codes).
            Only used if sm_args does not already contain a 'gc' entry.
        optimise_motif_probs : bool
            If True, motif probabilities are free parameters. If False (default)
            they are estimated from the alignment.
        tip1 : str
            name of tip 1
        tip2 : str
            name of tip 2
        outgroup : str
            name of tip outside clade of interest
        stem : bool
            include name of stem to clade defined by tip1, tip2, outgroup
        clade : bool
            include names of edges within clade defined by tip1, tip2, outgroup
        lf_args
            arguments to be passed to the likelihood function constructor
        upper_omega : float
            upper bound for positive selection omega
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore')
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Raises
        ------
        ValueError
            if sm is not a codon model, or neither tip1 nor tip2 is provided
        TypeError
            if tree is provided but does not resolve to a TreeNode instance

        Notes
        -----
        The scoping parameters (tip1, tip2, outgroup, stem, clade) define the
        foreground edges.
        """
        super(natsel_zhang, self).__init__(
            input_types=(ALIGNED_TYPE, SERIALISABLE_TYPE),
            output_types=(RESULT_TYPE, HYPOTHESIS_RESULT_TYPE, SERIALISABLE_TYPE),
            data_types=("ArrayAlignment", "Alignment"),
        )
        self._formatted_params()
        if not is_codon_model(sm):
            raise ValueError(f"{sm} is not a codon model")

        if not any([tip1, tip2]):
            raise ValueError("must provide at least a single tip name")

        # a string is either a path to a tree file or a newick tree
        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        # at least one tip name is set (checked above), so exactly one of
        # these branches assigns the foreground edges
        if all([tip1, tip2]) and tree:
            edges = tree.get_edge_names(
                tip1, tip2, stem=stem, clade=clade, outgroup_name=outgroup
            )
        elif all([tip1, tip2]):
            edges = [tip1, tip2]
        elif tip1:
            edges = [tip1]
        elif tip2:
            edges = [tip2]

        assert edges, "No edges"

        # instantiate model, ensuring genetic code setting passed on;
        # work on a copy so the caller's dict is not modified
        sm_args = dict(sm_args or {})
        sm_args["gc"] = sm_args.get("gc", gc)
        sm_args["optimise_motif_probs"] = optimise_motif_probs
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)

        model_name = sm.name
        # defining the null model: bin "0" has omega bounded just below 1,
        # bin "1" has omega held constant at 1
        epsilon = 1e-6
        null_param_rules = [
            dict(par_name="omega", bins="0", upper=1 - epsilon, init=1 - epsilon),
            dict(par_name="omega", bins="1", is_constant=True, value=1.0),
        ]
        lf_args = lf_args or {}
        null_lf_args = lf_args.copy()
        null_lf_args.update(dict(bins=("0", "1")))
        self.null = model(
            sm,
            tree,
            name=f"{model_name}-null",
            sm_args=sm_args,
            param_rules=null_param_rules,
            lf_args=null_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
        )

        # defining the alternate model, param rules to be completed each call
        alt_lf_args = lf_args.copy()
        alt_lf_args.update(dict(bins=("0", "1", "2a", "2b")))
        self.alt_args = dict(
            sm=sm,
            tree=tree,
            name=f"{model_name}-alt",
            sm_args=sm_args,
            edges=edges,
            lf_args=alt_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
            upper_omega=upper_omega,
        )

        self.func = self.test_hypothesis