Beispiel #1
0
    def test_get_motif_probs_by_node_mg94(self):
        """handles different statespace dimensions from process and stationary distribution"""
        from cogent3.evolve.models import get_model

        aln = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
        aln = aln.no_degenerates(motif_length=3)

        tree = load_tree("data/primates_brca1.tree")
        # the same (number of edges, 61) shape is asserted for both codon fits
        expected_shape = (len(tree.get_edge_vector()), 61)

        # root mprobs are constant
        sm = get_model("MG94HKY")
        lf = sm.make_likelihood_function(tree)
        lf.set_alignment(aln)
        # result is not inspected; an exception here fails the test
        _ = lf.get_motif_probs()

        mprobs = lf.get_motif_probs_by_node()
        self.assertEqual(mprobs.shape, expected_shape)

        # root mprobs are variable; this model must not be replaced by a
        # default one, otherwise this section repeats the constant case
        sm = get_model("MG94HKY", optimise_motif_probs=True)
        lf = sm.make_likelihood_function(tree)
        lf.set_alignment(aln)
        mprobs = lf.get_motif_probs_by_node()
        self.assertEqual(mprobs.shape, expected_shape)

        # not implemented for monomers variant
        sm = TimeReversibleCodon(
            mprob_model="monomers", model_gaps=False, recode_gaps=True
        )
        lf = sm.make_likelihood_function(tree)
        lf.set_alignment(aln)
        with self.assertRaises(NotImplementedError):
            _ = lf.get_motif_probs_by_node()
Beispiel #2
0
    def test_species_tree(self):
        """should match the one used by ensembl"""
        comp = Compara(
            ["human", "rat", "dog", "platypus"],
            release=ENSEMBL_RELEASE,
            account=account,
        )

        # sub-tree should have correct species
        sub_species = comp.get_species_tree(just_members=True)
        self.assertEqual(
            set(sub_species.get_tip_names()),
            {
                "Homo sapiens",
                "Rattus norvegicus",
                "Canis lupus familiaris",
                "Ornithorhynchus anatinus",
            },
        )
        # topology should match current topology belief
        expect = make_tree(
            treestring="(((Homo_sapiens,Rattus_norvegicus),"
            "Canis_lupus_familiaris),Ornithorhynchus_anatinus)",
            underscore_unmunge=True,
        )
        self.assertTrue(sub_species.same_topology(expect))

        # returned full tree should match download from ensembl
        # but taxon names are customised in what they put up on
        # the web-site, so need a better test.
        sptree = comp.get_species_tree(just_members=False)
        expect = load_tree("data/ensembl_all_species.nh", underscore_unmunge=True)
        # assertGreater reports both tip counts when the check fails
        self.assertGreater(
            len(sptree.get_tip_names()), len(expect.get_tip_names())
        )
Beispiel #3
0
 def test_gene_tree(self):
     """tree of the HBB paralogs should agree with the tree saved from ensembl web"""
     gene = self.comp.Human.get_gene_by_stableid("ENSG00000244734")
     related = self.comp.get_related_genes(
         gene_region=gene, relationship="within_species_paralog"
     )
     # only the first returned entry is compared
     got = list(related)[0].get_tree()
     reference = load_tree("data/HBB_gene_tree.nh")
     # restrict the saved tree to the tips of the retrieved tree
     reference = reference.get_sub_tree(got.get_tip_names(), ignore_missing=True)
     self.assertTrue(reference.same_topology(got))
Beispiel #4
0
    def test_roundtrip_het_lf(self):
        """a site-het likelihood function has the same lnL after a json round trip"""
        with open("data/site-het-param-rules.json") as rules_file:
            all_rules = json.load(rules_file)

        alignment = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
        phylo = load_tree("data/primates_brca1.tree")
        selected = all_rules.pop("phylohmm-gamma-kappa")
        sub_model = get_model("HKY85", ordered_param="rate", distribution="gamma")
        original = sub_model.make_likelihood_function(
            phylo, bins=4, sites_independent=False
        )
        original.set_alignment(alignment)
        original.apply_param_rules(selected["rules"])
        # serialise, rebuild, then compare the log-likelihoods
        restored = deserialise_object(original.to_json())
        assert_allclose(original.lnL, restored.lnL)
Beispiel #5
0
    TimeReversibleNucleotide,
)
from cogent3.maths import optimisers
from cogent3.util import parallel

# Module metadata.
__author__ = "Peter Maxwell and  Gavin Huttley"
__copyright__ = "Copyright 2007-2019, The Cogent Project"
__credits__ = ["Peter Maxwell", "Gavin Huttley"]
__license__ = "BSD-3"
__version__ = "2019.9.13a"
__maintainer__ = "Gavin Huttley"
__email__ = "*****@*****.**"
__status__ = "Production"

# Shared inputs, loaded once when the module is imported and reused by
# subtree() and brca_test(). The paths are relative, so they resolve against
# the current working directory.
ALIGNMENT = load_aligned_seqs(filename="data/brca1.fasta")
TREE = load_tree(filename="data/murphy.tree")


def subtree(size):
    """Return the first ``size`` alignment names and the sub-tree of TREE for them.

    Raises AssertionError when ALIGNMENT has fewer than ``size`` names.
    """
    selected = ALIGNMENT.names[:size]
    assert size == len(selected)
    return selected, TREE.get_sub_tree(selected)  # .balanced()


def brca_test(subMod, names, tree, length, par_rules, **kw):
    """Build the sub-tree and the fixed-length alignment for ``names``.

    NOTE(review): the visible lines never use ``subMod``, ``par_rules`` or
    ``kw`` and return nothing, so this looks like only the setup part of a
    longer function -- confirm against the full file.
    """
    # names = ALIGNMENT.names[:taxa]
    # assert len(names) == taxa
    # the ``tree`` argument is discarded: it is rebuilt here from TREE
    tree = TREE.get_sub_tree(names)  # .balanced()
    # select the named sequences, apply omit_gap_pos(), keep the first
    # ``length`` positions
    aln = ALIGNMENT.take_seqs(names).omit_gap_pos()[:length]
    # fails when fewer than ``length`` positions remain
    assert len(aln) == length, (len(aln), length)
Beispiel #6
0
    def gettree(self):
        """Load murphy.tree from data_path and return its sub-tree for seqnames."""
        tree_path = os.path.join(data_path, "murphy.tree")
        return load_tree(filename=tree_path).get_sub_tree(seqnames)
Beispiel #7
0
 def test_get_tree_get_splits(self):
     """get_tree(get_splits(tree)) should have the same topology as tree"""
     murphy = load_tree(os.path.join(data_path, "murphy.tree"))
     rebuilt = get_tree(get_splits(murphy))
     self.assertTrue(murphy.same_topology(rebuilt))
Beispiel #8
0
    def __init__(
        self,
        sm,
        tree=None,
        sm_args=None,
        gc=1,
        optimise_motif_probs=False,
        tip1=None,
        tip2=None,
        outgroup=None,
        stem=False,
        clade=True,
        is_independent=False,
        lf_args=None,
        upper_omega=20,
        opt_args=None,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model, if string must be available via get_model()
            (see cogent3.available_models).
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True). The dict is copied, not modified.
        gc
            genetic code, either name or number (see cogent3.available_codes).
            Only used when sm_args has no "gc" entry.
        optimise_motif_probs : bool
            If True, motif probabilities are free parameters. If False (default)
            they are estimated from the alignment. This value always replaces
            any "optimise_motif_probs" entry in sm_args.
        tip1 : str
            name of tip 1
        tip2 : str
            name of tip 2
        outgroup : str
            name of tip outside clade of interest
        stem : bool
            include name of stem to clade defined by tip1, tip2, outgroup
        clade : bool
            include names of edges within clade defined by tip1, tip2, outgroup
        is_independent : bool
            if True, all edges specified by the scoping info get their own
            value of omega, if False, only a single omega
        lf_args
            arguments to be passed to the likelihood function constructor
        upper_omega : float
            upper bound for omega
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore')
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Raises
        ------
        ValueError
            if sm is not a codon model, or neither tip1 nor tip2 is given
        TypeError
            if tree is not a path, newick string or TreeNode instance
        """
        super(natsel_timehet, self).__init__(
            input_types=("aligned", "serialisable"),
            output_types=("result", "hypothesis_result", "serialisable"),
            data_types=("ArrayAlignment", "Alignment"),
        )
        # NOTE(review): presumably records the constructor arguments; it is
        # kept ahead of the rebinding of tree / sm / sm_args below -- confirm
        self._formatted_params()
        if not is_codon_model(sm):
            raise ValueError(f"{sm} is not a codon model")

        if not any([tip1, tip2]):
            raise ValueError("must provide at least a single tip name")

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        if all([tip1, tip2]) and tree:
            edges = tree.get_edge_names(
                tip1, tip2, stem=stem, clade=clade, outgroup_name=outgroup
            )
        elif all([tip1, tip2]):
            edges = [tip1, tip2]
        else:
            # exactly one tip name was given (checked by any() above)
            edges = [tip1 or tip2]

        assert edges, "No edges"

        # instantiate model, ensuring genetic code setting passed on;
        # work on a copy so the caller's dict is left untouched
        sm_args = dict(sm_args) if sm_args else {}
        sm_args["gc"] = sm_args.get("gc", gc)
        sm_args["optimise_motif_probs"] = optimise_motif_probs
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)

        model_name = sm.name
        # defining the null model
        lf_args = lf_args or {}
        null_lf_args = lf_args.copy()
        null = model(
            sm,
            tree,
            name=f"{model_name}-null",
            sm_args=sm_args,
            lf_args=null_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
        )

        # defining the alternate model: omega is scoped to the selected edges
        param_rules = [
            dict(
                par_name="omega",
                edges=edges,
                upper=upper_omega,
                is_independent=is_independent,
            )
        ]
        alt = model(
            sm,
            tree,
            name=f"{model_name}-alt",
            sm_args=sm_args,
            opt_args=opt_args,
            show_progress=show_progress,
            param_rules=param_rules,
            lf_args=lf_args,
            verbose=verbose,
        )
        hyp = hypothesis(null, alt)

        self.func = hyp
Beispiel #9
0
    def __init__(
        self,
        sm,
        tree=None,
        sm_args=None,
        gc=1,
        optimise_motif_probs=False,
        upper_omega=20.0,
        lf_args=None,
        opt_args=None,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model, if string must be available via get_model()
            (see cogent3.available_models).
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True). The dict is copied, not modified.
        gc
            genetic code, either name or number (see cogent3.available_codes).
            Only used when sm_args has no "gc" entry.
        optimise_motif_probs : bool
            If True, motif probabilities are free parameters. If False (default)
            they are estimated from the alignment. This value always replaces
            any "optimise_motif_probs" entry in sm_args.
        upper_omega : float
            upper bound for positive selection omega
        lf_args
            arguments to be passed to the likelihood function constructor
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore')
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Raises
        ------
        ValueError
            if sm is not a codon model
        TypeError
            if tree is not a path, newick string or TreeNode instance
        """
        super(natsel_sitehet, self).__init__(
            input_types=("aligned", "serialisable"),
            output_types=("result", "hypothesis_result", "serialisable"),
            data_types=("ArrayAlignment", "Alignment"),
        )
        # NOTE(review): presumably records the constructor arguments; it is
        # kept ahead of the rebinding of tree / sm / sm_args below -- confirm
        self._formatted_params()
        if not is_codon_model(sm):
            raise ValueError(f"{sm} is not a codon model")

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        # instantiate model, ensuring genetic code setting passed on;
        # work on a copy so the caller's dict is left untouched
        sm_args = dict(sm_args) if sm_args else {}
        sm_args["gc"] = sm_args.get("gc", gc)
        sm_args["optimise_motif_probs"] = optimise_motif_probs
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)

        model_name = sm.name
        # defining the null model: two omega bins, one bounded just below 1
        # and one held constant at 1
        epsilon = 1e-6
        null_param_rules = [
            dict(
                par_name="omega",
                bins="-ve",
                upper=1 - epsilon,
                init=1 - epsilon,
            ),
            dict(par_name="omega", bins="neutral", is_constant=True, value=1.0),
        ]
        lf_args = lf_args or {}
        null_lf_args = lf_args.copy()
        null_lf_args.update(dict(bins=("-ve", "neutral")))
        self.null = model(
            sm,
            tree,
            name=f"{model_name}-null",
            sm_args=sm_args,
            param_rules=null_param_rules,
            lf_args=null_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
        )

        # defining the alternate model, param rules to be completed each call
        alt_lf_args = lf_args.copy()
        alt_lf_args.update(dict(bins=("-ve", "neutral", "+ve")))
        self.alt_args = dict(
            sm=sm,
            tree=tree,
            name=f"{model_name}-alt",
            sm_args=sm_args,
            lf_args=alt_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
            upper_omega=upper_omega,
        )

        self.func = self.test_hypothesis
Beispiel #10
0
    def __init__(
        self,
        sm,
        tree=None,
        name=None,
        sm_args=None,
        lf_args=None,
        time_het=None,
        param_rules=None,
        opt_args=None,
        split_codons=False,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model if string must be available via get_model()
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        name
            name of the model, defaults to the substitution model name or
            "unnamed model"
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True)
        lf_args
            arguments to be passed to the likelihood function constructor
        time_het
            'max' or a list of dicts corresponding to edge_sets, e.g.
            [dict(edges=['Human', 'Chimp'], is_independent=False, upper=10)].
            Passed to the likelihood function .set_time_heterogeneity()
            method.
        param_rules
            other parameter rules, passed to the likelihood function
            set_param_rule() method. Rules that are not constant and have no
            "upper" entry get an upper bound of 50. The rules are copied, the
            caller's dicts are not modified.
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore'). Defaults to dict(max_restarts=5). The dict
            is copied, not modified.
        split_codons : bool
            if True, incoming alignments are split into the 3 frames and each
            frame is fit separately. Forced to False when the model's motifs
            are longer than one character.
        show_progress : bool
            show progress bars during numerical optimisation, used only when
            opt_args has no "show_progress" entry
        verbose : bool
            prints intermediate states to screen during fitting

        Returns
        -------
        Calling an instance with an alignment returns a model_result instance
        with the optimised likelihood function. In the case of split_codons,
        the result object has a separate entry for each.

        Raises
        ------
        TypeError
            if tree is not a path, newick string or TreeNode instance
        """
        super(model, self).__init__(
            input_types=("aligned", "serialisable"),
            output_types=("result", "model_result", "serialisable"),
            data_types=("ArrayAlignment", "Alignment"),
        )
        self._verbose = verbose
        # NOTE(review): presumably records the constructor arguments; it is
        # kept ahead of the rebinding of the arguments below -- confirm
        self._formatted_params()
        sm_args = sm_args or {}
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)
        self._sm = sm
        # splitting by codon position only applies to single-character motifs
        if len(sm.get_motifs()[0]) > 1:
            split_codons = False

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        self._tree = tree
        self._lf_args = lf_args or {}
        if not name:
            name = sm.name or "unnamed model"
        self.name = name

        # copy so the "show_progress" entry is not written into the caller's dict
        opt_args = dict(opt_args) if opt_args else dict(max_restarts=5)
        opt_args.setdefault("show_progress", show_progress)
        self._opt_args = opt_args

        if param_rules:
            # copy each rule so the default bound is not written into the
            # caller's dicts
            param_rules = [dict(rule) for rule in param_rules]
            for rule in param_rules:
                if not rule.get("is_constant"):
                    rule.setdefault("upper", 50)  # default upper bound
        else:
            # an empty dict is kept as the "no rules" value, as before
            param_rules = {}
        self._param_rules = param_rules
        self._time_het = time_het
        self._split_codons = split_codons
        self.func = self.fit
Beispiel #11
0
    def __init__(
        self,
        sm,
        tree=None,
        sm_args=None,
        gc=1,
        optimise_motif_probs=False,
        tip1=None,
        tip2=None,
        outgroup=None,
        stem=False,
        clade=True,
        lf_args=None,
        upper_omega=20,
        opt_args=None,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model, if string must be available via get_model()
            (see cogent3.available_models).
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True). The dict is copied, not modified.
        gc
            genetic code, either name or number (see cogent3.available_codes).
            Only used when sm_args has no "gc" entry.
        optimise_motif_probs : bool
            If True, motif probabilities are free parameters. If False (default)
            they are estimated from the alignment. This value always replaces
            any "optimise_motif_probs" entry in sm_args.
        tip1 : str
            name of tip 1
        tip2 : str
            name of tip 2
        outgroup : str
            name of tip outside clade of interest
        stem : bool
            include name of stem to clade defined by tip1, tip2, outgroup
        clade : bool
            include names of edges within clade defined by tip1, tip2, outgroup
        lf_args
            arguments to be passed to the likelihood function constructor
        upper_omega : float
            upper bound for positive selection omega
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore')
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Raises
        ------
        ValueError
            if sm is not a codon model, or neither tip1 nor tip2 is given
        TypeError
            if tree is not a path, newick string or TreeNode instance

        Notes
        -----
        The scoping parameters (tip1, tip2, outgroup, stem, clade) define the
        foreground edges.
        """
        super(natsel_zhang, self).__init__(
            input_types=(ALIGNED_TYPE, SERIALISABLE_TYPE),
            output_types=(RESULT_TYPE, HYPOTHESIS_RESULT_TYPE, SERIALISABLE_TYPE),
            data_types=("ArrayAlignment", "Alignment"),
        )
        # NOTE(review): presumably records the constructor arguments; it is
        # kept ahead of the rebinding of tree / sm / sm_args below -- confirm
        self._formatted_params()
        if not is_codon_model(sm):
            raise ValueError(f"{sm} is not a codon model")

        if not any([tip1, tip2]):
            raise ValueError("must provide at least a single tip name")

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        if all([tip1, tip2]) and tree:
            edges = tree.get_edge_names(
                tip1, tip2, stem=stem, clade=clade, outgroup_name=outgroup
            )
        elif all([tip1, tip2]):
            edges = [tip1, tip2]
        else:
            # exactly one tip name was given (checked by any() above)
            edges = [tip1 or tip2]

        assert edges, "No edges"

        # instantiate model, ensuring genetic code setting passed on;
        # work on a copy so the caller's dict is left untouched
        sm_args = dict(sm_args) if sm_args else {}
        sm_args["gc"] = sm_args.get("gc", gc)
        sm_args["optimise_motif_probs"] = optimise_motif_probs
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)

        model_name = sm.name
        # defining the null model: bin "0" bounded just below 1, bin "1"
        # held constant at 1
        epsilon = 1e-6
        null_param_rules = [
            dict(par_name="omega", bins="0", upper=1 - epsilon, init=1 - epsilon),
            dict(par_name="omega", bins="1", is_constant=True, value=1.0),
        ]
        lf_args = lf_args or {}
        null_lf_args = lf_args.copy()
        null_lf_args.update(dict(bins=("0", "1")))
        self.null = model(
            sm,
            tree,
            name=f"{model_name}-null",
            sm_args=sm_args,
            param_rules=null_param_rules,
            lf_args=null_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
        )

        # defining the alternate model, param rules to be completed each call
        alt_lf_args = lf_args.copy()
        alt_lf_args.update(dict(bins=("0", "1", "2a", "2b")))
        self.alt_args = dict(
            sm=sm,
            tree=tree,
            name=f"{model_name}-alt",
            sm_args=sm_args,
            edges=edges,
            lf_args=alt_lf_args,
            opt_args=opt_args,
            show_progress=show_progress,
            verbose=verbose,
            upper_omega=upper_omega,
        )

        self.func = self.test_hypothesis
Beispiel #12
0
    def __init__(
        self,
        sm,
        tree=None,
        sm_args=None,
        gc=1,
        optimise_motif_probs=False,
        lf_args=None,
        opt_args=None,
        show_progress=False,
        verbose=False,
    ):
        """
        Parameters
        ----------
        sm : str or instance
            substitution model, if string must be available via get_model()
            (see cogent3.available_models).
        tree
            if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
            newick formatted tree, a path to a file containing one, or a Tree
            instance.
        sm_args
            arguments to be passed to the substitution model constructor, e.g.
            dict(optimise_motif_probs=True). The dict is copied, not modified.
        gc
            genetic code, either name or number (see cogent3.available_codes).
            Only used when sm_args has no "gc" entry.
        optimise_motif_probs : bool
            If True, motif probabilities are free parameters. If False (default)
            they are estimated from the alignment. This value always replaces
            any "optimise_motif_probs" entry in sm_args.
        lf_args
            arguments to be passed to the likelihood function constructor
        opt_args
            arguments for the numerical optimiser, e.g.
            dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
            limit_action='ignore')
        show_progress : bool
            show progress bars during numerical optimisation
        verbose : bool
            prints intermediate states to screen during fitting

        Raises
        ------
        ValueError
            if sm is not a codon model
        TypeError
            if tree is not a path, newick string or TreeNode instance
        """
        super(natsel_neutral, self).__init__(
            input_types=(ALIGNED_TYPE, SERIALISABLE_TYPE),
            output_types=(RESULT_TYPE, HYPOTHESIS_RESULT_TYPE, SERIALISABLE_TYPE),
            data_types=("ArrayAlignment", "Alignment"),
        )
        # NOTE(review): presumably records the constructor arguments; it is
        # kept ahead of the rebinding of tree / sm / sm_args below -- confirm
        self._formatted_params()
        if not is_codon_model(sm):
            raise ValueError(f"{sm} is not a codon model")

        if misc.path_exists(tree):
            tree = load_tree(filename=tree, underscore_unmunge=True)
        elif isinstance(tree, str):
            tree = make_tree(treestring=tree, underscore_unmunge=True)

        if tree and not isinstance(tree, TreeNode):
            raise TypeError(f"invalid tree type {type(tree)}")

        # instantiate model, ensuring genetic code setting passed on;
        # work on a copy so the caller's dict is left untouched
        sm_args = dict(sm_args) if sm_args else {}
        sm_args["gc"] = sm_args.get("gc", gc)
        sm_args["optimise_motif_probs"] = optimise_motif_probs
        if isinstance(sm, str):
            sm = get_model(sm, **sm_args)

        model_name = sm.name
        # defining the null model: omega held constant at 1
        lf_args = lf_args or {}
        null = model(
            sm,
            tree,
            name=f"{model_name}-null",
            sm_args=sm_args,
            opt_args=opt_args,
            show_progress=show_progress,
            param_rules=[dict(par_name="omega", is_constant=True, value=1.0)],
            lf_args=lf_args,
            verbose=verbose,
        )

        # defining the alternate model: no extra parameter rules
        alt = model(
            sm,
            tree,
            name=f"{model_name}-alt",
            sm_args=sm_args,
            opt_args=opt_args,
            show_progress=show_progress,
            lf_args=lf_args,
            verbose=verbose,
        )
        hyp = hypothesis(null, alt)

        self.func = hyp