def test_get_motif_probs_by_node_mg94(self):
    """handles different statespace dimensions from process and stationary distribution"""
    from cogent3.evolve.models import get_model

    aln = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
    aln = aln.no_degenerates(motif_length=3)

    tree = load_tree("data/primates_brca1.tree")
    # one row per edge of the tree, one column per state of the process
    expected_shape = (len(tree.get_edge_vector()), 61)

    # root mprobs are constant
    sm = get_model("MG94HKY")
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    # only checks the call succeeds, the returned value is not inspected
    lf.get_motif_probs()

    mprobs = lf.get_motif_probs_by_node()
    self.assertEqual(mprobs.shape, expected_shape)

    # root mprobs are variable
    # (previously this model was immediately replaced by a second
    # get_model("MG94HKY") call, so the variable case was never exercised)
    sm = get_model("MG94HKY", optimise_motif_probs=True)
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    mprobs = lf.get_motif_probs_by_node()
    self.assertEqual(mprobs.shape, expected_shape)

    # not implemented for monomers variant
    sm = TimeReversibleCodon(
        mprob_model="monomers", model_gaps=False, recode_gaps=True
    )
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    with self.assertRaises(NotImplementedError):
        _ = lf.get_motif_probs_by_node()
def test_species_tree(self):
    """should match the one used by ensembl"""
    comp = Compara(
        ["human", "rat", "dog", "platypus"],
        release=ENSEMBL_RELEASE,
        account=account,
    )
    # sub-tree should have correct species
    sub_species = comp.get_species_tree(just_members=True)
    self.assertEqual(
        set(sub_species.get_tip_names()),
        {
            # this entry had been garbled to "H**o sapiens", which can never
            # equal a tip name of the species tree
            "Homo sapiens",
            "Rattus norvegicus",
            "Canis lupus familiaris",
            "Ornithorhynchus anatinus",
        },
    )
    # topology should match current topology belief
    expect = make_tree(
        treestring="(((Homo_sapiens,Rattus_norvegicus),"
        "Canis_lupus_familiaris),Ornithorhynchus_anatinus)",
        underscore_unmunge=True,
    )
    self.assertTrue(sub_species.same_topology(expect))

    # returned full tree should match download from ensembl
    # but taxon names are customised in what they put up on
    # the web-site, so need a better test.
    sptree = comp.get_species_tree(just_members=False)
    expect = load_tree("data/ensembl_all_species.nh", underscore_unmunge=True)
    self.assertTrue(len(sptree.get_tip_names()) > len(expect.get_tip_names()))
def test_gene_tree(self):
    """topology of the HBB gene tree agrees with the copy downloaded from ensembl"""
    human_hbb = self.comp.Human.get_gene_by_stableid("ENSG00000244734")
    paralogs = self.comp.get_related_genes(
        gene_region=human_hbb, relationship="within_species_paralog"
    )
    # the tree is taken from the first group of related genes
    first_group = list(paralogs)[0]
    gene_tree = first_group.get_tree()

    # prune the stored tree down to the tips present in the queried tree
    reference = load_tree("data/HBB_gene_tree.nh")
    reference = reference.get_sub_tree(
        gene_tree.get_tip_names(), ignore_missing=True
    )
    self.assertTrue(reference.same_topology(gene_tree))
def test_roundtrip_het_lf(self):
    """a site-het likelihood function survives a json round trip"""
    with open("data/site-het-param-rules.json") as rules_file:
        all_rules = json.load(rules_file)

    alignment = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
    phylogeny = load_tree("data/primates_brca1.tree")
    hmm_rules = all_rules.pop("phylohmm-gamma-kappa")

    submod = get_model("HKY85", ordered_param="rate", distribution="gamma")
    original_lf = submod.make_likelihood_function(
        phylogeny, bins=4, sites_independent=False
    )
    original_lf.set_alignment(alignment)
    original_lf.apply_param_rules(hmm_rules["rules"])

    # serialise, rebuild and compare the log-likelihoods
    restored_lf = deserialise_object(original_lf.to_json())
    assert_allclose(original_lf.lnL, restored_lf.lnL)
TimeReversibleNucleotide,
)
from cogent3.maths import optimisers
from cogent3.util import parallel


__author__ = "Peter Maxwell and Gavin Huttley"
__copyright__ = "Copyright 2007-2019, The Cogent Project"
__credits__ = ["Peter Maxwell", "Gavin Huttley"]
__license__ = "BSD-3"
__version__ = "2019.9.13a"
__maintainer__ = "Gavin Huttley"
__email__ = "*****@*****.**"
__status__ = "Production"

# module level data, loaded once at import and shared by the functions below
ALIGNMENT = load_aligned_seqs(filename="data/brca1.fasta")
TREE = load_tree(filename="data/murphy.tree")


def subtree(size):
    """Return the first ``size`` sequence names of ALIGNMENT and the
    sub-tree of TREE spanning those names.

    Asserts that ALIGNMENT actually holds at least ``size`` names.
    """
    names = ALIGNMENT.names[:size]
    assert len(names) == size
    tree = TREE.get_sub_tree(names)  # .balanced()
    return names, tree


def brca_test(subMod, names, tree, length, par_rules, **kw):
    # names = ALIGNMENT.names[:taxa]
    # assert len(names) == taxa
    # the ``tree`` argument is overwritten here by a sub-tree of the
    # module level TREE for ``names``
    tree = TREE.get_sub_tree(names)  # .balanced()
    # drop gapped positions, then keep the first ``length`` positions
    aln = ALIGNMENT.take_seqs(names).omit_gap_pos()[:length]
    # fail, reporting (actual, requested), if the alignment is too short
    assert len(aln) == length, (len(aln), length)
def gettree(self):
    """Return the murphy tree reduced to the tips named in ``seqnames``."""
    tree_path = os.path.join(data_path, "murphy.tree")
    full_tree = load_tree(filename=tree_path)
    return full_tree.get_sub_tree(seqnames)
def test_get_tree_get_splits(self):
    """a tree rebuilt from its own splits has the same topology"""
    tree_path = os.path.join(data_path, "murphy.tree")
    original = load_tree(tree_path)
    rebuilt = get_tree(get_splits(original))
    self.assertTrue(original.same_topology(rebuilt))
def __init__(
    self,
    sm,
    tree=None,
    sm_args=None,
    gc=1,
    optimise_motif_probs=False,
    tip1=None,
    tip2=None,
    outgroup=None,
    stem=False,
    clade=True,
    is_independent=False,
    lf_args=None,
    upper_omega=20,
    opt_args=None,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model, if string must be available via get_model()
        (see cogent3.available_models).
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(recode_gaps=True). Only used when sm is a string. The provided
        dict is not modified. A 'gc' entry takes precedence over the gc
        argument, while an 'optimise_motif_probs' entry is replaced by the
        optimise_motif_probs argument.
    gc
        genetic code, either name or number (see cogent3.available_codes)
    optimise_motif_probs : bool
        If True, motif probabilities are free parameters. If False (default)
        they are estimated from the alignment.
    tip1 : str
        name of tip 1
    tip2 : str
        name of tip 2
    outgroup : str
        name of tip outside clade of interest
    stem : bool
        include name of stem to clade defined by tip1, tip2, outgroup
    clade : bool
        include names of edges within clade defined by tip1, tip2, outgroup
    is_independent : bool
        if True, all edges specified by the scoping info get their own
        value of omega, if False, only a single omega
    lf_args
        arguments to be passed to the likelihood function constructor
    upper_omega : float
        upper bound for omega
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore')
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Raises
    ------
    ValueError if sm is not a codon model or neither tip1 nor tip2 is given,
    TypeError if tree cannot be resolved to a TreeNode.
    """
    super(natsel_timehet, self).__init__(
        input_types=("aligned", "serialisable"),
        output_types=("result", "hypothesis_result", "serialisable"),
        data_types=("ArrayAlignment", "Alignment"),
    )
    # called before any argument is rebound below
    self._formatted_params()
    if not is_codon_model(sm):
        raise ValueError(f"{sm} is not a codon model")

    if not any([tip1, tip2]):
        raise ValueError("must provide at least a single tip name")

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    # the guard above ensures at least one of tip1 / tip2 is truthy, so one
    # of the following branches always assigns edges
    if all([tip1, tip2]) and tree:
        edges = tree.get_edge_names(
            tip1, tip2, stem=stem, clade=clade, outgroup_name=outgroup
        )
    elif all([tip1, tip2]):
        edges = [tip1, tip2]
    elif tip1:
        edges = [tip1]
    elif tip2:
        edges = [tip2]

    assert edges, "No edges"

    # instantiate model, ensuring genetic code setting passed on
    # work on a copy so the caller's dict is left untouched
    sm_args = dict(sm_args or {})
    sm_args["gc"] = sm_args.get("gc", gc)
    sm_args["optimise_motif_probs"] = optimise_motif_probs
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)

    model_name = sm.name
    # defining the null model
    lf_args = lf_args or {}
    null_lf_args = lf_args.copy()
    null = model(
        sm,
        tree,
        name=f"{model_name}-null",
        sm_args=sm_args,
        lf_args=null_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
    )

    # defining the alternate model
    param_rules = [
        dict(
            par_name="omega",
            edges=edges,
            upper=upper_omega,
            is_independent=is_independent,
        )
    ]
    alt = model(
        sm,
        tree,
        name=f"{model_name}-alt",
        sm_args=sm_args,
        opt_args=opt_args,
        show_progress=show_progress,
        param_rules=param_rules,
        lf_args=lf_args,
        verbose=verbose,
    )
    hyp = hypothesis(null, alt)
    self.func = hyp
def __init__(
    self,
    sm,
    tree=None,
    sm_args=None,
    gc=1,
    optimise_motif_probs=False,
    upper_omega=20.0,
    lf_args=None,
    opt_args=None,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model, if string must be available via get_model()
        (see cogent3.available_models).
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(recode_gaps=True). Only used when sm is a string. The provided
        dict is not modified. A 'gc' entry takes precedence over the gc
        argument, while an 'optimise_motif_probs' entry is replaced by the
        optimise_motif_probs argument.
    gc
        genetic code, either name or number (see cogent3.available_codes)
    optimise_motif_probs : bool
        If True, motif probabilities are free parameters. If False (default)
        they are estimated from the alignment.
    upper_omega : float
        upper bound for positive selection omega
    lf_args
        arguments to be passed to the likelihood function constructor
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore')
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Raises
    ------
    ValueError if sm is not a codon model, TypeError if tree cannot be
    resolved to a TreeNode.
    """
    super(natsel_sitehet, self).__init__(
        input_types=("aligned", "serialisable"),
        output_types=("result", "hypothesis_result", "serialisable"),
        data_types=("ArrayAlignment", "Alignment"),
    )
    # called before any argument is rebound below
    self._formatted_params()
    if not is_codon_model(sm):
        raise ValueError(f"{sm} is not a codon model")

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    # instantiate model, ensuring genetic code setting passed on
    # work on a copy so the caller's dict is left untouched
    sm_args = dict(sm_args or {})
    sm_args["gc"] = sm_args.get("gc", gc)
    sm_args["optimise_motif_probs"] = optimise_motif_probs
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)

    model_name = sm.name
    # defining the null model
    # omega of the "-ve" bin is bounded just below the constant value (1.0)
    # used for the "neutral" bin
    epsilon = 1e-6
    null_param_rules = [
        dict(par_name="omega", bins="-ve", upper=1 - epsilon, init=1 - epsilon),
        dict(par_name="omega", bins="neutral", is_constant=True, value=1.0),
    ]
    lf_args = lf_args or {}
    null_lf_args = lf_args.copy()
    null_lf_args.update(dict(bins=("-ve", "neutral")))
    self.null = model(
        sm,
        tree,
        name=f"{model_name}-null",
        sm_args=sm_args,
        param_rules=null_param_rules,
        lf_args=null_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
    )

    # defining the alternate model, param rules to be completed each call
    alt_lf_args = lf_args.copy()
    alt_lf_args.update(dict(bins=("-ve", "neutral", "+ve")))
    self.alt_args = dict(
        sm=sm,
        tree=tree,
        name=f"{model_name}-alt",
        sm_args=sm_args,
        lf_args=alt_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
        upper_omega=upper_omega,
    )
    self.func = self.test_hypothesis
def __init__(
    self,
    sm,
    tree=None,
    name=None,
    sm_args=None,
    lf_args=None,
    time_het=None,
    param_rules=None,
    opt_args=None,
    split_codons=False,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model if string must be available via get_model()
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    name
        name of the model, defaults to the substitution model name or
        'unnamed model'
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(optimise_motif_probs=True). Only used when sm is a string.
    lf_args
        arguments to be passed to the likelihood function constructor
    time_het
        'max' or a list of dicts corresponding to edge_sets, e.g.
        [dict(edges=['Human', 'Chimp'], is_independent=False, upper=10)].
        Passed to the likelihood function .set_time_heterogeneity()
        method.
    param_rules
        other parameter rules (a series of dicts), passed to the likelihood
        function set_param_rule() method. Rules that are not constant and
        lack an 'upper' entry get an upper bound of 50. The provided dicts
        are copied, not modified.
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore'). The provided dict is copied, not modified.
    split_codons : bool
        if True, incoming alignments are split into the 3 frames and each
        frame is fit separately. Ignored if the substitution model motifs
        are longer than 1.
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Returns
    -------
    Calling an instance with an alignment returns a model_result instance
    with the optimised likelihood function. In the case of split_codons,
    the result object has a separate entry for each.

    Raises
    ------
    TypeError if tree cannot be resolved to a TreeNode.
    """
    super(model, self).__init__(
        input_types=("aligned", "serialisable"),
        output_types=("result", "model_result", "serialisable"),
        data_types=("ArrayAlignment", "Alignment"),
    )
    self._verbose = verbose
    # called before any argument is rebound below
    self._formatted_params()
    sm_args = sm_args or {}
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)
    self._sm = sm
    # splitting into frames is disabled when the motifs are longer than 1
    if len(sm.get_motifs()[0]) > 1:
        split_codons = False

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    self._tree = tree
    self._lf_args = lf_args or {}
    if not name:
        name = sm.name or "unnamed model"
    self.name = name

    # copy so the caller's dict does not silently gain a show_progress key
    opt_args = dict(opt_args) if opt_args else dict(max_restarts=5)
    opt_args.setdefault("show_progress", show_progress)
    self._opt_args = opt_args

    # copy each rule so the default bound is not written into caller's dicts
    rules = []
    for rule in param_rules or []:
        rule = dict(rule)
        if not rule.get("is_constant"):
            rule.setdefault("upper", 50)  # default upper bound
        rules.append(rule)
    self._param_rules = rules

    self._time_het = time_het
    self._split_codons = split_codons
    self.func = self.fit
def __init__(
    self,
    sm,
    tree=None,
    sm_args=None,
    gc=1,
    optimise_motif_probs=False,
    tip1=None,
    tip2=None,
    outgroup=None,
    stem=False,
    clade=True,
    lf_args=None,
    upper_omega=20,
    opt_args=None,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model, if string must be available via get_model()
        (see cogent3.available_models).
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(recode_gaps=True). Only used when sm is a string. The provided
        dict is not modified. A 'gc' entry takes precedence over the gc
        argument, while an 'optimise_motif_probs' entry is replaced by the
        optimise_motif_probs argument.
    gc
        genetic code, either name or number (see cogent3.available_codes)
    optimise_motif_probs : bool
        If True, motif probabilities are free parameters. If False (default)
        they are estimated from the alignment.
    tip1 : str
        name of tip 1
    tip2 : str
        name of tip 2
    outgroup : str
        name of tip outside clade of interest
    stem : bool
        include name of stem to clade defined by tip1, tip2, outgroup
    clade : bool
        include names of edges within clade defined by tip1, tip2, outgroup
    lf_args
        arguments to be passed to the likelihood function constructor
    upper_omega : float
        upper bound for positive selection omega
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore')
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Raises
    ------
    ValueError if sm is not a codon model or neither tip1 nor tip2 is given,
    TypeError if tree cannot be resolved to a TreeNode.

    Notes
    -----
    The scoping parameters (tip1, tip2, outgroup, stem, clade) define the
    foreground edges.
    """
    super(natsel_zhang, self).__init__(
        input_types=(ALIGNED_TYPE, SERIALISABLE_TYPE),
        output_types=(RESULT_TYPE, HYPOTHESIS_RESULT_TYPE, SERIALISABLE_TYPE),
        data_types=("ArrayAlignment", "Alignment"),
    )
    # called before any argument is rebound below
    self._formatted_params()
    if not is_codon_model(sm):
        raise ValueError(f"{sm} is not a codon model")

    if not any([tip1, tip2]):
        raise ValueError("must provide at least a single tip name")

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    # the guard above ensures at least one of tip1 / tip2 is truthy, so one
    # of the following branches always assigns edges
    if all([tip1, tip2]) and tree:
        edges = tree.get_edge_names(
            tip1, tip2, stem=stem, clade=clade, outgroup_name=outgroup
        )
    elif all([tip1, tip2]):
        edges = [tip1, tip2]
    elif tip1:
        edges = [tip1]
    elif tip2:
        edges = [tip2]

    assert edges, "No edges"

    # instantiate model, ensuring genetic code setting passed on
    # work on a copy so the caller's dict is left untouched
    sm_args = dict(sm_args or {})
    sm_args["gc"] = sm_args.get("gc", gc)
    sm_args["optimise_motif_probs"] = optimise_motif_probs
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)

    model_name = sm.name
    # defining the null model
    # omega of bin "0" is bounded just below the constant value (1.0) used
    # for bin "1"
    epsilon = 1e-6
    null_param_rules = [
        dict(par_name="omega", bins="0", upper=1 - epsilon, init=1 - epsilon),
        dict(par_name="omega", bins="1", is_constant=True, value=1.0),
    ]
    lf_args = lf_args or {}
    null_lf_args = lf_args.copy()
    null_lf_args.update(dict(bins=("0", "1")))
    self.null = model(
        sm,
        tree,
        name=f"{model_name}-null",
        sm_args=sm_args,
        param_rules=null_param_rules,
        lf_args=null_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
    )

    # defining the alternate model, param rules to be completed each call
    alt_lf_args = lf_args.copy()
    alt_lf_args.update(dict(bins=("0", "1", "2a", "2b")))
    self.alt_args = dict(
        sm=sm,
        tree=tree,
        name=f"{model_name}-alt",
        sm_args=sm_args,
        edges=edges,
        lf_args=alt_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
        upper_omega=upper_omega,
    )
    self.func = self.test_hypothesis
def __init__(
    self,
    sm,
    tree=None,
    sm_args=None,
    gc=1,
    optimise_motif_probs=False,
    lf_args=None,
    opt_args=None,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model, if string must be available via get_model()
        (see cogent3.available_models).
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(recode_gaps=True). Only used when sm is a string. The provided
        dict is not modified. A 'gc' entry takes precedence over the gc
        argument, while an 'optimise_motif_probs' entry is replaced by the
        optimise_motif_probs argument.
    gc
        genetic code, either name or number (see cogent3.available_codes)
    optimise_motif_probs : bool
        If True, motif probabilities are free parameters. If False (default)
        they are estimated from the alignment.
    lf_args
        arguments to be passed to the likelihood function constructor
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore')
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Raises
    ------
    ValueError if sm is not a codon model, TypeError if tree cannot be
    resolved to a TreeNode.
    """
    super(natsel_neutral, self).__init__(
        input_types=(ALIGNED_TYPE, SERIALISABLE_TYPE),
        output_types=(RESULT_TYPE, HYPOTHESIS_RESULT_TYPE, SERIALISABLE_TYPE),
        data_types=("ArrayAlignment", "Alignment"),
    )
    # called before any argument is rebound below
    self._formatted_params()
    if not is_codon_model(sm):
        raise ValueError(f"{sm} is not a codon model")

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    # instantiate model, ensuring genetic code setting passed on
    # work on a copy so the caller's dict is left untouched
    sm_args = dict(sm_args or {})
    sm_args["gc"] = sm_args.get("gc", gc)
    sm_args["optimise_motif_probs"] = optimise_motif_probs
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)

    model_name = sm.name
    # defining the null model, omega is held constant at 1.0
    lf_args = lf_args or {}
    null = model(
        sm,
        tree,
        name=f"{model_name}-null",
        sm_args=sm_args,
        opt_args=opt_args,
        show_progress=show_progress,
        param_rules=[dict(par_name="omega", is_constant=True, value=1.0)],
        lf_args=lf_args,
        verbose=verbose,
    )

    # defining the alternate model
    alt = model(
        sm,
        tree,
        name=f"{model_name}-alt",
        sm_args=sm_args,
        opt_args=opt_args,
        show_progress=show_progress,
        lf_args=lf_args,
        verbose=verbose,
    )
    hyp = hypothesis(null, alt)
    self.func = hyp