def test_deserialise_likelihood_function(self):
    """likelihood functions survive a to_rich_dict / deserialise round trip

    Checks a two-locus likelihood function and a single-alignment one.
    """

    def check_roundtrip(lf):
        # the string forms of the mprobs and alignment definitions must
        # match between the original and the deserialised copy
        restored = deserialise_likelihood_function(lf.to_rich_dict())
        self.assertEqual(
            str(lf.defn_for["mprobs"]), str(restored.defn_for["mprobs"])
        )
        self.assertEqual(
            str(lf.defn_for["alignment"].assignments),
            str(restored.defn_for["alignment"].assignments),
        )

    path = os.path.join(os.getcwd(), "data", "brca1_5.paml")
    data = load_aligned_seqs(filename=path)
    midpoint = len(data) // 2
    first_half, second_half = data[:midpoint], data[midpoint:]
    tree = make_tree(tip_names=data.names)

    # multiple alignments, one per named locus
    multi_lf = get_model("HKY85").make_likelihood_function(
        tree, loci=["1st-half", "2nd-half"]
    )
    multi_lf.set_alignment([first_half, second_half])
    check_roundtrip(multi_lf)

    # single alignment
    single_lf = get_model("HKY85").make_likelihood_function(tree)
    single_lf.set_alignment(first_half)
    check_roundtrip(single_lf)
def test_get_motif_probs_by_node_mg94(self):
    """handles different statespace dimensions from process and stationary distribution"""
    from cogent3.evolve.models import get_model

    aln = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
    aln = aln.no_degenerates(motif_length=3)
    tree = load_tree("data/primates_brca1.tree")
    # one row per tree edge, 61 columns expected for this model
    expected_shape = (len(tree.get_edge_vector()), 61)

    # root mprobs are constant
    sm = get_model("MG94HKY")
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    lf.get_motif_probs()  # exercised only to confirm it does not fail
    mprobs = lf.get_motif_probs_by_node()
    self.assertEqual(mprobs.shape, expected_shape)

    # root mprobs are variable
    # (the model was previously rebuilt without optimise_motif_probs right
    # after this line, so this case silently repeated the constant one)
    sm = get_model("MG94HKY", optimise_motif_probs=True)
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    mprobs = lf.get_motif_probs_by_node()
    self.assertEqual(mprobs.shape, expected_shape)

    # not implemented for monomers variant
    sm = TimeReversibleCodon(
        mprob_model="monomers", model_gaps=False, recode_gaps=True
    )
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    with self.assertRaises(NotImplementedError):
        _ = lf.get_motif_probs_by_node()
def test_get_model(self):
    """get_model returns model instances unchanged and rejects unknown names"""
    # an existing substitution model instance is handed straight back;
    # assertIs states the identity check directly and reports both objects
    # on failure, unlike comparing id() values
    for mod in (CNFGTR(), WG01(), GN()):
        got = get_model(mod)
        self.assertIs(got, mod)

    # unknown model raises exception
    with self.assertRaises(ValueError):
        _ = get_model("blah")
def test_roundtrip_discrete_time_submod(self):
    """a discrete time substitution model is recovered from its to_json output"""
    # default construction first, then with an explicit motif length
    for extra_args in ({}, {"motif_length": 2}):
        original = get_model("DT", **extra_args)
        restored = deserialise_object(original.to_json())
        self.assertEqual(restored.to_rich_dict(), original.to_rich_dict())
def test_roundtrip_submod(self):
    """a substitution model is recovered from its to_json output"""
    for model_name in ("HKY85", "GN", "CNFGTR"):
        original = get_model(model_name)
        restored = deserialise_object(original.to_json())
        self.assertEqual(restored.to_rich_dict(), original.to_rich_dict())
def is_codon_model(sm):
    """True if sm, or get_model(sm), is a Codon substitution model

    Parameters
    ----------
    sm : str or instance
        a model name, which is resolved via get_model(), or an object that
        is tested as-is

    Returns
    -------
    bool
    """
    from cogent3.evolve.substitution_model import _Codon

    # isinstance also accepts str subclasses, which type(sm) == str rejected
    if isinstance(sm, str):
        sm = get_model(sm)
    return isinstance(sm, _Codon)
def test_roundtrip_model_result(self):
    """model_result.to_json enables roundtrip and lazy evaluation"""
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    tree = make_tree(tip_names=aln.names)
    lf = get_model("HKY85").make_likelihood_function(tree)
    lf.set_alignment(aln)
    # give each tip edge its own starting kappa
    for tip_name, kappa in zip(aln.names, (2, 3, 4)):
        lf.set_param_rule("kappa", edge=tip_name, init=kappa)

    result = model_result(name="test")
    result[1] = lf
    self.assertIs(result[1], lf)
    self.assertEqual(result.nfp, lf.nfp)
    self.assertEqual(result.lnL, lf.lnL)

    restored = deserialise_object(result.to_json())
    # lazy evaluation: the stored value starts out as a plain dict
    self.assertIsInstance(restored[1], dict)
    # summary properties agree with the original result
    self.assertEqual(restored.lnL, result.lnL)
    self.assertEqual(restored.nfp, result.nfp)
    self.assertEqual(restored.DLC, result.DLC)
    # asking for the lf attribute yields something other than a dict
    self.assertNotIsInstance(restored.lf, dict)
    self.assertEqual(restored.lf.nfp, restored.nfp)
def test_model_names(self):
    """each model's name attribute starts with the name used to request it"""
    for requested in models:
        instance = get_model(requested)
        name_matches = instance.name.startswith(requested)
        self.assertTrue(
            name_matches,
            msg=f"{instance.name} does not start with {requested}",
        )
def _make_model_cache(self):
    # builds every substitution model listed in `models` once and stores
    # them on the instance; later calls are no-ops
    if not hasattr(self, "_cached_models"):
        self._cached_models = {name: get_model(name) for name in models}
def __init__(self, distance=None, moltype=None, fast_calc=None, slow_calc=None):
    """
    Parameters
    ----------
    distance
        name used for both the fast and the slow calculator. Cannot be
        combined with fast_calc or slow_calc.
    moltype
        passed to get_moltype() unless None. Required when 'hamming',
        'paralinear' or 'logdet' is requested. If not given, it is taken
        from the resolved calculator.
    fast_calc
        resolved via get_distance_calculator()
    slow_calc
        resolved via get_model()

    Raises
    ------
    ValueError
        if distance is combined with fast_calc/slow_calc, if a required
        moltype is missing, if neither calculator can be resolved, or if a
        calculator's moltype conflicts with the one provided.
    """
    super(fast_slow_dist, self).__init__(
        input_types=ALIGNED_TYPE,
        output_types=(PAIRWISE_DISTANCE_TYPE, SERIALISABLE_TYPE),
        data_types=("ArrayAlignment", "Alignment"),
    )
    self._formatted_params()
    self._moltype = moltype if moltype is None else get_moltype(moltype)
    self._sm = None

    if (fast_calc or slow_calc) and distance:
        raise ValueError("cannot combine distance and fast/slow")

    if distance:
        fast_calc = distance
        slow_calc = distance

    # keep the requested names: fast_calc / slow_calc are replaced below by
    # calculator objects (or None), which made the error message useless
    requested_fast, requested_slow = fast_calc, slow_calc

    d = {"hamming", "paralinear", "logdet"} & {slow_calc, fast_calc}
    if d and not self._moltype:
        raise ValueError(f"you must provide a moltype for {d}")

    try:
        fast_calc = get_distance_calculator(fast_calc, moltype=self._moltype)
    except (ValueError, AttributeError):
        fast_calc = None

    try:
        slow_calc = get_model(slow_calc)
    except ValueError:
        slow_calc = None

    if not (fast_calc or slow_calc):
        raise ValueError(
            f"invalid values for {requested_slow} or {requested_fast}"
        )

    self.fast_calc = fast_calc
    if fast_calc and self._moltype and fast_calc.moltype != self._moltype:
        raise ValueError(
            f"{self._moltype} incompatible moltype with fast calculator {fast_calc.moltype}"
        )
    elif fast_calc:
        self._moltype = fast_calc.moltype

    if slow_calc and self._moltype and slow_calc.moltype != self._moltype:
        raise ValueError("incompatible moltype with slow calculator")
    elif slow_calc:
        self._moltype = slow_calc.moltype
    self._sm = slow_calc
def test_roundtrip_het_lf(self):
    """a site-het likelihood function has the same lnL after a json round trip"""
    aln = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
    tree = load_tree("data/primates_brca1.tree")
    with open("data/site-het-param-rules.json") as infile:
        all_rules = json.load(infile)
    selected = all_rules.pop("phylohmm-gamma-kappa")

    sm = get_model("HKY85", ordered_param="rate", distribution="gamma")
    original = sm.make_likelihood_function(tree, bins=4, sites_independent=False)
    original.set_alignment(aln)
    original.apply_param_rules(selected["rules"])

    restored = deserialise_object(original.to_json())
    assert_allclose(original.lnL, restored.lnL)
def _test_aln(self, seqs, model=dna_model, param_vals=None, **kw):
    # strips gaps from seqs, realigns them via self._make_aln and checks the
    # result reproduces seqs, has the model's moltype and records param_vals
    ungapped = {name: seq.replace("-", "") for name, seq in seqs.items()}
    aln = self._make_aln(ungapped, model=model, param_vals=param_vals, **kw)
    observed = {name: seq.lower() for name, seq in aln.to_dict().items()}
    # the alignment must match the gapped input exactly
    self.assertEqual(seqs, observed)
    # the moltype must be the model's moltype
    self.assertIs(aln.moltype, get_model(model).moltype)
    if not param_vals:
        return
    # parameter values must be recorded on the alignment's info object
    for param, val in param_vals:
        self.assertEqual(aln.info.align_params[param], val)
def test_zhang(self):
    """natsel_zhang correctly configured and should not fail"""
    opt = dict(max_evaluations=20, limit_action="ignore")
    aln = load_aligned_seqs("data/primate_brca1.fasta", moltype="dna")
    natsel = evo_app.natsel_zhang(
        "CNFGTR",
        tree="data/primate_brca1.tree",
        tip1="Human",
        tip2="Chimpanzee",
        opt_args=opt,
    )
    result = natsel(aln)
    self.assertEqual(result.df, 3)
    self.assertEqual(result.alt.nfp, 21)
    # the naming scheme is model name followed by null/alt;
    # assertIn reports the missing key and the container on failure
    self.assertIn("CNFGTR-null", result)
    self.assertIn("CNFGTR-alt", result)

    # result keys correct when given a model
    Y98 = get_model("Y98")
    natsel = evo_app.natsel_zhang(
        Y98,
        tree="data/primate_brca1.tree",
        tip1="Human",
        tip2="Chimpanzee",
        opt_args=opt,
    )
    result = natsel(aln)
    self.assertEqual(result.df, 3)
    self.assertIn("Y98-null", result)
    self.assertIn("Y98-alt", result)

    # fails if not a codon model
    with self.assertRaises(ValueError):
        _ = evo_app.natsel_zhang(
            "F81",
            tree="data/primate_brca1.tree",
            tip1="Human",
            tip2="Chimpanzee",
            opt_args=opt,
        )

    # fails if no tip names provided
    with self.assertRaises(ValueError):
        _ = evo_app.natsel_zhang(
            "Y98", tree="data/primate_brca1.tree", opt_args=opt
        )
def test_roundtrip_discrete_time_likelihood_function(self):
    """a fitted discrete time likelihood function keeps its lnL through to_json"""
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    tree = make_tree(tip_names=aln.names)
    lf = get_model("BH").make_likelihood_function(tree)
    lf.set_alignment(aln)
    lf.optimise(max_evaluations=25, limit_action="ignore", show_progress=False)
    expected = lf.get_log_likelihood()

    restored = deserialise_object(lf.to_json())
    assert_allclose(restored.get_log_likelihood(), expected)
def test_roundtrip_likelihood_function(self):
    """a likelihood function keeps its lnL through to_json and back"""
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    tree = make_tree(tip_names=aln.names)
    lf = get_model("HKY85").make_likelihood_function(tree)
    lf.set_alignment(aln)
    # give each tip edge its own starting kappa
    for tip_name, kappa in zip(aln.names, (2, 3, 4)):
        lf.set_param_rule("kappa", edge=tip_name, init=kappa)
    expected = lf.get_log_likelihood()

    restored = deserialise_object(lf.to_json())
    self.assertFloatEqual(restored.get_log_likelihood(), expected)
def deserialise_substitution_model(data):
    """returns a cogent3 substitution model instance

    Parameters
    ----------
    data : dict
        serialised model. If it has a "kw" entry containing "name", the
        model is built with get_model(name, **kw). Otherwise, or if
        get_model raises ValueError, it is built from the "alphabet" and
        "type" entries, with the remaining entries passed as keyword
        arguments. The dict passed in is not modified.
    """
    from cogent3.evolve.models import get_model

    # work on shallow copies so neither the caller's dict nor its nested
    # "kw" dict lose keys through the pops below
    data = dict(data)
    data.pop("version", None)
    kw = dict(data.pop("kw", None) or {})

    sm = None
    if "name" in kw:
        name = kw.pop("name")
        try:
            sm = get_model(name, **kw)
        except ValueError:
            # not a name get_model knows (user defined sm?), so fall back
            # to constructing from the recorded class below
            pass

    if sm is None:
        alphabet = deserialise_alphabet(data.pop("alphabet"))
        klass = _get_class(data.pop("type"))
        sm = klass(alphabet, **data)

    return sm
def test_roundtrip_from_file(self):
    """a likelihood function written to a json file is recovered from its path"""
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    tree = make_tree(tip_names=aln.names)
    lf = get_model("HKY85").make_likelihood_function(tree)
    lf.set_alignment(aln)
    # give each tip edge its own starting kappa
    for tip_name, kappa in zip(aln.names, (2, 3, 4)):
        lf.set_param_rule("kappa", edge=tip_name, init=kappa)
    expected = lf.get_log_likelihood()
    serialised = lf.to_json()

    with TemporaryDirectory(dir=".") as dirname:
        outpath = f"{dirname}/delme.json"
        with open(outpath, "w") as outfile:
            outfile.write(serialised)
        # deserialise while the temporary file still exists
        restored = deserialise_object(outpath)
        self.assertFloatEqual(restored.get_log_likelihood(), expected)
def __init__(
    self,
    sm,
    tree=None,
    sm_args=None,
    gc=1,
    optimise_motif_probs=False,
    lf_args=None,
    opt_args=None,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model, if string must be available via get_model()
        (see cogent3.available_models).
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(optimise_motif_probs=True). The dict is copied, not modified.
    gc
        genetic code, either name or number (see cogent3.available_codes).
        Ignored if sm_args already has a "gc" entry.
    optimise_motif_probs : bool
        If True, motif probabilities are free parameters. If False (default)
        they are estimated from the alignment.
    lf_args
        arguments to be passed to the likelihood function constructor
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore')
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Raises
    ------
    ValueError
        if sm is not a codon model
    TypeError
        if tree is neither None, a string, nor a TreeNode
    """
    super(natsel_neutral, self).__init__(
        input_types=(ALIGNED_TYPE, SERIALISABLE_TYPE),
        output_types=(RESULT_TYPE, HYPOTHESIS_RESULT_TYPE, SERIALISABLE_TYPE),
        data_types=("ArrayAlignment", "Alignment"),
    )
    self._formatted_params()
    if not is_codon_model(sm):
        raise ValueError(f"{sm} is not a codon model")

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    # instantiate model, ensuring genetic code setting passed on;
    # copy first so the caller's sm_args dict is left untouched
    sm_args = dict(sm_args or {})
    sm_args.setdefault("gc", gc)
    sm_args["optimise_motif_probs"] = optimise_motif_probs
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)

    model_name = sm.name
    lf_args = lf_args or {}

    # defining the null model: omega held constant at 1
    null = model(
        sm,
        tree,
        name=f"{model_name}-null",
        sm_args=sm_args,
        opt_args=opt_args,
        show_progress=show_progress,
        param_rules=[dict(par_name="omega", is_constant=True, value=1.0)],
        lf_args=lf_args,
        verbose=verbose,
    )

    # defining the alternate model: no constraint on omega
    alt = model(
        sm,
        tree,
        name=f"{model_name}-alt",
        sm_args=sm_args,
        opt_args=opt_args,
        show_progress=show_progress,
        lf_args=lf_args,
        verbose=verbose,
    )
    hyp = hypothesis(null, alt)

    self.func = hyp
def __init__(self, distance=None, moltype=None, fast_calc=None, slow_calc=None):
    """
    Parameters
    ----------
    moltype : str
        cogent3 moltype
    distance : str
        Name of a distance method available as both fast and slow
        calculator. Cannot be combined with fast_calc or slow_calc.
    fast_calc
        Name of a fast distance calculator. See
        cogent3.available_distances().
    slow_calc
        Name of a slow distance calculator. See cogent3.available_models().

    Raises
    ------
    ValueError
        if distance is combined with fast_calc/slow_calc, if a required
        moltype is missing, if neither calculator can be resolved, or if a
        calculator's moltype conflicts with the one provided.

    Notes
    -----
    A moltype must be provided when 'hamming', 'percent', 'paralinear' or
    'logdet' is requested. Otherwise, if no moltype is given it is taken
    from the resolved calculator.
    """
    super(fast_slow_dist, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )
    self._formatted_params()
    self._moltype = moltype if moltype is None else get_moltype(moltype)
    self._sm = None

    if (fast_calc or slow_calc) and distance:
        raise ValueError("cannot combine distance and fast/slow")

    if distance:
        fast_calc = distance
        slow_calc = distance

    # keep the requested names: fast_calc / slow_calc are replaced below by
    # calculator objects (or None), which made the error message useless
    requested_fast, requested_slow = fast_calc, slow_calc

    d = {"hamming", "percent", "paralinear", "logdet"} & {slow_calc, fast_calc}
    if d and not self._moltype:
        raise ValueError(f"you must provide a moltype for {d}")

    try:
        fast_calc = get_distance_calculator(fast_calc, moltype=self._moltype)
    except (ValueError, AttributeError):
        fast_calc = None

    try:
        slow_calc = get_model(slow_calc)
    except ValueError:
        slow_calc = None

    if not (fast_calc or slow_calc):
        raise ValueError(
            f"invalid values for {requested_slow} or {requested_fast}"
        )

    self.fast_calc = fast_calc
    if fast_calc and self._moltype and fast_calc.moltype != self._moltype:
        raise ValueError(
            f"{self._moltype} incompatible moltype with fast calculator {fast_calc.moltype}"
        )
    elif fast_calc:
        self._moltype = fast_calc.moltype

    if slow_calc and self._moltype and slow_calc.moltype != self._moltype:
        raise ValueError("incompatible moltype with slow calculator")
    elif slow_calc:
        self._moltype = slow_calc.moltype
    self._sm = slow_calc
    self.func = self.calc_distance
def __init__(
    self,
    sm,
    tree=None,
    name=None,
    sm_args=None,
    lf_args=None,
    time_het=None,
    param_rules=None,
    opt_args=None,
    split_codons=False,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model if string must be available via get_model()
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    name
        name of the model. Defaults to the substitution model's name, or
        "unnamed model" if that is empty.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(optimise_motif_probs=True)
    lf_args
        arguments to be passed to the likelihood function constructor
    time_het
        'max' or a list of dicts corresponding to edge_sets, e.g.
        [dict(edges=['Human', 'Chimp'], is_independent=False, upper=10)].
        Passed to the likelihood function .set_time_heterogeneity()
        method.
    param_rules
        other parameter rules, passed to the likelihood function
        set_param_rule() method. Rules that are not constant get a default
        upper bound of 50. The rules are copied, not modified.
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore'). The dict is copied, not modified.
    split_codons : bool
        if True, incoming alignments are split into the 3 frames and each
        frame is fit separately. Forced to False when the model's motifs
        are longer than one character.
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Returns
    -------
    Calling an instance with an alignment returns a model_result instance
    with the optimised likelihood function. In the case of split_codons,
    the result object has a separate entry for each.

    Raises
    ------
    TypeError
        if tree is neither None, a string, nor a TreeNode
    """
    super(model, self).__init__(
        input_types=("aligned", "serialisable"),
        output_types=("result", "model_result", "serialisable"),
        data_types=("ArrayAlignment", "Alignment"),
    )
    self._verbose = verbose
    self._formatted_params()
    sm_args = sm_args or {}
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)
    self._sm = sm
    if len(sm.get_motifs()[0]) > 1:
        # motifs longer than one character cannot be split further
        split_codons = False

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    self._tree = tree
    self._lf_args = lf_args or {}
    if not name:
        name = sm.name or "unnamed model"
    self.name = name

    # copy so that adding show_progress does not leak into a dict the
    # caller may share between several app instances
    if opt_args:
        self._opt_args = dict(opt_args)
    else:
        self._opt_args = dict(max_restarts=5, show_progress=show_progress)
    self._opt_args.setdefault("show_progress", show_progress)

    if param_rules:
        # copy each rule before filling in defaults, leaving the caller's
        # rule dicts unchanged
        param_rules = [dict(rule) for rule in param_rules]
        for rule in param_rules:
            if rule.get("is_constant"):
                continue
            rule.setdefault("upper", 50)  # default upper bound
    else:
        # kept as an empty dict, matching the original falsy default
        param_rules = {}
    self._param_rules = param_rules
    self._time_het = time_het
    self._split_codons = split_codons
    self.func = self.fit
def __init__(
    self,
    sm,
    tree=None,
    sm_args=None,
    gc=1,
    optimise_motif_probs=False,
    upper_omega=20.0,
    lf_args=None,
    opt_args=None,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model, if string must be available via get_model()
        (see cogent3.available_models).
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(optimise_motif_probs=True). The dict is copied, not modified.
    gc
        genetic code, either name or number (see cogent3.available_codes).
        Ignored if sm_args already has a "gc" entry.
    optimise_motif_probs : bool
        If True, motif probabilities are free parameters. If False (default)
        they are estimated from the alignment.
    upper_omega : float
        upper bound for positive selection omega
    lf_args
        arguments to be passed to the likelihood function constructor
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore')
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Raises
    ------
    ValueError
        if sm is not a codon model
    TypeError
        if tree is neither None, a string, nor a TreeNode
    """
    super(natsel_sitehet, self).__init__(
        input_types=("aligned", "serialisable"),
        output_types=("result", "hypothesis_result", "serialisable"),
        data_types=("ArrayAlignment", "Alignment"),
    )
    self._formatted_params()
    if not is_codon_model(sm):
        raise ValueError(f"{sm} is not a codon model")

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    # instantiate model, ensuring genetic code setting passed on;
    # copy first so the caller's sm_args dict is left untouched
    sm_args = dict(sm_args or {})
    sm_args.setdefault("gc", gc)
    sm_args["optimise_motif_probs"] = optimise_motif_probs
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)

    model_name = sm.name

    # defining the null model: a "-ve" bin with omega bounded just below 1
    # and a "neutral" bin with omega fixed at 1
    epsilon = 1e-6
    null_param_rules = [
        dict(par_name="omega", bins="-ve", upper=1 - epsilon, init=1 - epsilon),
        dict(par_name="omega", bins="neutral", is_constant=True, value=1.0),
    ]
    lf_args = lf_args or {}
    null_lf_args = lf_args.copy()
    null_lf_args.update(dict(bins=("-ve", "neutral")))
    self.null = model(
        sm,
        tree,
        name=f"{model_name}-null",
        sm_args=sm_args,
        param_rules=null_param_rules,
        lf_args=null_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
    )

    # defining the alternate model, param rules to be completed each call
    alt_lf_args = lf_args.copy()
    alt_lf_args.update(dict(bins=("-ve", "neutral", "+ve")))
    self.alt_args = dict(
        sm=sm,
        tree=tree,
        name=f"{model_name}-alt",
        sm_args=sm_args,
        lf_args=alt_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
        upper_omega=upper_omega,
    )
    self.func = self.test_hypothesis
def __init__(
    self,
    sm,
    tree=None,
    sm_args=None,
    gc=1,
    optimise_motif_probs=False,
    tip1=None,
    tip2=None,
    outgroup=None,
    stem=False,
    clade=True,
    is_independent=False,
    lf_args=None,
    upper_omega=20,
    opt_args=None,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model, if string must be available via get_model()
        (see cogent3.available_models).
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(optimise_motif_probs=True). The dict is copied, not modified.
    gc
        genetic code, either name or number (see cogent3.available_codes).
        Ignored if sm_args already has a "gc" entry.
    optimise_motif_probs : bool
        If True, motif probabilities are free parameters. If False (default)
        they are estimated from the alignment.
    tip1 : str
        name of tip 1
    tip2 : str
        name of tip 2
    outgroup : str
        name of tip outside clade of interest
    stem : bool
        include name of stem to clade defined by tip1, tip2, outgroup
    clade : bool
        include names of edges within clade defined by tip1, tip2, outgroup
    is_independent : bool
        if True, all edges specified by the scoping info get their own
        value of omega, if False, only a single omega
    lf_args
        arguments to be passed to the likelihood function constructor
    upper_omega : float
        upper bound for omega
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore')
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Raises
    ------
    ValueError
        if sm is not a codon model, if neither tip1 nor tip2 is given, or
        if the scoping arguments select no edges
    TypeError
        if tree is neither None, a string, nor a TreeNode
    """
    super(natsel_timehet, self).__init__(
        input_types=("aligned", "serialisable"),
        output_types=("result", "hypothesis_result", "serialisable"),
        data_types=("ArrayAlignment", "Alignment"),
    )
    self._formatted_params()
    if not is_codon_model(sm):
        raise ValueError(f"{sm} is not a codon model")

    if not any([tip1, tip2]):
        raise ValueError("must provide at least a single tip name")

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    # work out which edges get their own omega
    if all([tip1, tip2]) and tree:
        edges = tree.get_edge_names(
            tip1, tip2, stem=stem, clade=clade, outgroup_name=outgroup
        )
    elif all([tip1, tip2]):
        edges = [tip1, tip2]
    elif tip1:
        edges = [tip1]
    elif tip2:
        edges = [tip2]

    # raised explicitly since an assert is stripped when running with -O
    if not edges:
        raise ValueError("No edges")

    # instantiate model, ensuring genetic code setting passed on;
    # copy first so the caller's sm_args dict is left untouched
    sm_args = dict(sm_args or {})
    sm_args.setdefault("gc", gc)
    sm_args["optimise_motif_probs"] = optimise_motif_probs
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)

    model_name = sm.name

    # defining the null model
    lf_args = lf_args or {}
    null_lf_args = lf_args.copy()
    null = model(
        sm,
        tree,
        name=f"{model_name}-null",
        sm_args=sm_args,
        lf_args=null_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
    )

    # defining the alternate model: omega scoped to the selected edges
    param_rules = [
        dict(
            par_name="omega",
            edges=edges,
            upper=upper_omega,
            is_independent=is_independent,
        )
    ]
    alt = model(
        sm,
        tree,
        name=f"{model_name}-alt",
        sm_args=sm_args,
        opt_args=opt_args,
        show_progress=show_progress,
        param_rules=param_rules,
        lf_args=lf_args,
        verbose=verbose,
    )
    hyp = hypothesis(null, alt)
    self.func = hyp
def __init__(
    self,
    model,
    gc=None,
    param_vals=None,
    guide_tree=None,
    unique_guides=False,
    indel_length=1e-1,
    indel_rate=1e-10,
    distance="percent",
):
    """
    Parameters
    ----------
    model
        substitution model instance or name. If 'codon' (uses MG94HKY),
        'nucleotide' (uses HKY85), 'protein' (uses JTT92). The 'codon' and
        'nucleotide' choices also provide default settings for param_vals.
    gc : int or string
        the genetic code for a codon alignment, defaults to the standard
        genetic code
    param_vals : dict
        param name, values for parameters in model. Overrides default
        choices: for 'codon' and 'nucleotide', entries given here replace
        the matching defaults and the other defaults are kept.
    guide_tree
        newick string, tree instance (must have branch lengths), or a
        callable that will build a tree from unaligned collection. If not
        provided, estimated ONCE via constructing a crude alignment. In the
        case of callable, or not provided, the computed guide tree is stored
        in the returned alignment.info['guide_tree'].
    unique_guides : bool
        whether each alignment requires a new guide tree
    indel_rate : float
        probability of gap insertion
    indel_length : float
        probability of gap extension
    distance : string
        the distance measure for building a guide tree. Default is
        'percent', the proportion of differences. This is applicable for
        any moltype, and sequences with very high percent identity. For
        more diverged sequences we recommend 'paralinear'.

    Raises
    ------
    ValueError
        if a guide tree is given without branch lengths
    """
    super(progressive_align, self).__init__(
        input_types=self._input_types,
        output_types=self._output_types,
        data_types=self._data_types,
    )

    # preset parameter values; previously a preset replaced any param_vals
    # the user supplied, contradicting the documented override behaviour
    preset_vals = {
        "codon": dict(omega=0.4, kappa=3),
        "nucleotide": dict(kappa=3),
    }.get(model)
    if preset_vals is None:
        chosen_vals = param_vals
    elif param_vals:
        # dict() accepts a mapping or an iterable of (name, value) pairs
        chosen_vals = {**preset_vals, **dict(param_vals)}
    else:
        chosen_vals = preset_vals
    self._param_vals = chosen_vals

    sm = {"codon": "MG94HKY", "nucleotide": "HKY85", "protein": "JTT92"}.get(
        model, model
    )
    # NOTE(review): presumably records the constructor arguments, so it is
    # kept ahead of any rebinding of them - confirm
    self._formatted_params()
    kwargs = {} if gc is None else dict(gc=gc)
    sm = get_model(sm, **kwargs)
    moltype = sm.alphabet.moltype
    self._model = sm
    self._scalar = sm.word_length
    self._indel_length = indel_length
    self._indel_rate = indel_rate
    self._moltype = moltype
    self._unique_guides = unique_guides
    self._distance = distance

    if callable(guide_tree):
        self._make_tree = guide_tree
        guide_tree = None  # callback takes precedence
    else:
        # default pipeline: align to a reference, compute distances,
        # then estimate a tree
        al_to_ref = align_to_ref(moltype=self._moltype)
        dist_calc = dist.fast_slow_dist(
            distance=self._distance, moltype=self._moltype
        )
        est_tree = quick_tree()
        self._make_tree = al_to_ref + dist_calc + est_tree

    if guide_tree is not None:
        if isinstance(guide_tree, str):
            guide_tree = make_tree(treestring=guide_tree, underscore_unmunge=True)
        if guide_tree.children[0].length is None:
            raise ValueError("Guide tree must have branch lengths")
        # make sure no zero lengths
        guide_tree = scale_branches()(guide_tree)

    self._guide_tree = guide_tree
    self._kwargs = dict(
        indel_length=self._indel_length,
        indel_rate=self._indel_rate,
        tree=self._guide_tree,
        param_vals=self._param_vals,
        show_progress=False,
    )
    self.func = self.multiple_align
def __init__(
    self,
    sm,
    tree=None,
    sm_args=None,
    gc=1,
    optimise_motif_probs=False,
    tip1=None,
    tip2=None,
    outgroup=None,
    stem=False,
    clade=True,
    lf_args=None,
    upper_omega=20,
    opt_args=None,
    show_progress=False,
    verbose=False,
):
    """
    Parameters
    ----------
    sm : str or instance
        substitution model, if string must be available via get_model()
        (see cogent3.available_models).
    tree
        if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
        newick formatted tree, a path to a file containing one, or a Tree
        instance.
    sm_args
        arguments to be passed to the substitution model constructor, e.g.
        dict(optimise_motif_probs=True). The dict is copied, not modified.
    gc
        genetic code, either name or number (see cogent3.available_codes).
        Ignored if sm_args already has a "gc" entry.
    optimise_motif_probs : bool
        If True, motif probabilities are free parameters. If False (default)
        they are estimated from the alignment.
    tip1 : str
        name of tip 1
    tip2 : str
        name of tip 2
    outgroup : str
        name of tip outside clade of interest
    stem : bool
        include name of stem to clade defined by tip1, tip2, outgroup
    clade : bool
        include names of edges within clade defined by tip1, tip2, outgroup
    lf_args
        arguments to be passed to the likelihood function constructor
    upper_omega : float
        upper bound for positive selection omega
    opt_args
        arguments for the numerical optimiser, e.g.
        dict(max_restarts=5, tolerance=1e-6, max_evaluations=1000,
        limit_action='ignore')
    show_progress : bool
        show progress bars during numerical optimisation
    verbose : bool
        prints intermediate states to screen during fitting

    Raises
    ------
    ValueError
        if sm is not a codon model, if neither tip1 nor tip2 is given, or
        if the scoping arguments select no edges
    TypeError
        if tree is neither None, a string, nor a TreeNode

    Notes
    -----
    The scoping parameters (tip1, tip2, outgroup, stem, clade) define the
    foreground edges.
    """
    super(natsel_zhang, self).__init__(
        input_types=(ALIGNED_TYPE, SERIALISABLE_TYPE),
        output_types=(RESULT_TYPE, HYPOTHESIS_RESULT_TYPE, SERIALISABLE_TYPE),
        data_types=("ArrayAlignment", "Alignment"),
    )
    self._formatted_params()
    if not is_codon_model(sm):
        raise ValueError(f"{sm} is not a codon model")

    if not any([tip1, tip2]):
        raise ValueError("must provide at least a single tip name")

    if misc.path_exists(tree):
        tree = load_tree(filename=tree, underscore_unmunge=True)
    elif isinstance(tree, str):
        tree = make_tree(treestring=tree, underscore_unmunge=True)

    if tree and not isinstance(tree, TreeNode):
        raise TypeError(f"invalid tree type {type(tree)}")

    # work out the foreground edges
    if all([tip1, tip2]) and tree:
        edges = tree.get_edge_names(
            tip1, tip2, stem=stem, clade=clade, outgroup_name=outgroup
        )
    elif all([tip1, tip2]):
        edges = [tip1, tip2]
    elif tip1:
        edges = [tip1]
    elif tip2:
        edges = [tip2]

    # raised explicitly since an assert is stripped when running with -O
    if not edges:
        raise ValueError("No edges")

    # instantiate model, ensuring genetic code setting passed on;
    # copy first so the caller's sm_args dict is left untouched
    sm_args = dict(sm_args or {})
    sm_args.setdefault("gc", gc)
    sm_args["optimise_motif_probs"] = optimise_motif_probs
    if isinstance(sm, str):
        sm = get_model(sm, **sm_args)

    model_name = sm.name

    # defining the null model: bin "0" has omega bounded just below 1,
    # bin "1" has omega fixed at 1
    epsilon = 1e-6
    null_param_rules = [
        dict(par_name="omega", bins="0", upper=1 - epsilon, init=1 - epsilon),
        dict(par_name="omega", bins="1", is_constant=True, value=1.0),
    ]
    lf_args = lf_args or {}
    null_lf_args = lf_args.copy()
    null_lf_args.update(dict(bins=("0", "1")))
    self.null = model(
        sm,
        tree,
        name=f"{model_name}-null",
        sm_args=sm_args,
        param_rules=null_param_rules,
        lf_args=null_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
    )

    # defining the alternate model, param rules to be completed each call
    alt_lf_args = lf_args.copy()
    alt_lf_args.update(dict(bins=("0", "1", "2a", "2b")))
    self.alt_args = dict(
        sm=sm,
        tree=tree,
        name=f"{model_name}-alt",
        sm_args=sm_args,
        edges=edges,
        lf_args=alt_lf_args,
        opt_args=opt_args,
        show_progress=show_progress,
        verbose=verbose,
        upper_omega=upper_omega,
    )
    self.func = self.test_hypothesis