def test_load_seqs_interface(self):
    """load_aligned_seqs reads nexus files and reports the expected dimensions"""
    # (path, expected number of sequences, expected alignment length)
    cases = (
        ("data/nexus_mixed.nex", 4, 20),
        ("data/nexus_aa.nxs", 10, 234),
    )
    for path, num_seqs, length in cases:
        loaded = load_aligned_seqs(path)
        self.assertEqual(loaded.num_seqs, num_seqs)
        self.assertEqual(len(loaded), length)
def test_load_aligned_seqs(self):
    """load_aligned_seqs returns the requested alignment class and moltype"""
    path = os.path.join(data_path, "brca1_5.paml")

    # no optional arguments: an ArrayAlignment that records its source path
    got = load_aligned_seqs(path)
    self.assertIsInstance(got, ArrayAlignment)
    # assertIn reports the container on failure, assertTrue only "False is not true"
    self.assertIn("Human", got.to_dict())
    self.assertEqual(got.info["source"], path)

    # explicit moltype
    got = load_aligned_seqs(path, moltype="dna")
    self.assertEqual(got.moltype.label, "dna")

    # explicit moltype, array_align=False gives an Alignment instance
    got = load_aligned_seqs(path, moltype="dna", array_align=False)
    self.assertEqual(got.moltype.label, "dna")
    self.assertIsInstance(got, Alignment)
def test_dotplot_regression(self):
    """Tests whether dotplot produces traces and in correct ordering.

    Also tests that pop_trace() removes the named trace.
    """
    aln = load_aligned_seqs("data/brca1.fasta", moltype="dna")
    aln = aln.take_seqs(["Human", "Chimpanzee"])
    aln = aln[:200]
    dp = aln.dotplot()
    # NOTE(review): presumably accessing .figure is what builds dp.traces -- confirm
    _ = dp.figure
    trace_names = dp.get_trace_titles()
    self.assertTrue(
        trace_names != [] and len(trace_names) == len(dp.traces),
        "No traces found for dotplot",
    )
    # Compare the two sequences directly. Wrapping a list of per-element
    # comparisons in assertTrue cannot fail, because any non-empty list is
    # truthy even when it contains False.
    self.assertEqual(
        list(trace_names),
        [trace["name"] for trace in dp.traces],
        "Order of traces don't match with get_trace_titles()",
    )
    for trace_name in trace_names:
        dp.pop_trace(trace_name)
        self.assertNotIn(
            trace_name,
            dp.get_trace_titles(),
            "Trace name still present in get_trace_titles() even after popping off trace",
        )
def test_get_motif_probs_by_node_mg94(self):
    """handles different statespace dimensions from process and stationary distribution"""
    from cogent3.evolve.models import get_model

    aln = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
    aln = aln.no_degenerates(motif_length=3)
    tree = load_tree("data/primates_brca1.tree")
    # one row per entry of the tree's edge vector, 61 columns
    expected_shape = (len(tree.get_edge_vector()), 61)

    # root mprobs are constant
    sm = get_model("MG94HKY")
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    # the return value is not checked; the call is kept so that an exception
    # raised here still fails the test
    lf.get_motif_probs()
    mprobs = lf.get_motif_probs_by_node()
    self.assertEqual(mprobs.shape, expected_shape)

    # root mprobs are variable
    # The model must keep optimise_motif_probs=True: rebuilding it without the
    # flag merely repeats the constant case above.
    sm = get_model("MG94HKY", optimise_motif_probs=True)
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    mprobs = lf.get_motif_probs_by_node()
    self.assertEqual(mprobs.shape, expected_shape)

    # not implemented for monomers variant
    sm = TimeReversibleCodon(mprob_model="monomers", model_gaps=False, recode_gaps=True)
    lf = sm.make_likelihood_function(tree)
    lf.set_alignment(aln)
    with self.assertRaises(NotImplementedError):
        lf.get_motif_probs_by_node()
def test_deserialise_likelihood_function(self):
    """likelihood functions survive a to_rich_dict / deserialise round trip"""

    def check_roundtrip(original):
        # the restored function must define mprobs and alignment identically
        restored = deserialise_likelihood_function(original.to_rich_dict())
        self.assertEqual(
            str(original.defn_for["mprobs"]), str(restored.defn_for["mprobs"])
        )
        self.assertEqual(
            str(original.defn_for["alignment"].assignments),
            str(restored.defn_for["alignment"].assignments),
        )

    full = load_aligned_seqs(
        filename=os.path.join(os.getcwd(), "data", "brca1_5.paml")
    )
    midpoint = len(full) // 2
    first_half = full[:midpoint]
    second_half = full[midpoint:]
    tree = make_tree(tip_names=full.names)

    # multiple alignments, one per named locus
    multi_lf = get_model("HKY85").make_likelihood_function(
        tree, loci=["1st-half", "2nd-half"]
    )
    multi_lf.set_alignment([first_half, second_half])
    check_roundtrip(multi_lf)

    # single alignment
    single_lf = get_model("HKY85").make_likelihood_function(tree)
    single_lf.set_alignment(first_half)
    check_roundtrip(single_lf)
def test_logdet_pair_aa(self):
    """logdet distance estimation completes on translated (protein) sequences"""
    dna_aln = load_aligned_seqs("data/brca1_5.paml", moltype=DNA)
    protein_aln = dna_aln.get_translation()
    calc = LogDetPair(moltype=PROTEIN, alignment=protein_aln)
    calc.run(use_tk_adjustment=True, show_progress=False)
    # no value is checked: the test passes as long as nothing raises
    calc.get_pairwise_distances()
def test_paralinear_pair_aa(self):
    """paralinear distance estimation completes on translated (protein) sequences"""
    dna_aln = load_aligned_seqs("data/brca1_5.paml", moltype=DNA)
    protein_aln = dna_aln.get_translation()
    calc = ParalinearPair(moltype=PROTEIN, alignment=protein_aln)
    calc.run(show_progress=False)
    # no value is checked: the test passes as long as nothing raises
    calc.get_pairwise_distances()
def test_composable_apps(self):
    """fast_slow_dist and quick_tree can be composed and run as a single app"""
    dna_aln = load_aligned_seqs(os.path.join(data_path, "brca1_5.paml"), moltype=DNA)
    dist_app = dist.fast_slow_dist(fast_calc="hamming", moltype="dna")
    nj_app = tree_app.quick_tree(drop_invalid=False)
    proc = dist_app + nj_app

    # string form of the composed app
    expected_repr = (
        "fast_slow_dist(type='distance', distance=None, moltype='dna',\n"
        "fast_calc='hamming', slow_calc=None) + quick_tree(type='tree',\n"
        "drop_invalid=False)"
    )
    self.assertEqual(str(proc), expected_repr)

    # the composed object is the right-hand app, wired to the left-hand one
    self.assertIsInstance(proc, tree_app.quick_tree)
    self.assertEqual(proc._type, "tree")
    self.assertIsInstance(proc.input, dist.fast_slow_dist)
    self.assertIs(proc.output, None)
    self.assertIsInstance(proc._input_types, frozenset)
    self.assertIsInstance(proc._output_types, frozenset)
    self.assertIsInstance(proc._in, dist.fast_slow_dist)
    self.assertIs(proc._out, None)

    # running on a real alignment yields a tree covering every sequence
    real_tree = proc(dna_aln)
    self.assertIsInstance(real_tree, PhyloNode)
    self.assertIsNotNone(real_tree.children)
    self.assertEqual(set(real_tree.get_tip_names()), set(dna_aln.names))

    # tests when distances contain None
    divergent = {
        "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
    }
    divergent_aln = make_aligned_seqs(data=divergent, moltype=DNA)
    self.assertIsInstance(proc(divergent_aln), NotCompleted)
def test_dotplot_regression(self):
    """Tests whether dotplot produces traces and in correct ordering."""
    aln = load_aligned_seqs("data/brca1.fasta", moltype="dna")
    aln = aln.take_seqs(["Human", "Chimpanzee"])
    aln = aln[:200]
    dp = aln.dotplot()
    # NOTE(review): presumably accessing .figure is what builds dp.traces -- confirm
    _ = dp.figure
    trace_names = [tr.name for tr in dp.traces]
    # trace_names is derived from dp.traces, so only emptiness needs checking
    self.assertNotEqual(trace_names, [], "No traces found for dotplot")
    # Attribute access and key access must agree, in order. Wrapping a list
    # of per-element comparisons in assertTrue cannot fail, because any
    # non-empty list is truthy even when it contains False.
    self.assertEqual(
        trace_names,
        [tr["name"] for tr in dp.traces],
        "Order of traces don't match with dp traces",
    )
    for trace_name in trace_names:
        index = [tr.name for tr in dp.traces].index(trace_name)
        dp.traces.pop(index)
        self.assertNotIn(
            trace_name,
            [tr.name for tr in dp.traces],
            "Trace name still present in dp traces even after popping off trace",
        )
def test_logdet_pair_dna(self):
    """logdet distances for DNA agree with the reference values (MEGA)"""
    aln = load_aligned_seqs("data/brca1_5.paml", moltype=DNA)
    calc = LogDetPair(moltype=DNA, alignment=aln)
    calc.run(use_tk_adjustment=True, show_progress=False)
    observed = calc.get_pairwise_distances().to_dict()

    # each reference distance is listed once and registered under both orderings
    reference = (
        ("Human", "NineBande", 0.075336929999999996),
        ("NineBande", "DogFaced", 0.0898575452),
        ("DogFaced", "Human", 0.1061747919),
        ("HowlerMon", "DogFaced", 0.0934480008),
        ("Mouse", "HowlerMon", 0.26422862920000001),
        ("HowlerMon", "NineBande", 0.062202897899999998),
        ("Mouse", "Human", 0.26539976700000001),
        ("HowlerMon", "Human", 0.036571181899999999),
        ("DogFaced", "Mouse", 0.2652555144),
        ("NineBande", "Mouse", 0.22754789210000001),
    )
    all_expected = {}
    for first, second, value in reference:
        all_expected[(first, second)] = value
        all_expected[(second, first)] = value

    for pair, got in observed.items():
        assert_allclose(got, all_expected[pair])
def test_neutral_nstat_model(self):
    """natsel_neutral with the non-stationary GNC codon model"""
    opt_settings = {"max_evaluations": 2, "limit_action": "ignore"}
    aln = load_aligned_seqs("data/ENSG00000198712.fa", moltype="dna")
    app = evo_app.natsel_neutral("GNC", opt_args=opt_settings, gc=2)
    result = app(aln)
    # 11 rate matrix params for GNC (omega omitted in null), 3 edges
    num_edges, num_rate_params = 3, 11
    self.assertEqual(result.null.lf.nfp, num_edges + num_rate_params)
def test_quick_tree(self):
    """quick_tree builds a tree whose tips are the aligned sequence names"""
    aln = load_aligned_seqs(os.path.join(data_path, "brca1_5.paml"), moltype=DNA)
    calc_dists = dist.fast_slow_dist(fast_calc="hamming", moltype="dna")
    pairwise = calc_dists(aln)
    tree_builder = tree_app.quick_tree()
    built = tree_builder.quick_tree(pairwise)
    self.assertEqual(set(built.get_tip_names()), set(aln.names))
def test_write_unknown_raises(self):
    """write() raises FileFormatError when the output format is not recognised"""
    aln = load_aligned_seqs(os.path.join(data_path, "primates_brca1.fasta"))
    # keyword arguments for each write() call that must be rejected
    rejected_calls = (
        {"filename": "blah"},
        {"filename": "blah.txt"},
        {"filename": "blah.fasta", "format": "noway"},
    )
    for kwargs in rejected_calls:
        with self.assertRaises(FileFormatError):
            aln.write(**kwargs)
def test_logdet_tk_adjustment(self):
    """logdet distances change when the tamura kumar adjustment is toggled"""
    aln = load_aligned_seqs("data/brca1_5.paml", moltype=DNA)
    calc = LogDetPair(moltype=DNA, alignment=aln)
    results = []
    # same calculator, run first with and then without the adjustment
    for adjust in (True, False):
        calc.run(use_tk_adjustment=adjust, show_progress=False)
        results.append(calc.get_pairwise_distances())
    with_tk, without_tk = results
    self.assertNotEqual(with_tk, without_tk)
def test_ml(self):
    """ML tree search returns the expected log-likelihood and topology"""
    from numpy.testing import assert_allclose

    fasta_path = os.path.join(data_path, "brca1.fasta")
    aln = (
        load_aligned_seqs(fasta_path, moltype="dna")
        .take_seqs(["Human", "Mouse", "Rat", "Dog"])
        .omit_gap_pos(allowed_gap_frac=0)
    )
    searcher = ML(get_model("JC69"), aln)
    log_like, best_tree = searcher.trex(a=3, k=1, show_progress=False)
    assert_allclose(log_like, -8882.217502905267)
    expected_tree = make_tree("(Mouse,Rat,(Human,Dog));")
    self.assertTrue(best_tree.same_topology(expected_tree))
def test_bstrap_parallel(self):
    """bootstrap app runs with parallel=True and returns a bootstrap_result"""
    aln = load_aligned_seqs(join(data_dir, "brca1.fasta"), moltype="dna")
    first_three = aln.names[:3]
    aln = aln.take_seqs(first_three).omit_gap_pos(allowed_gap_frac=0)
    settings = {"max_evaluations": 20, "limit_action": "ignore"}
    f81 = evo_app.model("F81", opt_args=settings)
    hky85 = evo_app.model("HKY85", opt_args=settings)
    hypothesis = evo_app.hypothesis(f81, hky85)
    bootstrapper = evo_app.bootstrap(hypothesis, num_reps=2, parallel=True)
    self.assertIsInstance(bootstrapper(aln), evo_app.bootstrap_result)
def load_alignment(annotate1=False, annotate2=False):
    """Load the brca1_5 alignment, optionally annotating its first two sequences.

    annotate1 adds two features to the sequence named by aln.names[0];
    annotate2 adds two features to the sequence named by aln.names[1].
    Returns the alignment after omit_gap_pos().
    """
    data_file = pathlib.Path(__file__).parent.parent / "data/brca1_5.paml"
    aln = load_aligned_seqs(str(data_file), array_align=False, moltype="dna")
    aln = aln.omit_gap_pos()
    # (enabled flag, index into aln.names, add_feature argument tuples)
    plan = (
        (
            annotate1,
            0,
            (("gene", "abcde1", [(20, 50)]), ("variation", "one", [(11, 12)])),
        ),
        (
            annotate2,
            1,
            (("gene", "abcde2", [(20, 50)]), ("domain", "abcde2", [(10, 15)])),
        ),
    )
    for enabled, index, features in plan:
        if not enabled:
            continue
        for feature_args in features:
            aln.get_seq(aln.names[index]).add_feature(*feature_args)
    return aln
def test_zhang_mtseq(self):
    """natsel_zhang honours the genetic code argument"""
    from cogent3.app.composable import NotCompleted

    opt_settings = {"max_evaluations": 20, "limit_action": "ignore"}
    aln = load_aligned_seqs("data/ENSG00000198712.fa", moltype="dna")

    # gc=2 produces a completed result for this alignment
    app = evo_app.natsel_zhang("CNFGTR", tip1="Human", opt_args=opt_settings, gc=2)
    completed = app(aln)
    self.assertEqual(completed.df, 3)

    # but if provide wrong gc, get NotCompleted
    app = evo_app.natsel_zhang("CNFGTR", tip1="Human", opt_args=opt_settings, gc=1)
    failed = app(aln)
    self.assertIsInstance(failed, NotCompleted)
def test_neutral_mtdna(self):
    """natsel_neutral honours the genetic code argument"""
    from cogent3.app.composable import NotCompleted

    opt_settings = {"max_evaluations": 2, "limit_action": "ignore"}
    aln = load_aligned_seqs("data/ENSG00000198712.fa", moltype="dna")

    # gc=2 produces a completed result for this alignment
    app = evo_app.natsel_neutral("MG94HKY", opt_args=opt_settings, gc=2)
    completed = app(aln)
    self.assertEqual(completed.df, 1)

    # not completed if wrong gc
    app = evo_app.natsel_neutral("MG94HKY", opt_args=opt_settings, gc=1)
    failed = app(aln)
    self.assertIsInstance(failed, NotCompleted)
def _loadfromfile(self, filename, test_write=True, **kw):
    """Load an alignment from the data directory and optionally write it back out.

    filename is joined onto data_path; **kw is forwarded to load_aligned_seqs.
    When test_write is true, the alignment is written to a temporary location
    twice: once with a str path and once with a pathlib.Path.
    """
    filename = os.path.join(data_path, filename)
    aln = load_aligned_seqs(filename, **kw)
    if not test_write:
        return

    suffix, cmpr = get_format_suffixes(filename)
    cmpr = f".{cmpr}" if cmpr else ""
    # tempfile.mktemp is deprecated and race-prone; a private temporary
    # directory gives a safe, unused path and is cleaned up even if a
    # write raises
    with tempfile.TemporaryDirectory() as dirname:
        fn = os.path.join(dirname, f"roundtrip.{suffix}{cmpr}")
        aln.write(filename=fn)
        # removing the file also confirms it was created at this exact path
        os.remove(fn)
        # now use pathlib
        fn = pathlib.Path(fn)
        aln.write(filename=fn)
        fn.unlink()
def test_progressive_align_protein_moltype(self):
    """progressive_align completes on protein data when no guide tree is given"""
    from cogent3 import load_aligned_seqs

    seqs = load_aligned_seqs("data/nexus_aa.nxs", moltype="protein")
    seqs = seqs.degap().take_seqs(["Rat", "Cow", "Human", "Mouse", "Whale"])
    # both model names must yield a result that is not NotCompleted
    for model_name in ("WG01", "protein"):
        aligner = align_app.progressive_align(model=model_name)
        self.assertNotIsInstance(aligner(seqs), NotCompleted)
def test_neutral(self):
    """test of neutrality, one omega != 1"""
    opt = dict(max_evaluations=20, limit_action="ignore")
    tree_path = "data/primate_brca1.tree"
    aln = load_aligned_seqs("data/primate_brca1.fasta", moltype="dna")
    neutral = evo_app.natsel_neutral("MG94HKY", tree=tree_path, opt_args=opt)
    result = neutral(aln)
    self.assertEqual(result.df, 1)
    # result keys are the model name followed by -null / -alt; assertIn shows
    # the container on failure instead of a bare "False is not true"
    self.assertIn("MG94HKY-null", result)
    self.assertIn("MG94HKY-alt", result)

    # fails if not a codon model
    with self.assertRaises(ValueError):
        evo_app.natsel_neutral("F81", tree=tree_path)
def _loadfromfile(self, filename, test_write=True, **kw):
    """Load an alignment from the data directory and optionally write it back out.

    filename is joined onto data_path; **kw is forwarded to load_aligned_seqs.
    When test_write is true, the alignment is written to a temporary file
    carrying the same format suffix (and compression suffix, if any).
    """
    filename = os.path.join(data_path, filename)
    aln = load_aligned_seqs(filename, **kw)
    if not test_write:
        return

    # split the name into its format suffix and the compression suffix
    # matched by _compression (empty when there is no match)
    r = _compression.search(filename)
    if r:
        cmpr = filename[r.start() :]
        suffix = filename[: r.start()].split(".")[-1]
    else:
        suffix = filename.split(".")[-1]
        cmpr = ""

    # tempfile.mktemp is deprecated and race-prone; a private temporary
    # directory gives a safe, unused path and is cleaned up even if the
    # write raises
    with tempfile.TemporaryDirectory() as dirname:
        fn = os.path.join(dirname, "roundtrip." + suffix + cmpr)
        aln.write(filename=fn)
        # removing the file also confirms it was created at this exact path
        os.remove(fn)
def test_natsel_sitehet_mprob(self):
    """natsel_sitehet applies the gc and optimise_motif_probs arguments"""
    opt_settings = {"max_evaluations": 2, "limit_action": "ignore"}
    aln = load_aligned_seqs("data/ENSG00000198712.fa", moltype="dna")
    # optimising root probs
    app = evo_app.natsel_sitehet(
        "MG94HKY", opt_args=opt_settings, gc=2, optimise_motif_probs=True
    )
    # The genetic code is tested implicitly: with the wrong code this mtDNA
    # gene would give NotCompleted, which has no .null attribute.
    result = app(aln)
    # 3 edges, 1 kappa, 1 omega, 1 bprob, 3 mprob
    expected_nfp = 3 + 1 + 1 + 1 + 3
    self.assertEqual(result.null.lf.nfp, expected_nfp)
def test_load_aligned_seqs_from_json(self):
    """load_aligned_seqs restores alignments that were written as json"""
    with TemporaryDirectory(dir=".") as dirname:

        def in_tmp(name):
            # path of a scratch file inside the temporary directory
            return os.path.join(dirname, name)

        path = os.path.join(data_path, "brca1_5.paml")

        # write one json file per alignment class
        alignment = load_aligned_seqs(path, array_align=False, moltype="dna")
        alignment_json_path = in_tmp("alignment.json")
        alignment.write(alignment_json_path)
        array_alignment = load_aligned_seqs(path, moltype="dna")
        array_alignment_json_path = in_tmp("array_alignment.json")
        array_alignment.write(array_alignment_json_path)

        # tests case Alignment
        got = load_aligned_seqs(alignment_json_path)
        self.assertIsInstance(got, Alignment)
        self.assertEqual(got.moltype.label, "dna")
        self.assertEqual(got.to_dict(), alignment.to_dict())
        self.assertEqual(got.info["source"], path)

        # tests case ArrayAlignment
        got = load_aligned_seqs(array_alignment_json_path)
        self.assertIsInstance(got, ArrayAlignment)
        self.assertEqual(got.moltype.label, "dna")
        self.assertEqual(got.to_dict(), array_alignment.to_dict())
        self.assertEqual(got.info["source"], path)

        # tests json generated by make_record_for_json
        uncompleted_record = make_record_for_json("delme", got, False)
        completed_record = make_record_for_json("delme", got, True)
        uncompleted_record_path = in_tmp("uncompleted_record.json")
        completed_record_path = in_tmp("completed_record.json")
        to_write = (
            (uncompleted_record_path, uncompleted_record),
            (completed_record_path, completed_record),
        )
        for record_path, record in to_write:
            with open(record_path, "w") as out:
                out.write(json.dumps(record))

        # tests when provided record json file is uncompleted
        # NOTE(review): this calls load_unaligned_seqs although the test targets
        # load_aligned_seqs -- confirm which loader was intended
        with self.assertRaises(TypeError):
            load_unaligned_seqs(uncompleted_record_path)

        # tests when provided record json is completed
        got = load_aligned_seqs(completed_record_path)
        self.assertIsInstance(got, ArrayAlignment)
        self.assertEqual(got.to_dict(), array_alignment.to_dict())
        self.assertEqual(got.info["source"], path)

        # tests wrong input json file
        json_path = in_tmp("unaligned.json")
        unaligned = load_unaligned_seqs(path)
        unaligned.write(json_path)
        with self.assertRaises(TypeError):
            load_aligned_seqs(json_path)
def test_neutral_mprobs(self):
    """natsel_neutral free-parameter count reflects optimise_motif_probs"""
    opt_settings = {"max_evaluations": 2, "limit_action": "ignore"}
    aln = load_aligned_seqs("data/ENSG00000198712.fa", moltype="dna")
    # (extra keyword arguments, expected nfp of the null model)
    cases = (
        ({}, 4),  # default, not optimising root probs
        ({"optimise_motif_probs": True}, 7),  # optimising root probs
    )
    for extra, expected_nfp in cases:
        app = evo_app.natsel_neutral("MG94HKY", opt_args=opt_settings, gc=2, **extra)
        result = app(aln)
        self.assertEqual(result.null.lf.nfp, expected_nfp)
def test_roundtrip_het_lf(self):
    """a site-het likelihood function keeps its lnL across a json round trip"""
    with open("data/site-het-param-rules.json") as infile:
        all_rules = json.load(infile)

    aln = load_aligned_seqs("data/primates_brca1.fasta", moltype="dna")
    tree = load_tree("data/primates_brca1.tree")
    selected = all_rules.pop("phylohmm-gamma-kappa")
    model = get_model("HKY85", ordered_param="rate", distribution="gamma")
    original = model.make_likelihood_function(tree, bins=4, sites_independent=False)
    original.set_alignment(aln)
    original.apply_param_rules(selected["rules"])

    restored = deserialise_object(original.to_json())
    assert_allclose(original.lnL, restored.lnL)
def test_bstrap(self):
    """exercising bootstrap with simple hypothesis"""
    aln = load_aligned_seqs(join(data_dir, "brca1.fasta"), moltype="dna")
    aln = aln.take_seqs(aln.names[:3])
    aln = aln.omit_gap_pos(allowed_gap_frac=0)
    opt_args = dict(max_evaluations=20, limit_action="ignore")
    m1 = evo_app.model("F81", opt_args=opt_args)
    m2 = evo_app.model("HKY85", opt_args=opt_args)
    hyp = evo_app.hypothesis(m1, m2)
    strapper = evo_app.bootstrap(hyp, num_reps=2, parallel=False)
    result = strapper(aln)

    # Every null-distribution value must be a float. The second positional
    # argument of assertTrue is the failure message, so passing {float} there
    # compares nothing. isinstance is used so float subclasses also pass.
    nd = result.null_dist
    self.assertGreater(len(nd), 0)
    for value in nd:
        self.assertIsInstance(value, float)

    # local name avoids shadowing the json module
    serialised = result.to_json()
    got = deserialise_object(serialised)
    self.assertIsInstance(got, evo_app.bootstrap_result)
def test_natsel_sitehet(self):
    """natsel_sitehet builds null and alt models with the expected parameter counts"""
    opt_settings = {"max_evaluations": 2, "limit_action": "ignore"}
    tree_path = "data/primate_brca1.tree"
    aln = load_aligned_seqs("data/primate_brca1.fasta", moltype="dna")
    # default, not optimising root probs
    app = evo_app.natsel_sitehet("MG94HKY", tree=tree_path, opt_args=opt_settings)
    result = app(aln)
    # one free param for each edge, 1 for kappa, 1 for omega, 1 for bprobs
    self.assertEqual(result.null.lf.nfp, 14)
    # plus one extra bprob and one extra omega
    self.assertEqual(result.alt.lf.nfp, 16)
    # fails if not a codon model
    with self.assertRaises(ValueError):
        evo_app.natsel_sitehet("F81", tree=tree_path)
def test_zhang(self):
    """natsel_zhang correctly configured and should not fail"""
    opt = dict(max_evaluations=20, limit_action="ignore")
    tree_path = "data/primate_brca1.tree"
    aln = load_aligned_seqs("data/primate_brca1.fasta", moltype="dna")

    natsel = evo_app.natsel_zhang(
        "CNFGTR",
        tree=tree_path,
        tip1="Human",
        tip2="Chimpanzee",
        opt_args=opt,
    )
    result = natsel(aln)
    self.assertEqual(result.df, 3)
    self.assertEqual(result.alt.nfp, 21)
    # the naming scheme is model name followed by null/alt; assertIn shows
    # the container on failure instead of a bare "False is not true"
    self.assertIn("CNFGTR-null", result)
    self.assertIn("CNFGTR-alt", result)

    # result keys correct when given a model
    Y98 = get_model("Y98")
    natsel = evo_app.natsel_zhang(
        Y98,
        tree=tree_path,
        tip1="Human",
        tip2="Chimpanzee",
        opt_args=opt,
    )
    result = natsel(aln)
    self.assertEqual(result.df, 3)
    self.assertIn("Y98-null", result)
    self.assertIn("Y98-alt", result)

    # fails if not a codon model
    with self.assertRaises(ValueError):
        evo_app.natsel_zhang(
            "F81",
            tree=tree_path,
            tip1="Human",
            tip2="Chimpanzee",
            opt_args=opt,
        )

    # fails if no tip names provided
    with self.assertRaises(ValueError):
        evo_app.natsel_zhang("Y98", tree=tree_path, opt_args=opt)