def test_model_result_alignment_split_pos_model(self):
    """alignment of a split-codon result is indexed by codon position

    For positions 1..3, ``result.alignment[position]`` must equal every
    third column of the input alignment starting at ``position - 1``.
    """
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    opt_args = dict(max_evaluations=5, limit_action="ignore")
    app = evo_app.model(
        "F81", split_codons=True, show_progress=False, opt_args=opt_args
    )
    result = app(aln)
    for offset, position in enumerate((1, 2, 3)):
        got = result.alignment[position]
        expect = aln[offset::3]
        self.assertEqual(got.to_dict(), expect.to_dict())
def test_model_result_tree_split_pos_model(self):
    """returns tree from lf with split codon positions

    With ``split_codons=True`` the result's ``tree`` attribute is a mapping
    with one tree per codon position, and the fitted trees are expected to
    differ in total length.
    """
    _data = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=_data, moltype="dna")
    mod = evo_app.model(
        "F81",
        split_codons=True,
        show_progress=False,
        opt_args=dict(max_evaluations=55, limit_action="ignore"),
    )
    result = mod(aln)
    # assertTrue(x, 3) would treat 3 as the failure message and pass for any
    # non-zero length, so compare the count explicitly
    self.assertEqual(len(result.tree), 3)
    # check the trees are different by summing lengths
    lengths = {t.total_length() for _, t in result.tree.items()}
    self.assertGreater(len(lengths), 1)
def test_set_motif_probs(self):
    """Mprobs supplied to the parameter controller

    Exercises direct assignment through ``set_motif_probs`` and estimation
    through ``set_motif_probs_from_data`` (constant, with a pseudocount, and
    with ambiguous states included).
    """

    def assert_mprobs_close(observed, expected):
        # handle min val
        for motif, value in observed.items():
            self.assertFloatEqual(value, expected[motif], eps=3e-6)

    sub_model = cogent3.evolve.substitution_model.TimeReversibleNucleotide(
        model_gaps=True, motif_probs=None
    )
    lf = sub_model.make_likelihood_function(
        self.tree, motif_probs_from_align=False
    )

    supplied = {"A": 0.1, "C": 0.2, "G": 0.2, "T": 0.5, "-": 0.0}
    lf.set_motif_probs(supplied)
    # note the LF adjusts motif probs so they are all >= 1e-6
    assert_mprobs_close(lf.get_motif_probs().to_dict(), supplied)

    lf.set_motif_probs_from_data(self.al[:1], is_constant=True)
    self.assertFloatEqual(lf.get_motif_probs()["G"], 0.6, eps=3e-6)

    lf.set_motif_probs_from_data(self.al[:1], pseudocount=1)
    self.assertNotEqual(lf.get_motif_probs()["G"], 0.6)

    # test with consideration of ambiguous states
    ambig_aln = make_aligned_seqs(
        data={"seq1": "ACGTAAGNA", "seq2": "ACGTANGTC", "seq3": "ACGTACGTG"}
    )
    lf.set_motif_probs_from_data(
        ambig_aln, include_ambiguity=True, is_constant=True
    )
    estimated = dict(lf.get_motif_probs())
    expected = {
        "A": 8.5 / 27,
        "C": 5.5 / 27,
        "-": 0.0,
        "T": 5.5 / 27,
        "G": 7.5 / 27,
    }
    assert_mprobs_close(estimated, expected)
    self.assertFloatEqual(sum(estimated.values()), 1.0)
def test_aln_to_ref_known(self):
    """correctly recapitulates known case

    Degaps a known alignment, re-aligns it against "Ref" and expects the
    original gapped sequences back.
    """
    known = {
        "Ref": "CAG---GAGAACAGAAACCCAT--TACTCACT",
        "Qu1": "CAG---GAGAACAG---CCCGTGTTACTCACT",
        "Qu2": "CAGCATGAGAACAGAAACCCGT--TA---ACT",
        "Qu3": "CAGCATGAGAACAGAAACCCGT----CTCACT",
        "Qu4": "CAGCATGAGAACAGAAACCCGTGTTACTCACT",
        "Qu5": "CAG---GAGAACAG---CCCAT--TACTCACT",
        "Qu6": "CAG---GA-AACAG---CCCAT--TACTCACT",
        "Qu7": "CAG---GA--ACAGA--CCCGT--TA---ACT",
    }
    orig = make_aligned_seqs(known, moltype="dna")
    expect = orig.to_dict()
    app = align_app.align_to_ref(ref_seq="Ref")
    realigned = app(orig.degap())
    self.assertEqual(realigned.to_dict(), expect)
def BestLogLikelihood(
    aln,
    alphabet=None,
    exclude_chars=None,
    allowed_chars="ACGT",
    motif_length=None,
    return_length=False,
):
    """returns the best log-likelihood according to Goldman 1993.

    Parameters
    ----------
    aln
        an alignment; it is rebuilt from ``aln.to_dict()`` before use
    alphabet
        a sequence alphabet object. When given, its moltype is used to
        rebuild the alignment and its motif length overrides motif_length.
    motif_length
        1 for nucleotide, 2 for dinucleotide, etc ..
    exclude_chars
        a series of characters used to exclude motifs
    allowed_chars
        only motifs that contain a subset of these are allowed
    return_length
        whether to also return the number of alignment columns

    Returns
    -------
    The log-likelihood, or the tuple (log-likelihood, number of columns)
    when return_length is True.
    """
    assert (
        alphabet or motif_length
    ), "Must provide either an alphabet or a motif_length"

    # need to use the alphabet, so we can enforce character compliance
    make_kwargs = {}
    if alphabet:
        make_kwargs["moltype"] = alphabet.moltype
        motif_length = alphabet.get_motif_len()

    aln = make_aligned_seqs(aln.to_dict(), **make_kwargs)
    columns = aligned_columns_to_rows(
        aln, motif_length, exclude_chars, allowed_chars
    )
    num_cols = len(columns)
    log_likelihood = get_G93_lnL_from_array(columns)
    return (log_likelihood, num_cols) if return_length else log_likelihood
def test_roundtrip_likelihood_function(self):
    """likelihood function.to_json enables roundtrip

    The deserialised likelihood function must report the same
    log-likelihood as the one it was serialised from.
    """
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    tree = make_tree(tip_names=aln.names)
    lf = get_model("HKY85").make_likelihood_function(tree)
    lf.set_alignment(aln)
    # give every edge its own starting kappa
    for edge, kappa in zip(aln.names, (2, 3, 4)):
        lf.set_param_rule("kappa", edge=edge, init=kappa)
    expect = lf.get_log_likelihood()
    restored = deserialise_object(lf.to_json())
    self.assertFloatEqual(restored.get_log_likelihood(), expect)
def test_roundtrip_discrete_time_likelihood_function(self):
    """discrete time likelihood function.to_json enables roundtrip

    Fits the "BH" model briefly, serialises it and checks the deserialised
    object reports the same log-likelihood.
    """
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    tree = make_tree(tip_names=aln.names)
    lf = get_model("BH").make_likelihood_function(tree)
    lf.set_alignment(aln)
    lf.optimise(max_evaluations=25, limit_action="ignore", show_progress=False)
    expect = lf.get_log_likelihood()
    restored = deserialise_object(lf.to_json())
    self.assertFloatEqual(restored.get_log_likelihood(), expect)
def test_user_function_multiple(self):
    """user defined composable functions should not interfere with each other

    Two user_function apps wrapping ``self.foo`` and ``self.bar`` are built
    before either is called; each must still return its own result.
    """
    from cogent3 import make_aligned_seqs
    from cogent3.core.alignment import Alignment

    app_foo = user_function(self.foo, "aligned", "aligned")
    app_bar = user_function(self.bar, "aligned", "pairwise_distances")

    aln_foo = make_aligned_seqs(
        data=[("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")]
    )
    aln_bar = Alignment(
        data={"s1": "ACGTACGTA", "s2": "GTGTACGTA"}, moltype="dna"
    )

    got_foo = app_foo(aln_foo)
    got_bar = app_bar(aln_bar)

    self.assertEqual(got_foo.to_dict(), {"a": "GCAA", "b": "GCTT"})
    self.assertEqual(got_bar, {("s1", "s2"): 2.0, ("s2", "s1"): 2.0})
def test_roundtrip_rc_annotated_align(self):
    """should work for an alignment that has been reverse complemented

    Sequences and annotation slices must survive a to_json /
    deserialise_object round trip of the reverse complemented alignment.
    """

    def slices_by_name(alignment):
        return {
            annot.name: annot.get_slice()
            for annot in alignment.get_annotations_from_any_seq()
        }

    # the key that exposed the bug was a gap in the middle of the sequence
    aln = make_aligned_seqs(
        data=[["x", "-AAAGGGGGAAC-CT"], ["y", "TTTT--TTTTAGGGA"]],
        array_align=False,
        moltype="dna",
    )
    aln.get_seq("x").add_annotation(Feature, "exon", "E1", [(3, 8)])
    aln.get_seq("x").add_annotation(Feature, "exon", "E2", [(10, 13)])

    raln = aln.rc()
    serialised = raln.to_json()
    got = deserialise_object(serialised)
    self.assertEqual(got.to_dict(), raln.to_dict())

    orig_annots = slices_by_name(raln)
    got_annots = slices_by_name(got)
    self.assertEqual(got_annots, orig_annots)
def test_tabulate(self):
    """call returns tabular_result with Tables

    The tabulated stats of a fitted "GN" model must hold exactly three
    entries, each a Table, under the expected titles.
    """
    from cogent3.util.table import Table

    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    opt_args = dict(max_evaluations=25, limit_action="ignore")
    fitted = evo_app.model("GN", opt_args=opt_args)(aln)
    tabulated = evo_app.tabulate_stats()(fitted)

    self.assertEqual(len(tabulated), 3)
    for title in ("motif params", "global params", "edge params"):
        self.assertIn(title, tabulated)
        self.assertIsInstance(tabulated[title], Table)
def test_logdet_for_determinant_lte_zero(self):
    """distance is NaN for this pair unless invalid_raises is set

    With or without the tk adjustment the reported distance must be NaN;
    with ``invalid_raises=True`` an ArithmeticError is expected instead.
    """
    seqs = {
        "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
    }
    aln = make_aligned_seqs(data=seqs, moltype=DNA)

    calc = LogDetPair(moltype=DNA, alignment=aln)
    for use_tk in (True, False):
        calc.run(use_tk_adjustment=use_tk, show_progress=False)
        dists = calc.get_pairwise_distances().to_dict()
        self.assertTrue(numpy.isnan(list(dists.values())[0]))

    # but raises ArithmeticError if told to
    strict_calc = LogDetPair(moltype=DNA, alignment=aln, invalid_raises=True)
    with self.assertRaises(ArithmeticError):
        strict_calc.run(use_tk_adjustment=True, show_progress=False)
def test_paralinear_pair_dna(self):
    """calculate paralinear distance consistent with logdet distance

    Runs both calculators on the same alignment and compares their
    ``dists`` and ``variances`` entries at index ``[1, 1]``.
    """
    seq1 = "TAATTCATTGGGACGTCGAATCCGGCAGTCCTGCCGCAAAAGCTTCCGGAATCGAATTTTGGCA"
    seq2 = "AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGG"
    aln = make_aligned_seqs(data=[("seq1", seq1), ("seq2", seq2)], moltype=DNA)

    paralinear = ParalinearPair(moltype=DNA, alignment=aln)
    paralinear.run(show_progress=False)

    logdet = LogDetPair(moltype=DNA, alignment=aln)
    logdet.run(show_progress=False)

    self.assertEqual(logdet.dists[1, 1], paralinear.dists[1, 1])
    self.assertEqual(paralinear.variances[1, 1], logdet.variances[1, 1])
def test_split_codon_model_result_json(self):
    """round trip split_codon result

    The likelihood function stored under key 1 must have the same lnL
    after the result is serialised to json and deserialised.
    """
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    tree = make_tree(tip_names=aln.names)
    opt_args = dict(max_evaluations=5, limit_action="ignore")
    app = evo_app.model("F81", tree=tree, split_codons=True, opt_args=opt_args)
    result = app(aln)
    first_lf = result.lf[1]
    serialised = result.to_json()
    restored = deserialise_object(serialised)
    assert_allclose(restored.lf[1].lnL, first_lf.lnL)
def test_model_hypothesis_result_repr(self):
    """result objects __repr__ and _repr_html_ methods work correctly

    Both methods must return a str for the hypothesis result and for its
    ``null`` member.
    """

    def make_model(name):
        # a fresh opt_args dict per model, as each app gets its own
        return evo_app.model(
            name, opt_args=dict(max_evaluations=25, limit_action="ignore")
        )

    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    null_app = make_model("F81")
    alt_app = make_model("HKY85")
    result = evo_app.hypothesis(null_app, alt_app)(aln)

    self.assertIsInstance(result.__repr__(), str)
    self.assertIsInstance(result._repr_html_(), str)
    null_result = result.null
    self.assertIsInstance(null_result.__repr__(), str)
    self.assertIsInstance(result.null._repr_html_(), str)
def test_hyp_init(self):
    """uses user specified init_alt function, or not

    The F81 versus HKY85 comparison must report one degree of freedom
    both with the default initialisation and with a user init_alt.
    """
    opt_args = dict(max_evaluations=25, limit_action="ignore")
    null_app = evo_app.model("F81", opt_args=opt_args)
    alt_app = evo_app.model("HKY85", opt_args=opt_args)

    # defaults to using null for init
    default_hyp = evo_app.hypothesis(null_app, alt_app)
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    self.assertEqual(default_hyp(aln).df, 1)

    # user specified function
    custom_hyp = evo_app.hypothesis(null_app, alt_app, init_alt=lambda x, y: x)
    self.assertEqual(custom_hyp(aln).df, 1)
def setUp(self):
    """Initialize some variables for the tests

    Sets the amino acid abbreviation strings, the three reduced alphabets
    looked up from ``alphabets`` and two alignments built from the same
    sequence data (an ArrayAlignment and a make_aligned_seqs result).
    """
    canonical = "ACDEFGHIKLMNPQRSTVWY"
    ambiguous = "BXZ"
    self.canonical_abbrevs = canonical
    self.ambiguous_abbrevs = ambiguous
    self.all_to_a = [("A", canonical + ambiguous)]

    self.charge_2 = alphabets["charge_2"]
    self.hydropathy_3 = alphabets["hydropathy_3"]
    self.orig = alphabets["orig"]

    seqs = {"1": "CDDFBXZ", "2": "CDD-BXZ", "3": "AAAASS-"}
    # each constructor receives its own copy of the data
    self.aln = ArrayAlignment(data=dict(seqs))
    self.aln2 = make_aligned_seqs(data=dict(seqs))
def test_trim_stop_codons(self):
    """trims stop codons using the specified genetic code

    Checks the default code, an explicit ``gc=1`` on unaligned and aligned
    input, and ``gc=2`` on unaligned input.
    """
    untouched = "AAATTTCCC"

    def trimmed(app, constructor, seq2):
        # build the two-sequence input and return the app's result as a dict
        seqs = constructor(
            data={"seq1": untouched, "seq2": seq2}, moltype="dna"
        )
        return app(seqs).to_dict()

    expect_unaligned = {"seq1": "AAATTTCCC", "seq2": "AAATTT"}

    # defaults to standard code
    got = trimmed(sample.trim_stop_codons(), make_unaligned_seqs, "AAATTTTAA")
    self.assertEqual(got, expect_unaligned)

    # standard code
    got = trimmed(
        sample.trim_stop_codons(gc=1), make_unaligned_seqs, "AAATTTTAA"
    )
    self.assertEqual(got, expect_unaligned)

    # standard code, aligned input keeps its length
    got = trimmed(sample.trim_stop_codons(gc=1), make_aligned_seqs, "AAATTTTAA")
    self.assertEqual(got, {"seq1": "AAATTTCCC", "seq2": "AAATTT---"})

    # different genetic code (mt code)
    got = trimmed(
        sample.trim_stop_codons(gc=2), make_unaligned_seqs, "AAATTTAGA"
    )
    self.assertEqual(got, expect_unaligned)
def test_feature_residue(self):
    """seq features on alignment operate in sequence coordinates

    A feature added to sequence "x" is projected onto the alignment, sliced
    with and without ``as_one_span``, checked on the reverse complement and
    finally queried for its alignment coordinates.
    """
    # In this case, only those residues included within the feature are
    # covered - note the omission of the T in y opposite the gap in x.
    aln = make_aligned_seqs(
        data=[["x", "C-CCCAAAAA"], ["y", "-T----TTTT"]],
        moltype=DNA,
        array_align=False,
    )
    self.assertEqual(str(aln), ">x\nC-CCCAAAAA\n>y\n-T----TTTT\n")
    exon = aln.get_seq("x").add_feature("exon", "ex1", [(0, 4)])
    self.assertEqual(str(exon), 'exon "ex1" at [0:4]/9')
    self.assertEqual(str(exon.get_slice()), "CCCC")
    aln_exons = list(aln.get_annotations_from_seq("x", "exon"))
    self.assertEqual(str(aln_exons), '[exon "ex1" at [0:1, 2:5]/10]')
    self.assertEqual(str(aln_exons[0].get_slice()), ">x\nCCCC\n>y\n----\n")

    # Feature.as_one_span(), is applied to the exon that
    # straddles the gap in x. The result is we preserve that feature.
    self.assertEqual(
        str(aln_exons[0].as_one_span().get_slice()), ">x\nC-CCC\n>y\n-T---\n"
    )

    # These properties also are consistently replicated with reverse
    # complemented sequences.
    aln_rc = aln.rc()
    rc_exons = list(aln_rc.get_annotations_from_any_seq("exon"))
    # not using as_one_span, so gap removed from x
    self.assertEqual(str(aln_rc[rc_exons]), ">x\nCCCC\n>y\n----\n")
    self.assertEqual(
        str(aln_rc[rc_exons[0].as_one_span()]), ">x\nC-CCC\n>y\n-T---\n"
    )

    # Features can provide their coordinates, useful for custom analyses.
    all_exons = aln.get_region_covering_all(aln_exons)
    coords = all_exons.get_coordinates()
    # use the unittest assertion: a bare assert is stripped under -O
    self.assertEqual(coords, [(0, 1), (2, 5)])
def test_concat(self):
    """returns concatenated alignment

    With ``intersect=True`` only names shared by every alignment are kept;
    with ``intersect=False`` the extra name is kept and padded with "?".
    """
    shared = ("seq1", "seq2", "seq3")
    blocks = [
        dict.fromkeys(shared, "AAA"),
        dict.fromkeys(shared + ("seq4",), "TTT"),
        dict.fromkeys(shared, "CC"),
    ]
    alns = [make_aligned_seqs(data=block, moltype=DNA) for block in blocks]

    expect_shared = dict.fromkeys(shared, "AAATTTCC")
    got = sample.concat(intersect=True)(alns)
    self.assertEqual(got.to_dict(), expect_shared)

    expect_union = dict(expect_shared)
    expect_union["seq4"] = "???TTT??"
    got = sample.concat(intersect=False)(alns)
    self.assertEqual(got.to_dict(), expect_union)
def test_usage(self):
    """Alignment.counts_per_pos reports trinucleotide counts per position"""
    seqs = {
        "DogFaced": "TCATTA",
        "FalseVamp": "TCATTA",
        "FlyingFox": "TCTTTA",
        "FreeTaile": "TCATTA",
        "Horse": "TCATTG",
        "LeafNose": "TCTTTA",
        "LittleBro": "TCATTA",
        "Rhino": "TCATTG",
        "RoundEare": "TCATTA",
        "TombBat": "TCAGTA",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    got = aln.counts_per_pos(motif_length=3)
    # (position, motif) -> number of sequences with that motif
    expected = {
        (0, "TCA"): 8,
        (0, "TCT"): 2,
        (1, "TTA"): 7,
        (1, "GTA"): 1,
        (1, "TTG"): 2,
    }
    for (position, motif), count in expected.items():
        self.assertEqual(got[position, motif], count)
def test_model_result_total_length_split_codon(self):
    """returns summed branch lengths across positions when split_codons True

    ``result.total_length(length_as="ENS")`` must equal the sum of the
    total lengths of the ENS-annotated trees of every likelihood function.
    """
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    opt_args = dict(max_evaluations=25, limit_action="ignore")
    app = evo_app.model("GN", split_codons=True, opt_args=opt_args)
    result = app(aln)
    expect = sum(
        (
            lf.get_annotated_tree(length_as="ENS").total_length()
            for lf in result.lf.values()
        ),
        0.0,
    )
    got = result.total_length(length_as="ENS")
    assert_allclose(got, expect)
def setUp(self):
    """constructs _model_results if they don't already exist

    Fits F81 and HKY85 to a small alignment and stores each result in
    ``self._model_results`` under the result's name.
    """
    if self._model_results:
        # already populated, nothing to do
        return

    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    apps = [
        evo_app.model(
            name, opt_args=dict(max_evaluations=25, limit_action="ignore")
        )
        for name in ("F81", "HKY85")
    ]
    # fit everything first so nothing is stored if a fit fails
    fitted = [app(aln) for app in apps]
    for model_result in fitted:
        self._model_results[model_result.name] = model_result
def test_model_collection_result(self):
    """round trip of model collection works

    Values are likelihood functions before serialisation, plain dicts
    straight after deserialisation, and likelihood functions again once
    ``deserialised_values`` has been invoked.
    """
    from cogent3.app import evo as evo_app
    from cogent3.evolve.parameter_controller import (
        AlignmentLikelihoodFunction,
    )

    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    opt_args = dict(max_evaluations=10, limit_action="ignore")
    f81 = evo_app.model("F81", split_codons=True, opt_args=opt_args)
    gtr = evo_app.model("GTR", split_codons=True, opt_args=opt_args)
    models = (f81, gtr)

    def assert_member_type(collection, expected_type):
        # every codon position of every model holds expected_type
        for app in models:
            for position in range(1, 4):
                self.assertIsInstance(
                    collection[app.name][position], expected_type
                )

    mc_result = model_collection_result(name="collection", source="blah")
    for app in models:
        mc_result[app.name] = app(aln)
    assert_member_type(mc_result, AlignmentLikelihoodFunction)

    got_obj = deserialise_object(mc_result.to_json())
    assert_member_type(got_obj, dict)

    # but after invoking deserialised_values
    got_obj.deserialised_values()
    assert_member_type(got_obj, AlignmentLikelihoodFunction)
def CigarParser(
    seqs, cigars, sliced=False, ref_seqname=None, start=None, end=None, moltype=DNA
):
    """return an alignment from raw sequences and cigar strings
    if sliced, will return an alignment correspondent to ref sequence start to end

    Parameters
    ----------
    seqs
        raw sequences as {seqname: seq}
    cigars
        corresponding cigar text as {seqname: cigar_text};
        cigars and seqs should have the same seqnames
    sliced
        if True, only the region of the reference given by start / end
        is returned
    ref_seqname
        name of the reference sequence, used when sliced is True
    start, end
        bounds passed to slice_cigar for the reference cigar, used when
        sliced is True
    moltype
        optional, defaults to DNA

    Returns
    -------
    the result of make_aligned_seqs on the aligned (and possibly sliced)
    sequences
    """
    data = {}
    if not sliced:
        for seqname, seq in seqs.items():
            data[seqname] = aligned_from_cigar(
                cigars[seqname], seq, moltype=moltype
            )
        return make_aligned_seqs(data)

    ref_aln_seq = aligned_from_cigar(
        cigars[ref_seqname], seqs[ref_seqname], moltype=moltype
    )
    # only the alignment span of the reference slice is needed here
    _, aln_loc = slice_cigar(cigars[ref_seqname], start, end, by_align=False)
    aln_start, aln_end = aln_loc[0], aln_loc[1]
    data[ref_seqname] = ref_aln_seq[aln_start:aln_end]

    for seqname, seq in seqs.items():
        if seqname == ref_seqname:
            continue
        m, seq_loc = slice_cigar(cigars[seqname], aln_start, aln_end)
        if seq_loc:
            if isinstance(seq, str):
                seq = moltype.make_seq(seq)
            data[seqname] = seq[seq_loc[0] : seq_loc[1]].gapped_by_map(m)
        else:
            # nothing of this sequence lies in the slice, so fill with gaps;
            # built with the requested moltype rather than hard-coded DNA
            data[seqname] = moltype.make_seq("-" * (aln_end - aln_start))
    return make_aligned_seqs(data)
def test_degap(self):
    """test stripping gaps from collections and alignments

    ``degap`` must return the ungapped sequences for both an alignment and
    an unaligned collection, and the collection keeps its DNA moltype.
    """
    gapped = {
        "seq1": "--ACGT--GT---",
        "seq2": "--ACGTA-GT---",
        "seq3": "--ACGTA-GT---",
    }
    expect = {"seq1": "ACGTGT", "seq2": "ACGTAGT", "seq3": "ACGTAGT"}

    aln = make_aligned_seqs(data=dict(gapped))
    self.assertEqual(aln.degap().to_dict(), expect)

    collection = make_unaligned_seqs(data=dict(gapped), moltype=DNA)
    degapped = collection.degap()
    self.assertEqual(degapped.to_dict(), expect)
    self.assertEqual(degapped.moltype, DNA)
def test_logdet_missing_states(self):
    """should calculate logdet measurement with missing states

    The reported distance must not be None, with and without the tk
    adjustment.
    """
    seq1 = "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"
    seq2 = "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTNTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"
    aln = make_aligned_seqs(data=[("seq1", seq1), ("seq2", seq2)], moltype=DNA)
    calc = LogDetPair(moltype=DNA, alignment=aln)
    for use_tk in (True, False):
        calc.run(use_tk_adjustment=use_tk, show_progress=False)
        dists = calc.get_pairwise_distances().to_dict()
        self.assertIsNotNone(list(dists.values())[0])
def test_model_result_total_length(self):
    """returns summed branch lengths

    ``result.tree`` must have the total length of the ENS-annotated tree,
    and must differ from the tree annotated with default lengths.
    """
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    opt_args = dict(max_evaluations=25, limit_action="ignore")
    result = evo_app.model("GN", opt_args=opt_args)(aln)

    ens_tree = result.lf.get_annotated_tree(length_as="ENS")
    assert_allclose(result.tree.total_length(), ens_tree.total_length())

    # it will be different to the standard length values
    standard_tree = result.lf.get_annotated_tree()
    got_length = result.tree.total_length()
    standard_length = standard_tree.total_length()
    assert_raises(AssertionError, assert_allclose, got_length, standard_length)
def test_composable_apps(self):
    """checks the ability of these two apps(fast_slow_dist and quick_tree) to communicate

    Composes the two apps, checks the attributes of the composed app,
    builds a tree from a loaded alignment and expects NotCompleted for the
    second, two-sequence alignment.
    """
    base_dir = os.path.abspath(__file__).split("test_app")[0]
    path = os.path.join(base_dir, "data/brca1_5.paml")
    aln1 = load_aligned_seqs(path, moltype=DNA)

    dist_app = dist.fast_slow_dist(fast_calc="hamming", moltype="dna")
    tree_builder = tree_app.quick_tree(drop_invalid=False)
    proc = dist_app + tree_builder

    expect_str = (
        "fast_slow_dist(type='distance', distance=None, moltype='dna', "
        "fast_calc='hamming', slow_calc=None)"
        " + "
        "quick_tree(type='tree', drop_invalid=False)"
    )
    self.assertEqual(str(proc), expect_str)
    self.assertIsInstance(proc, tree_app.quick_tree)
    self.assertEqual(proc._type, "tree")
    self.assertIsInstance(proc.input, dist.fast_slow_dist)
    self.assertIs(proc.output, None)
    self.assertIsInstance(proc._input_types, frozenset)
    self.assertIsInstance(proc._output_types, frozenset)
    self.assertIsInstance(proc._in, dist.fast_slow_dist)
    self.assertIs(proc._out, None)

    tree1 = proc(aln1)
    self.assertIsInstance(tree1, PhyloNode)
    self.assertIsNotNone(tree1.children)
    self.assertEqual(set(tree1.get_tip_names()), set(aln1.names))

    # tests when distances contain None
    seqs = {
        "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
    }
    aln2 = make_aligned_seqs(data=seqs, moltype=DNA)
    tree2 = proc(aln2)
    self.assertIsInstance(tree2, NotCompleted)
def load(self, data):
    """returns sequences

    Parameters
    ----------
    data
        file path or cogent3 sequence collection / alignment. Any other
        object is handed to make_aligned_seqs or make_unaligned_seqs,
        depending on ``self.aligned``.

    Returns
    -------
    The loaded sequences. They are degapped when "aligned" is not one of
    ``self._output_types``.
    """
    if isinstance(data, str):
        path = data
        with open_(path) as infile:
            records = dict(self._parser(infile))
        seqs = self.klass(data=records, moltype=self.moltype)
        # record the source path itself, not the parsed records
        seqs.info.path = path
    elif isinstance(data, SequenceCollection):
        # already a sequence collection, use it as is
        seqs = data
    elif self.aligned:
        seqs = make_aligned_seqs(data, moltype=self.moltype)
    else:
        seqs = make_unaligned_seqs(data, moltype=self.moltype)

    if not (self._output_types & {"aligned"}):
        seqs = seqs.degap()

    return seqs
def test_hyp_init_sequential(self):
    """uses preceding model to initialise function

    With ``sequential=True`` the lnL values must strictly increase from
    F81 to HKY85 to GTR; with ``sequential=False`` they must not.
    """

    def lnl_strictly_increases(result):
        return (
            result["F81"].lf.lnL
            < result["HKY85"].lf.lnL
            < result["GTR"].lf.lnL
        )

    opt_args = dict(max_evaluations=15, limit_action="ignore")
    f81 = evo_app.model("F81", opt_args=opt_args)
    hky85 = evo_app.model("HKY85", opt_args=opt_args)
    gtr = evo_app.model("GTR", opt_args=opt_args)

    # defaults to initialise model3 from model 2 from model1
    hyp = evo_app.hypothesis(f81, hky85, gtr, sequential=True)
    seqs = {
        "Human": "ATGCGGCTCGCGGAGGCCGCGCTCGCGGAG",
        "Mouse": "ATGCCCGGCGCCAAGGCAGCGCTGGCGGAG",
        "Opossum": "ATGCCAGTGAAAGTGGCGGCGGTGGCTGAG",
    }
    aln = make_aligned_seqs(data=seqs, moltype="dna")
    self.assertTrue(lnl_strictly_increases(hyp(aln)))

    # can be set to False, in which case all models start at defaults
    hyp = evo_app.hypothesis(f81, hky85, gtr, sequential=False)
    self.assertFalse(lnl_strictly_increases(hyp(aln)))