def test_make_aligned_seqs(self):
    """make_aligned_seqs builds alignments and honours its keyword arguments.

    Covers the default return type (ArrayAlignment), the ``moltype``,
    ``info`` and ``source`` arguments, and ``array_align=False`` producing
    an Alignment instead.
    """
    data = {"a": "AGGTT", "b": "AGAA-"}
    got = make_aligned_seqs(data)
    self.assertIsInstance(got, ArrayAlignment)
    self.assertEqual(got.to_dict(), data)
    self.assertEqual(got.info["source"], "unknown")

    # moltype arg works
    got = make_aligned_seqs(data, moltype="dna")
    self.assertEqual(got.moltype.label, "dna")

    # info works
    got = make_aligned_seqs(data, info=dict(a=2))
    self.assertEqual(got.info["a"], 2)

    # a non-dict info must be rejected by the function under test; this
    # previously called make_unaligned_seqs, leaving make_aligned_seqs
    # untested on this path
    with self.assertRaises(AssertionError):
        _ = make_aligned_seqs(data, info=2)

    # source works
    got = make_aligned_seqs(data, source="somewhere")
    self.assertEqual(got.info["source"], "somewhere")

    # array_align works
    got = make_aligned_seqs(data, array_align=False)
    self.assertIsInstance(got, Alignment)
    self.assertEqual(got.to_dict(), data)
    self.assertEqual(got.info["source"], "unknown")
def test_roundtrip_seqcoll(self):
    """a SequenceCollection survives a to_json / deserialise round trip"""
    original = make_unaligned_seqs(data={"A": "TTGT", "B": "GGCT"}, moltype="dna")
    restored = deserialise_object(original.to_json())

    # compare via the reverse complement so the restored object has to be
    # a working collection, not just hold the same strings
    restored_rc = restored.rc().to_dict()
    original_rc = original.rc().to_dict()
    self.assertEqual(restored_rc, original_rc)
    self.assertIsInstance(restored, alignment.SequenceCollection)
def test_replace_seqs(self):
    """replace_seqs carries the protein alignment's gaps over to codon seqs"""
    protein_data = {
        "FlyingFox": "C-TNAH",
        "DogFaced": "CGTNT-",
        "FreeTaile": "-GTDTH",
        "LittleBro": "C-TD-H",
        "TombBat": "C--STH",
    }
    codon_data = {
        "TombBat": "TGTAGTACTCAT",
        "FreeTaile": "GGCACAGATACTCAT",
        "FlyingFox": "TGTACAAATGCTCAT",
        "LittleBro": "TGTACAGATCAT",
        "DogFaced": "TGTGGCACAAATACT",
    }
    # each protein gap is expected to appear as a 3-base gap
    expected = {
        "FlyingFox": "TGT---ACAAATGCTCAT",
        "DogFaced": "TGTGGCACAAATACT---",
        "FreeTaile": "---GGCACAGATACTCAT",
        "LittleBro": "TGT---ACAGAT---CAT",
        "TombBat": "TGT------AGTACTCAT",
    }

    protein_aln = make_aligned_seqs(moltype=PROTEIN, data=protein_data)
    codon_seqs = make_unaligned_seqs(moltype=DNA, data=codon_data)
    observed = protein_aln.replace_seqs(codon_seqs).to_dict()

    for taxon, expected_sequence in expected.items():
        self.assertEqual(observed[taxon], expected_sequence)
def test_progress_with_guide_tree(self):
    """progressive_align accepts a guide tree as a newick string or a tree"""
    # the same guide tree, first as a string and then as a tree object
    for guide in (self.treestring, make_tree(treestring=self.treestring)):
        app = align_app.progressive_align(model="nucleotide", guide_tree=guide)
        result = app(self.seqs)
        self.assertEqual(len(result), 42)

    # even if it has underscores in name
    underscored = "(Bandicoot:0.4,FlyingFox:0.05,(Rhesus_macaque:0.06,Human:0.0):0.04);"
    app = align_app.progressive_align(model="nucleotide", guide_tree=underscored)
    renamed = self.seqs.to_dict()
    renamed["Rhesus macaque"] = renamed.pop("Rhesus")
    result = app(make_unaligned_seqs(renamed))
    self.assertEqual(len(result), 42)

    # guide tree with no lengths raises value error
    with self.assertRaises(ValueError):
        _ = align_app.progressive_align(
            model="nucleotide",
            guide_tree="(Bandicoot,FlyingFox,(Rhesus_macaque,Human));",
        )
def test_dotplot_seqcoll(self):
    """dotplot sequence collection, gaps are removed"""
    seqs = make_unaligned_seqs(
        {"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}, moltype="dna"
    )
    dp = seqs.dotplot("seq1", "seq3")
    # identity check against None: avoids going through the object's own
    # != implementation and gives a clearer failure message
    self.assertIsNotNone(dp._aligned_coords)
    self.assertEqual(len(dp.seq1), 4)
    # seq3 is "CCG-"; with the gap removed only 3 positions remain
    self.assertEqual(len(dp.seq2), 3)
def test_roundtrip_annotated_seqcoll(self):
    """annotated sequences survive a SequenceCollection to_json round trip"""
    seqs = make_unaligned_seqs(data={"A": "TTGTA", "B": "GGCT"}, moltype="dna")
    feature = seqs.named_seqs["A"].add_feature("gene", "n1", [(2, 5)])

    serialised = seqs.to_json()
    expect = str(feature.get_slice())

    restored = deserialise_object(serialised)
    restored_feature = restored.named_seqs["A"].annotations[0]
    self.assertEqual(str(restored_feature.get_slice()), expect)
def test_make_unaligned_seqs(self):
    """make_unaligned_seqs builds a SequenceCollection and honours its kwargs"""
    raw = {"a": "AGGTT", "b": "AG"}

    default = make_unaligned_seqs(raw)
    self.assertIsInstance(default, SequenceCollection)
    self.assertEqual(default.to_dict(), raw)
    self.assertEqual(default.info["source"], "unknown")

    # moltype arg works
    as_dna = make_unaligned_seqs(raw, moltype="dna")
    self.assertEqual(as_dna.moltype.label, "dna")

    # info works
    with_info = make_unaligned_seqs(raw, info={"a": 2})
    self.assertEqual(with_info.info["a"], 2)
    # ... and a non-dict info is rejected
    with self.assertRaises(AssertionError):
        _ = make_unaligned_seqs(raw, info=2)

    # source works
    with_source = make_unaligned_seqs(raw, source="somewhere")
    self.assertEqual(with_source.info["source"], "somewhere")
def test_dotplot_title(self):
    """an empty string is accepted as the dotplot title"""
    raw = {"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}
    collection = make_unaligned_seqs(raw, moltype="dna")
    plot = collection.dotplot("seq1", "seq3", title="")
    self.assertEqual(plot.figure.layout.title, "")
def test_progressive_fails(self):
    """should return NotCompletedResult along with message"""
    # Bandicoot has an in-frame stop codon
    seqs = make_unaligned_seqs(
        data={"Human": "GCCTCA", "Rhesus": "GCCAGCTCA", "Bandicoot": "TGATCATTA"},
        moltype="dna",
    )
    aligner = align_app.progressive_align(model="codon")
    got = aligner(seqs)
    # assertTrue(type(got), NotCompleted) treated NotCompleted as the failure
    # message and could never fail; check the type explicitly instead
    self.assertIsInstance(got, NotCompleted)
def test_trim_stop_codons(self):
    """the trim_stop_codons app trims stops using the chosen genetic code"""
    raw = {"seq1": "AAATTTCCC", "seq2": "AAATTTTAA"}
    trimmed = {"seq1": "AAATTTCCC", "seq2": "AAATTT"}

    # the default and an explicit gc=1 (standard code) give the same result
    for kwargs in ({}, {"gc": 1}):
        trimmer = sample.trim_stop_codons(**kwargs)
        seqs = make_unaligned_seqs(data=raw, moltype="dna")
        self.assertEqual(trimmer(seqs).to_dict(), trimmed)

    # on an alignment the stop codon is replaced by gaps, keeping the length
    trimmer = sample.trim_stop_codons(gc=1)  # standard code
    aln = make_aligned_seqs(data=raw, moltype="dna")
    self.assertEqual(
        trimmer(aln).to_dict(), {"seq1": "AAATTTCCC", "seq2": "AAATTT---"}
    )

    # different genetic code
    trimmer = sample.trim_stop_codons(gc=2)  # mt code
    seqs = make_unaligned_seqs(
        data={"seq1": "AAATTTCCC", "seq2": "AAATTTAGA"}, moltype="dna"
    )
    self.assertEqual(trimmer(seqs).to_dict(), trimmed)
def test_translate_seqcoll(self):
    """translate_seqs translates every sequence in a collection"""
    collection = make_unaligned_seqs({"a": "ATGAGG", "b": "ATGTAA"})

    # trim terminal stops
    translated = translate_seqs()(collection)
    self.assertEqual(translated.to_dict(), {"a": "MR", "b": "M"})
    self.assertEqual(translated.moltype.label, "protein")

    # don't trim terminal stops, returns NotCompleted
    untrimmed = translate_seqs(trim_terminal_stop=False)(collection)
    self.assertIsInstance(untrimmed, NotCompleted)
def test_dotplot_missing(self):
    """dotplot raises ValueError when a requested name is not present"""
    collection = make_unaligned_seqs(
        {"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}, moltype="dna"
    )
    # unknown second name, unknown first name, both unknown
    bad_pairs = (("seq1", "seq5"), ("seq5", "seq1"), ("seq5", "seq6"))
    for name1, name2 in bad_pairs:
        with self.assertRaises(ValueError):
            _ = collection.dotplot(name1, name2)
def test_select_translatable(self):
    """correctly get translatable seqs"""
    # "rc" is the reverse complement of "a"
    data = {
        "a": "AATATAAATGCCAGCTCATTACAGCATGAGAACA" "GCAGTTTATTACTTCATAAAGTCATA",
        "rc": "TATGACTTTATGAAGTAATAAACTGCTGTTCTCA" "TGCTGTAATGAGCTGGCATTTATATT",
    }
    seqs = make_unaligned_seqs(data=data, moltype=DNA)
    # without allow_rc only "a" is expected back
    trans = select_translatable(allow_rc=False)
    tr = trans(seqs)
    ex = data.copy()
    ex.pop("rc")
    self.assertEqual(tr.to_dict(), ex)
    # with allow_rc, "rc" is expected back in the same orientation as "a"
    trans = select_translatable(allow_rc=True)
    tr = trans(seqs)
    ex = data.copy()
    ex["rc"] = data["a"]
    self.assertEqual(tr.to_dict(), ex)

    # if seqs not translatable returns NotCompletedResult
    data = dict(a="TAATTGATTAA", b="GCAGTTTATTA")
    seqs = make_unaligned_seqs(data=data, moltype=DNA)
    # NOTE(review): `got` is the select_translatable app itself; it is never
    # called on `seqs`. assertTrue also treats its second argument as the
    # failure message, so the check below passes unconditionally.
    # Presumably the intent was assertIsInstance(app(seqs), NotCompleted);
    # confirm these sequences are actually rejected before tightening it.
    got = select_translatable(allow_rc=False)
    self.assertTrue(type(got), NotCompleted)
def test_minlength(self):
    """correctly identifies data with minimal length"""
    aln = make_aligned_seqs(data=[("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")])

    # if using subtract_degen, fails if incorrect moltype
    ml = sample.min_length(9, subtract_degen=True)
    got = ml(aln)
    self.assertIsInstance(got, NotCompleted)
    self.assertEqual(got.type, "ERROR")

    # but works if subtract_degen is False
    ml = sample.min_length(9, subtract_degen=False)
    aln = ml(aln)
    self.assertEqual(len(aln), 12)
    # or if moltype provided
    ml = sample.min_length(9, subtract_degen=True, moltype="dna")
    aln = ml(aln)
    self.assertEqual(len(aln), 12)

    alns = [
        make_aligned_seqs(
            data=[("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")], moltype=DNA
        ),
        make_aligned_seqs(data=[("a", "GGAAGCGT"), ("b", "GCTTT-GT")], moltype=DNA),
    ]
    ml = sample.min_length(9)
    # results that are falsy are filtered out, leaving only the passing ones
    got = [result.to_dict() for result in map(ml, alns) if result]
    expected = [dict((("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")))]
    self.assertEqual(got, expected)

    # returns NotCompletedResult if nothing satisfies
    got = ml(alns[1])
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(type(got) == ...), and matches the check used above
    self.assertIsInstance(got, sample.NotCompleted)

    alns = [
        make_unaligned_seqs(data=[("a", "GGAAGCGT"), ("b", "GCTTNGT")], moltype=DNA)
    ]
    ml = sample.min_length(6)
    got = [result.to_dict() for result in map(ml, alns) if result]
    expected = [dict((("a", "GGAAGCGT"), ("b", "GCTTNGT")))]
    self.assertEqual(got, expected)

    ml = sample.min_length(7)
    got = [result.to_dict() for result in map(ml, alns) if result]
    expected = []
    self.assertEqual(got, expected)
def setUp(self):
    """build an alignment and a collection holding the same four sequences"""
    raw = {
        "a": "GTACGTACGATC",
        "b": "GTACGTACGTAC",
        "c": "GTACGTACGTTC",
        "e": "GTACGTACTGGT",
    }
    # each constructor gets its own copy of the data
    self.al = make_aligned_seqs(data=dict(raw))
    self.collection = make_unaligned_seqs(data=dict(raw))
def test_DnaRna_interconversion(self):
    """to_rna / to_dna convert both collections and alignments"""
    dna_data = {
        "seq1": "--ACGT--GT---",
        "seq2": "--ACGTA-GT---",
        "seq3": "--ACGTA-GT---",
    }
    rna_data = {
        "seq1": "--ACGU--GU---",
        "seq2": "--ACGUA-GU---",
        "seq3": "--ACGUA-GU---",
    }

    # unaligned collections
    dna_coll = make_unaligned_seqs(data=dna_data, moltype=DNA)
    rna_coll = make_unaligned_seqs(data=rna_data, moltype=RNA)
    self.assertEqual(rna_coll.to_dna().to_dict(), dna_data)
    self.assertEqual(dna_coll.to_rna().to_dict(), rna_data)

    # alignments
    dna_aln = make_aligned_seqs(data=dna_data, moltype=DNA)
    rna_aln = make_aligned_seqs(data=rna_data, moltype=RNA)
    converted_to_rna = dna_aln.to_rna()
    converted_to_dna = rna_aln.to_dna()
    self.assertEqual(converted_to_rna.to_dict(), rna_data)
    self.assertEqual(converted_to_dna.to_dict(), dna_data)
def test_progressive_est_tree(self):
    """exercise progressive alignment without a guide tree"""
    raw = {
        "A": "TGTGGCACAAATGCTCATGCCAGCTCTTTACAGCATGAGAACA",
        "B": "TGTGGCACAGATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTT",
        "C": "TGTGGCACAAGTACTCATGCCAGCTCAGTACAGCATGAGAACAGCAGTTT",
    }
    unaligned = make_unaligned_seqs(data=raw)

    # only the alignment is checked; the returned tree is ignored
    result = cogent3.align.progressive.TreeAlign(
        HKY85(), unaligned, show_progress=False, param_vals={"kappa": 4.0}
    )
    aln, _ = result

    # "A" is the shortest sequence and is expected to be padded at its end
    expect = {
        "A": "TGTGGCACAAATGCTCATGCCAGCTCTTTACAGCATGAGAACA-------",
        "C": "TGTGGCACAAGTACTCATGCCAGCTCAGTACAGCATGAGAACAGCAGTTT",
        "B": "TGTGGCACAGATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTT",
    }
    self.assertEqual(aln.to_dict(), expect)
def test_degap(self):
    """degap strips gaps from both alignments and collections"""
    gapped = {
        "seq1": "--ACGT--GT---",
        "seq2": "--ACGTA-GT---",
        "seq3": "--ACGTA-GT---",
    }
    expect = {"seq1": "ACGTGT", "seq2": "ACGTAGT", "seq3": "ACGTAGT"}

    # alignment
    from_aln = make_aligned_seqs(data=dict(gapped)).degap()
    self.assertEqual(from_aln.to_dict(), expect)

    # collection; the moltype must be carried over to the result
    from_coll = make_unaligned_seqs(data=dict(gapped), moltype=DNA).degap()
    self.assertEqual(from_coll.to_dict(), expect)
    self.assertEqual(from_coll.moltype, DNA)
def test_omit_duplicated(self):
    """omit_duplicated drops every member of a set of duplicated sequences"""
    # strict omit_duplicated
    data = {
        "a": "ACGT",
        "b": "ACG-",  # identical excepting -
        "c": "ACGN",  # non-strict matches above
        "d": "ACGG",
        "e": "ACGG",
        "k": "ACGG",  # strict identical
        "f": "RAAA",
        "g": "YAAA",  # non-strict identical
        "h": "GGGG",  # unique!
    }
    seqs = make_unaligned_seqs(data=data, moltype=DNA)

    # mask_degen = True : [{'a', 'c', 'b'}, {'k', 'd', 'e'},
    # {'g', 'f'}] are dupe sets. Only 'h' unique
    masked = sample.omit_duplicated(mask_degen=True, choose=None, moltype="dna")
    self.assertEqual(masked(seqs).to_dict(), {"h": "GGGG"})

    # mask_degen = False : [{'a', 'b'}, {'k', 'd', 'e'}]
    # c, f, g, h
    unmasked = sample.omit_duplicated(mask_degen=False, choose=None, moltype="dna")
    expect = {name: data[name] for name in "abcfgh"}
    self.assertEqual(unmasked(seqs).to_dict(), expect)
def load(self, data):
    """returns sequences

    Parameters
    ----------
    data
        file path or cogent3 sequence collection / alignment

    Returns
    -------
    The loaded sequences. Unless "aligned" is among this loader's output
    types, the result is degapped before being returned.

    Notes
    -----
    A string is treated as a file path and parsed with ``self._parser``;
    the originating path is recorded on ``seqs.info.path``. A
    SequenceCollection is used as is. Any other input is passed to
    make_aligned_seqs or make_unaligned_seqs, depending on ``self.aligned``.
    """
    if isinstance(data, str):
        # keep the path under its own name: it was previously overwritten
        # by the parsed records, so info.path ended up holding the dict
        path = data
        with open_(path) as infile:
            records = dict(self._parser(infile))
        seqs = self.klass(data=records, moltype=self.moltype)
        seqs.info.path = path
    elif isinstance(data, SequenceCollection):
        # previously this case left `seqs` unbound and raised
        # UnboundLocalError further down
        seqs = data
    elif self.aligned:
        seqs = make_aligned_seqs(data, moltype=self.moltype)
    else:
        seqs = make_unaligned_seqs(data, moltype=self.moltype)

    if not (self._output_types & {"aligned"}):
        seqs = seqs.degap()

    return seqs
class FastSlowDistTests(TestCase):
    """Tests for the fast_slow_dist distance app."""

    seqs1 = make_unaligned_seqs(_seqs1, moltype=DNA)
    seqs2 = make_unaligned_seqs(_seqs2, moltype=DNA)
    seqs3 = make_unaligned_seqs(_seqs3, moltype=DNA)
    seqs4 = make_unaligned_seqs(_seqs4, moltype=DNA)
    seqs5 = make_unaligned_seqs(_seqs5, moltype=PROTEIN)

    def test_init(self):
        """fast_slow_dist sets up its calculators from the constructor args"""
        app = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        self.assertIsInstance(app.fast_calc, HammingPair)
        self.assertIsNone(app._sm)

        app = dist_app.fast_slow_dist(distance="TN93")
        self.assertIsInstance(app.fast_calc, TN93Pair)
        self.assertEqual(app._sm.name, "TN93")

        app = dist_app.fast_slow_dist(distance="GTR")
        self.assertEqual(app._sm.name, "GTR")

        app = dist_app.fast_slow_dist(slow_calc="TN93")
        self.assertEqual(app._sm.name, "TN93")
        self.assertIsNone(app.fast_calc)

        # argument combinations that must be rejected
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(
                distance="TN93", fast_calc="TN93", slow_calc="TN93"
            )

        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(fast_calc="GTR")

        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(slow_calc="hamming")

    def test_compatible_parameters(self):
        """valid argument combinations construct, invalid ones raise"""
        # these must all construct without raising
        dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        dist_app.fast_slow_dist(fast_calc="TN93")
        dist_app.fast_slow_dist(slow_calc="GTR")
        dist_app.fast_slow_dist(fast_calc="TN93")

        # fails for paralinear or hamming if no moltype
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(slow_calc="hamming")
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(slow_calc="paralinear")

        # fails for hamming as slow_calc
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(slow_calc="hamming", moltype="dna")
        with self.assertRaises(ValueError):
            dist_app.fast_slow_dist(fast_calc="GTR")

    def test_composable_apps(self):
        """every composable app can be composed with fast_slow_dist"""
        dist_calc = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        for app in _get_all_composable_apps():
            # Compose two composable applications, there should not be exceptions.
            composed = app + dist_calc
            self.assertIsInstance(composed, dist_app.fast_slow_dist)
            self.assertEqual(composed._type, "distance")
            self.assertIs(composed.input, app)
            self.assertIs(composed.output, None)
            self.assertIsInstance(composed._input_types, frozenset)
            self.assertIsInstance(composed._output_types, frozenset)
            self.assertIs(composed._in, app)
            self.assertIs(composed._out, None)
            # undo the link so the next iteration starts clean
            app.disconnect()
            dist_calc.disconnect()

    def test_est_dist_pair_slow(self):
        """tests the distance between seq pairs in aln"""

        def slow_dists(aln, model):
            # pairwise distances for aln under the named slow_calc model
            return dist_app.fast_slow_dist(slow_calc=model)(aln).to_dict()

        human_mouse = ("Human", "Mouse")
        mouse_human = ("Mouse", "Human")
        human_opossum = ("Human", "Opossum")

        # default reference: distances are symmetric and non-negative
        aln = align.align_to_ref()(self.seqs3)
        for model in ("GTR", "TN93"):
            got = slow_dists(aln, model)
            assert_allclose(got[human_mouse], got[mouse_human])
            self.assertGreaterEqual(got[mouse_human], 0)

        # Human as reference
        aln = align.align_to_ref(ref_seq="Human")(self.seqs3)
        got = slow_dists(aln, "GTR")
        assert_allclose(got[human_mouse], got[mouse_human])
        got = slow_dists(aln, "TN93")
        assert_allclose(got[human_mouse], got[mouse_human])
        self.assertGreaterEqual(got[mouse_human], 0)

        # Mouse as reference
        aln = align.align_to_ref(ref_seq="Mouse")(self.seqs3)
        for model in ("GTR", "TN93"):
            got = slow_dists(aln, model)
            self.assertGreaterEqual(got[mouse_human], 0)

        # second data set, with each choice of reference
        for ref_kwargs in ({}, {"ref_seq": "Human"}, {"ref_seq": "Opossum"}):
            aln = align.align_to_ref(**ref_kwargs)(self.seqs4)
            for model in ("GTR", "TN93"):
                got = slow_dists(aln, model)
                self.assertGreaterEqual(got[human_opossum], 0)

        # now as a process
        proc = align.align_to_ref() + dist_app.fast_slow_dist(
            fast_calc="hamming", moltype="dna"
        )
        got = proc(self.seqs1)
        self.assertEqual(got[("Human", "Rhesus")], 1)

        # protein progressive alignment is only run here, not asserted on
        treestring = "(Human:0.2,Bandicoot:0.2)"
        aligner = align.progressive_align(model="WG01", guide_tree=treestring)
        _ = aligner(self.seqs5)

    def test_composes_with_write_tabular(self):
        """fast_slow_dist can be composed with the tabular writer"""
        with TemporaryDirectory(dir=".") as dirname:
            writer = io.write_tabular(dirname)
            dist_calc = dist_app.fast_slow_dist(
                distance="hamming", moltype="protein"
            )
            _ = dist_calc + writer

    def test_functions_as_composable(self):
        """a loader + fast_slow_dist + writer pipeline writes its output"""
        from pathlib import Path

        loader = io.load_aligned(moltype="dna", format="paml")
        dist_calc = dist_app.fast_slow_dist("hamming", moltype="dna")
        with TemporaryDirectory(dir=".") as dirname:
            outdir = Path(dirname)
            writer = io.write_tabular(outdir)
            proc = loader + dist_calc + writer
            _ = proc("data/brca1_5.250.paml")
            # the pipeline is expected to have written this tsv file
            output = outdir / "brca1_5.250.tsv"
            self.assertTrue(output.exists())
def test_dotplot_single(self):
    """dotplot on a one-sequence collection plots the sequence against itself"""
    collection = make_unaligned_seqs({"seq1": "ACGG"}, moltype="dna")
    plot = collection.dotplot()
    self.assertEqual(plot.seq1, plot.seq2)
def test_trim_stop_codons(self):
    """trim_stop_codons on collections and alignments, incl. partial codons"""
    coll = make_unaligned_seqs(
        data={"seq1": "ACGTAA", "seq2": "ACGACG", "seq3": "ACGCGT"}, moltype=DNA
    )
    trimmed = coll.trim_stop_codons().to_dict()
    self.assertEqual(trimmed["seq1"], "ACG")  # note: not 'acg---'
    self.assertEqual(trimmed["seq2"], "ACGACG")

    # aligned
    aln = make_aligned_seqs(
        data={"seq1": "ACGTAA", "seq2": "ACGTGA", "seq3": "ACGTAA"}, moltype=DNA
    )
    expect = {"seq1": "ACG", "seq2": "ACG", "seq3": "ACG"}  # note: not 'acg---'
    self.assertEqual(aln.trim_stop_codons().to_dict(), expect)

    aln = make_aligned_seqs(
        data={"seq1": "ACGAAA", "seq2": "ACGTGA", "seq3": "ACGTAA"}, moltype=DNA
    )
    expect = {"seq1": "ACGAAA", "seq2": "ACG---", "seq3": "ACG---"}
    self.assertEqual(aln.trim_stop_codons().to_dict(), expect)

    # for case where a sequence length is not divisible by 3
    coll = make_unaligned_seqs(
        data={"seq1": "ACGTAA", "seq2": "ACGAC"}, moltype=DNA
    )
    # fail
    with self.assertRaises(ValueError):
        coll.trim_stop_codons()
    # unless explicitly over-ridden with allow_partial
    partial = coll.trim_stop_codons(allow_partial=True)
    self.assertEqual(partial.to_dict(), {"seq1": "ACG", "seq2": "ACGAC"})

    # should work for alignments too
    aln = make_aligned_seqs(
        data={"seq1": "ACGTAA---", "seq2": "ACGAC----", "seq3": "ACGCAATTT"},
        moltype=DNA,
    )
    # fail
    with self.assertRaises(ValueError):
        aln.trim_stop_codons()
    # unless explicitly over-ridden with allow_partial
    expect = {"seq1": "ACG------", "seq2": "ACGAC----", "seq3": "ACGCAATTT"}
    self.assertEqual(aln.trim_stop_codons(allow_partial=True).to_dict(), expect)

    # mixed lengths
    aln = make_aligned_seqs(
        data={"seq1": "ACGTAA---", "seq2": "ACGAC----", "seq3": "ACGCAATGA"},
        moltype=DNA,
    )
    expect = {"seq1": "ACG---", "seq2": "ACGAC-", "seq3": "ACGCAA"}
    self.assertEqual(aln.trim_stop_codons(allow_partial=True).to_dict(), expect)

    # longest seq not divisible by 3
    aln = make_aligned_seqs(
        data={"seq1": "ACGTAA--", "seq2": "ACGAC---", "seq3": "ACGC-ATG"},
        moltype=DNA,
    )
    expect = {"seq1": "ACG-----", "seq2": "ACGAC---", "seq3": "ACGC-ATG"}
    self.assertEqual(aln.trim_stop_codons(allow_partial=True).to_dict(), expect)
class RefalignmentTests(TestCase):
    """Tests for the align_to_ref app and the gap helper functions.

    The helpers exercised are _merged_gaps, _gap_union, _gap_difference,
    _combined_refseq_gaps, _gaps_for_injection and pairwise_to_multiple.
    """

    # shared input for the align_to_ref tests
    seqs = make_unaligned_seqs(_seqs, moltype=DNA)

    def test_align_to_ref(self):
        """correctly aligns to a reference"""
        aligner = align_app.align_to_ref(ref_seq="Human")
        aln = aligner(self.seqs)
        expect = {
            "Bandicoot": "---NACTCATTAATGCTTGAAACCAGCAGTTTATTGTCCAAC",
            "FlyingFox": "GCCAGCTCTTTACAGCATGAGAACAG---TTTATTATACACT",
            "Human": "GCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT",
            "Rhesus": "GCCAGCTCATTACAGCATGAGAAC---AGTTTGTTACTCACT",
        }
        self.assertEqual(aln.to_dict(), expect)

    def test_align_to_ref_generic_moltype(self):
        """tests when the moltype is generic"""
        test_moltypes = [
            "text", "rna", "protein", "protein_with_stop", "bytes", "ab"
        ]
        for test_moltype in test_moltypes:
            aligner = align_app.align_to_ref(moltype=test_moltype)
            self.assertEqual(aligner._moltype.label, test_moltype)
            # the scoring matrix is expected to be the generic one built
            # for this moltype
            self.assertEqual(
                aligner._kwargs["S"],
                make_generic_scoring_dict(10, get_moltype(test_moltype)),
            )

    def test_align_to_ref_result_has_moltype(self):
        """aligned object has correct moltype"""
        aligner = align_app.align_to_ref(moltype="dna")
        got = aligner(self.seqs)
        self.assertEqual(got.moltype.label, "dna")

    # NOTE(review): a second method named test_merged_gaps is defined further
    # down in this class. The later definition replaces this one, so this
    # body never runs. It also compares the merged result against a list;
    # confirm what _merged_gaps returns before renaming and re-enabling it.
    def test_merged_gaps(self):
        """correctly merges gaps"""
        a = dict([(2, 3), (4, 9)])
        b = dict([(2, 6), (8, 5)])
        # omitting one just returns the other
        self.assertIs(_merged_gaps(a, {}), a)
        self.assertIs(_merged_gaps({}, b), b)
        got = _merged_gaps(a, b)
        self.assertEqual(got, [(2, 6), (4, 9), (8, 5)])

    def test_aln_to_ref_known(self):
        """correctly recapitulates known case"""
        orig = make_aligned_seqs(
            {
                "Ref": "CAG---GAGAACAGAAACCCAT--TACTCACT",
                "Qu1": "CAG---GAGAACAG---CCCGTGTTACTCACT",
                "Qu2": "CAGCATGAGAACAGAAACCCGT--TA---ACT",
                "Qu3": "CAGCATGAGAACAGAAACCCGT----CTCACT",
                "Qu4": "CAGCATGAGAACAGAAACCCGTGTTACTCACT",
                "Qu5": "CAG---GAGAACAG---CCCAT--TACTCACT",
                "Qu6": "CAG---GA-AACAG---CCCAT--TACTCACT",
                "Qu7": "CAG---GA--ACAGA--CCCGT--TA---ACT",
            },
            moltype="dna",
        )
        expect = orig.to_dict()
        aligner = align_app.align_to_ref(ref_seq="Ref")
        # re-aligning the degapped sequences should reproduce the original
        aln = aligner(orig.degap())
        self.assertEqual(aln.to_dict(), expect)

    def test_gap_union(self):
        """correctly identifies the union of all gaps"""
        # fails if not all sequences same
        seq = DNA.make_seq("AACCCGTT")
        all_gaps = dict([(0, 3), (2, 1), (5, 3), (6, 3)])
        # NOTE(review): final_seq is not used again in this test
        final_seq = make_aligned(all_gaps, seq)
        gap_sets = [
            dict([(5, 1), (6, 3)]),
            dict([(2, 1), (5, 3)]),
            dict([(2, 1), (5, 1), (6, 2)]),
            dict([(0, 3)]),
        ]
        seqs = [make_aligned(gaps, seq) for gaps in gap_sets]
        got = _gap_union(seqs)
        self.assertEqual(got, dict(all_gaps))

        # must all be Aligned instances
        with self.assertRaises(TypeError):
            _gap_union(seqs + ["GGGGGGGG"])

        # must all have the same name
        with self.assertRaises(ValueError):
            _gap_union(seqs + [make_aligned({}, seq, name="blah")])

    def test_gap_difference(self):
        """correctly identifies the difference in gaps"""
        seq = DNA.make_seq("AACCCGTT")
        # NOTE(review): all_gaps is not used again in this test
        all_gaps = dict([(0, 3), (2, 1), (5, 3), (6, 3)])
        gap_sets = [
            dict([(5, 1), (6, 3)]),
            dict([(2, 1), (5, 3)]),
            dict([(2, 1), (5, 1), (6, 2)]),
            dict([(0, 3)]),
        ]
        seqs = [make_aligned(gaps, seq) for gaps in gap_sets]
        union = _gap_union(seqs)
        # one (plain, overlap) expectation per entry of gap_sets
        expects = [
            [dict([(0, 3), (2, 1)]), dict([(5, 2)])],
            [dict([(0, 3), (6, 3)]), {}],
            [dict([(0, 3)]), dict([(5, 2), (6, 1)])],
            [dict([(2, 1), (5, 3), (6, 3)]), {}],
        ]
        for seq, (plain, overlap) in zip(seqs, expects):
            seq_gaps = dict(seq.map.get_gap_coordinates())
            got_plain, got_overlap = _gap_difference(seq_gaps, union)
            self.assertEqual(got_plain, dict(plain))
            self.assertEqual(got_overlap, dict(overlap))

    # NOTE(review): this is the second test_merged_gaps in the class; it
    # replaces the earlier definition of the same name.
    def test_merged_gaps(self):
        """correctly handles gap values"""
        a_gaps = {0: 2}
        b_gaps = {2: 2}
        self.assertEqual(_merged_gaps(a_gaps, {}), a_gaps)
        self.assertEqual(_merged_gaps({}, b_gaps), b_gaps)

    def test_combined_refseq_gaps(self):
        """_combined_refseq_gaps gives the expected gaps for each subset"""
        union = dict([(0, 3), (2, 1), (5, 3), (6, 3)])
        gap_sets = [
            [(5, 1), (6, 3)],
            [(2, 1), (5, 3)],
            [(2, 1), (5, 1), (6, 2)],
            [(0, 3)],
        ]
        # for subset gaps, their alignment position is the
        # offset + their position + their gap length
        expects = [
            dict([(6, 2), (0, 3), (2, 1)]),
            dict([(0, 3), (10, 3)]),
            dict([(0, 3), (5 + 1 + 1, 2), (6 + 2 + 2, 1)]),
            dict([(2 + 3, 1), (5 + 3, 3), (6 + 3, 3)]),
        ]
        for i, gap_set in enumerate(gap_sets):
            got = _combined_refseq_gaps(dict(gap_set), union)
            self.assertEqual(got, expects[i])

        # if union gaps equals ref gaps
        got = _combined_refseq_gaps({2: 2}, {2: 2})
        self.assertEqual(got, {})

    def test_gaps_for_injection(self):
        """_gaps_for_injection places reference gaps among the other seq's gaps"""
        # for gaps before any otherseq gaps, alignment coord is otherseq coord
        oseq_gaps = {2: 1, 6: 2}
        rseq_gaps = {0: 3}
        expect = {0: 3, 2: 1, 6: 2}
        seqlen = 50
        got = _gaps_for_injection(oseq_gaps, rseq_gaps, seqlen)
        self.assertEqual(got, expect)
        # for gaps after otherseq gaps seq coord is align coord minus gap
        # length totals
        got = _gaps_for_injection(oseq_gaps, {4: 3}, seqlen)
        expect = {2: 1, 3: 3, 6: 2}
        self.assertEqual(got, expect)
        got = _gaps_for_injection(oseq_gaps, {11: 3}, seqlen)
        expect = {2: 1, 6: 2, 8: 3}
        self.assertEqual(got, expect)
        # gaps beyond sequence length added to end of sequence
        got = _gaps_for_injection({2: 1, 6: 2}, {11: 3, 8: 3}, 7)
        expect = {2: 1, 6: 2, 7: 6}
        self.assertEqual(got, expect)

    def test_pairwise_to_multiple(self):
        """the standalone function constructs a multiple alignment"""
        expect = {
            "Ref": "CAG---GAGAACAGAAACCCAT--TACTCACT",
            "Qu1": "CAG---GAGAACAG---CCCGTGTTACTCACT",
            "Qu2": "CAGCATGAGAACAGAAACCCGT--TA---ACT",
            "Qu3": "CAGCATGAGAACAGAAACCCGT----CTCACT",
            "Qu7": "CAG---GA--ACAGA--CCCGT--TA---ACT",
            "Qu4": "CAGCATGAGAACAGAAACCCGTGTTACTCACT",
            "Qu5": "CAG---GAGAACAG---CCCAT--TACTCACT",
            "Qu6": "CAG---GA-AACAG---CCCAT--TACTCACT",
        }
        aln = make_aligned_seqs(expect, moltype="dna").omit_gap_pos()
        expect = aln.to_dict()
        for refseq_name in ["Qu3"]:
            refseq, pwise = make_pairwise(expect, refseq_name)
            got = pairwise_to_multiple(pwise, ref_seq=refseq, moltype=refseq.moltype)
            self.assertEqual(len(got), len(aln))
            orig = dict(pwise)
            # split the multiple alignment back into pairwise alignments
            _, pwise = make_pairwise(got.to_dict(), refseq_name)
            got = dict(pwise)
            # should be able to recover the original pairwise alignments
            for key, value in got.items():
                self.assertEqual(value.to_dict(), orig[key].to_dict(), msg=refseq_name)

        # a plain string is not accepted as the reference sequence
        with self.assertRaises(TypeError):
            pairwise_to_multiple(pwise, "ACGG", DNA)

    def test_pairwise_to_multiple_2(self):
        """correctly handle alignments with gaps beyond end of query"""

        # cogent3.core.alignment.DataError: Not all sequences are the same length:
        # max is 425, min is 419
        def make_pwise(data, ref_name):
            # build [name, pairwise Alignment] entries; the reference
            # sequence is taken from the first pairwise alignment
            result = []
            for n, seqs in data.items():
                result.append([
                    n,
                    make_aligned_seqs(data=seqs, moltype="dna", array_align=False)
                ])
            ref_seq = result[0][1].get_seq(ref_name)
            return result, ref_seq

        pwise = {
            "Platypus": {
                "Opossum": "-----------------GTGC------GAT-------------------------------CCAAAAACCTGTGTC--ACCGT--------GCC----CAGAGCCTCC----CTCAGGCCGCTCGGGGAG---TG-------GCCCCCCG--GC-GGAGGGCAGGGATGGGGAGT-AGGGGTGGCAGTC----GGAACTGGAAGAGCTT-TACAAACC---------GA--------------------GGCT-AGAGGGTC-TGCTTAC-------TTTTTACCTTGG------------GTTTG-CCAGGAGGTAG----------AGGATGA-----------------CTAC--ATCAAG----AGC------------TGGG-------------",
                "Platypus": "CAGGATGACTACATCAAGAGCTGGGAAGATAACCAGCAAGGAGATGAAGCTCTGGACACTACCAAAGACCCCTGCCAGAACGTGAAGTGCAGCCGACACAAGGTCTGCATCGCTCAGGGCTACCAGAGAGCCATGTGTATCAGCCGCAAGAAGCTGGAGCACAGGATCAAGCAGCCAGCCCTGAAACTCCATGGAAACAGAGAGAGCTTCTGCAAGCCTTGTCACATGACCCAGCTGGCCTCTGTCTGCGGCTCGGACGGACACACTTACAGCTCCGTGTGCAAACTGGAGCAGCAGGCCTGTCTGACCAGCAAGCAGCTGACAGTCAAGTGTGAAGGCCAGTGCCCGTGCCCCACCGATCATGTTCCAGCCTCCACCGCTGATGGAAAACAAGAGACCT",
            },
            "Wombat": {
                "Opossum": "GTGCGATCCAAAAACCTGTGTCACCGTGCCCAGAGCCTCCCTCAGGCCGCTCGG-GGAGTGGCCCCCCGGCGGAGGGCAGGGATGGGGAGTAGGGGTGGCAGTCGGAACTGGAAGAGCTTTACAAACCGAGGCTAGAGGGTCTGCTTACTTTTTACCTTGG------GTTT--GC-CAGGA---GGT----AGAGGATGACTACATCAAGAGCTGGG---------------------------",
                "Wombat": "--------CA----------TCACCGC-CCCTGCACC---------CGGCTCGGCGGAGGGGGATTCTAA-GGGGGTCAAGGATGGCGAG-ACCCCTGGCAATTTCA--TGGAGGA------CGAGCAATGGCT-----GTC-GTCCATCTCCCAGTATAGCGGCAAGATCAAGCACTGGAACCGCTTCCGAGACGATGACTACATCAAGAGCTGGGAGGACAGTCAGCAAGGAGATGAAGCGC",
            },
        }
        pwise, ref_seq = make_pwise(pwise, "Opossum")
        aln = pairwise_to_multiple(pwise, ref_seq, ref_seq.moltype)
        self.assertNotIsInstance(aln, NotCompleted)

        # a much shorter case of the same shape
        pwise = {
            "Platypus": {
                "Opossum": "-----------------GTGC------GAT-------------------------------CCAAAAACCTGTGTC",
                "Platypus": "CAGGATGACTACATCAAGAGCTGGGAAGATAACCAGCAAGGAGATGAAGCTCTGGACACTACCAAAGACCCCTGCC",
            },
            "Wombat": {
                "Opossum": "GTGCGATCCAAAAACCTGTGTC",
                "Wombat": "--------CA----------TC",
            },
        }
        pwise, ref_seq = make_pwise(pwise, "Opossum")
        aln = pairwise_to_multiple(pwise, ref_seq, ref_seq.moltype)
        self.assertNotIsInstance(aln, NotCompleted)
class RefalignmentTests(TestCase):
    """tests of the align_to_ref and progressive_align apps"""

    # shared unaligned DNA collection and a guide tree over the same names
    seqs = make_unaligned_seqs(_seqs, moltype=DNA)
    treestring = "(Bandicoot:0.4,FlyingFox:0.05,(Rhesus:0.06,Human:0.0):0.04);"

    def test_align_to_ref(self):
        """correctly aligns to a reference"""
        aligner = align_app.align_to_ref(ref_seq="Human")
        aln = aligner(self.seqs)
        expect = {
            "Bandicoot": "---NACTCATTAATGCTTGAAACCAGCAGTTTATTGTCCAAC",
            "FlyingFox": "GCCAGCTCTTTACAGCATGAGAACAG---TTTATTATACACT",
            "Human": "GCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT",
            "Rhesus": "GCCAGCTCATTACAGCATGAGAAC---AGTTTGTTACTCACT",
        }
        self.assertEqual(aln.to_dict(), expect)

    def test_align_to_ref_generic_moltype(self):
        """tests when the moltype is generic"""
        test_moltypes = [
            "text",
            "rna",
            "protein",
            "protein_with_stop",
            "bytes",
            "ab",
        ]
        for test_moltype in test_moltypes:
            aligner = align_app.align_to_ref(moltype=test_moltype)
            self.assertEqual(aligner._moltype.label, test_moltype)
            # the scoring matrix must be the generic one for this moltype
            self.assertEqual(
                aligner._kwargs["S"],
                make_generic_scoring_dict(10, get_moltype(test_moltype)),
            )

    def test_align_to_ref_result_has_moltype(self):
        """aligned object has correct moltype"""
        aligner = align_app.align_to_ref(moltype="dna")
        got = aligner(self.seqs)
        self.assertEqual(got.moltype.label, "dna")

    def test_progressive_align_protein_moltype(self):
        """tests guide_tree is None and moltype is protein"""
        from cogent3 import load_aligned_seqs

        seqs = load_aligned_seqs("data/nexus_aa.nxs", moltype="protein")
        seqs = seqs.degap()
        seqs = seqs.take_seqs(["Rat", "Cow", "Human", "Mouse", "Whale"])
        aligner = align_app.progressive_align(model="WG01")
        got = aligner(seqs)
        self.assertNotIsInstance(got, NotCompleted)
        aligner = align_app.progressive_align(model="protein")
        got = aligner(seqs)
        self.assertNotIsInstance(got, NotCompleted)

    def test_progressive_align_nuc(self):
        """progressive alignment with nuc models"""
        aligner = align_app.progressive_align(model="TN93", distance="TN93")
        aln = aligner(self.seqs)
        expect = {
            "Rhesus": "GCCAGCTCATTACAGCATGAGAACAG---TTTGTTACTCACT",
            "Human": "GCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT",
            "Bandicoot": "NACTCATTAATGCTTGAAACCAGCAG---TTTATTGTCCAAC",
            "FlyingFox": "GCCAGCTCTTTACAGCATGAGAACAG---TTTATTATACACT",
        }
        got = aln.to_dict()
        self.assertEqual(got, expect)

        # using default
        # NOTE(review): the arguments are identical to the call above, so no
        # default is actually exercised here -- confirm what was intended
        aligner = align_app.progressive_align(model="TN93", distance="TN93")
        aln = aligner(self.seqs)
        self.assertEqual(len(aln), 42)
        self.assertEqual(aln.moltype, aligner._moltype)
        # todo the following is not robust across operating systems
        # so commenting out for now, but needs to be checked
        # expect = {'Human': 'GCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACT',
        #           'Rhesus': 'GCCAGCTCATTACAGCATGAGAA---CAGTTTGTTACTCACT',
        #           'Bandicoot': 'NACTCATTAATGCTTGAAACCAG---CAGTTTATTGTCCAAC',
        #           'FlyingFox': 'GCCAGCTCTTTACAGCATGAGAA---CAGTTTATTATACACT'}
        # got = aln.to_dict()
        # self.assertEqual(got, expect)

    def test_progressive_fails(self):
        """should return NotCompletedResult along with message"""
        # Bandicoot has an in-frame stop codon
        seqs = make_unaligned_seqs(
            data={
                "Human": "GCCTCA",
                "Rhesus": "GCCAGCTCA",
                "Bandicoot": "TGATCATTA",
            },
            moltype="dna",
        )
        aligner = align_app.progressive_align(model="codon")
        got = aligner(seqs)
        # assertTrue(type(got), NotCompleted) could never fail: the second
        # argument is only the failure message, so check the type explicitly
        self.assertIsInstance(got, NotCompleted)

    def test_progress_with_guide_tree(self):
        """progressive align works with provided guide tree"""
        tree = make_tree(treestring=self.treestring)
        # guide tree given as a newick string
        aligner = align_app.progressive_align(
            model="nucleotide", guide_tree=self.treestring
        )
        aln = aligner(self.seqs)
        self.assertEqual(len(aln), 42)
        # guide tree given as a tree object
        aligner = align_app.progressive_align(model="nucleotide", guide_tree=tree)
        aln = aligner(self.seqs)
        self.assertEqual(len(aln), 42)
        # even if it has underscores in name
        treestring = (
            "(Bandicoot:0.4,FlyingFox:0.05,(Rhesus_macaque:0.06,"
            "Human:0.0):0.04);"
        )
        aligner = align_app.progressive_align(
            model="nucleotide", guide_tree=treestring
        )
        data = self.seqs.to_dict()
        data["Rhesus macaque"] = data.pop("Rhesus")
        seqs = make_unaligned_seqs(data)
        aln = aligner(seqs)
        self.assertEqual(len(aln), 42)
        # guide tree with no lengths raises value error
        with self.assertRaises(ValueError):
            _ = align_app.progressive_align(
                model="nucleotide",
                guide_tree="(Bandicoot,FlyingFox,(Rhesus_macaque,Human));",
            )

    def test_progressive_align_codon(self):
        """progressive alignment with codon models"""
        aligner = align_app.progressive_align(model="GY94")
        aln = aligner(self.seqs)
        self.assertEqual(len(aln), 42)
        aligner = align_app.progressive_align(model="codon")
        aln = aligner(self.seqs)
        self.assertEqual(len(aln), 42)

    def test_pickle_progressive_align(self):
        """test progressive_align is picklable"""
        from pickle import dumps, loads

        aligner = align_app.progressive_align(model="codon")
        aln = aligner(self.seqs)
        # NOTE(review): this round-trips the alignment result, not the
        # aligner app itself -- confirm which object was meant to be pickled
        got = loads(dumps(aln))
        self.assertTrue(got)

    def test_with_genetic_code(self):
        """handles genetic code argument"""
        aligner = align_app.progressive_align(model="GY94", gc="2")
        # the 'TGA' codon is a sense codon in vertebrate mitochondrial
        self.assertIn("TGA", aligner._model.get_motifs())
        aligner = align_app.progressive_align(model="codon")
        # but a stop codon in the standard nuclear
        self.assertNotIn("TGA", aligner._model.get_motifs())
        # supplying a genetic code together with a nucleotide model is an error
        with self.assertRaises(TypeError):
            align_app.progressive_align(model="nucleotide", gc="2")

    def test_progressive_align_protein(self):
        """progressive alignment with protein models"""
        seqs = self.seqs.get_translation()
        aligner = align_app.progressive_align(
            model="WG01", guide_tree=self.treestring
        )
        aln = aligner(seqs)
        self.assertEqual(len(aln), 14)
        aligner = align_app.progressive_align(
            model="protein", guide_tree=self.treestring
        )
        aln = aligner(seqs)
        self.assertEqual(len(aln), 14)
def get_one2one_orthologs(
    compara, ref_genes, outpath, not_strict, force_overwrite, test
):
    """writes one-to-one orthologs of protein coding genes to outpath

    Parameters
    ----------
    compara
        object providing the ``species`` attribute and the
        ``get_related_genes()`` method
    ref_genes
        stable IDs of the reference genes to query
    outpath
        directory that receives one ``<gene>.fa.gz`` file per gene and
        ``metadata.tsv``
    not_strict
        if False, a gene is skipped unless the species of its ortholog
        group match ``compara.species`` exactly
    force_overwrite
        if False, a gene whose output file already exists is counted as
        written and not regenerated
    test
        if True, the sequences are printed instead of being written

    Notes
    -----
    A gene is skipped when the query does not produce exactly one related
    group, or when that group is None (in both strict and non-strict mode).
    """
    species = Counter(compara.species)
    written = 0
    records = []
    with click.progressbar(ref_genes, label="Finding 1to1 orthologs") as ids:
        for gene in ids:
            outfile_name = os.path.join(outpath, "%s.fa.gz" % gene)
            if os.path.exists(outfile_name) and not force_overwrite:
                written += 1
                continue

            syntenic = list(
                compara.get_related_genes(
                    stableid=gene, relationship="ortholog_one2one"
                )
            )
            if len(syntenic) != 1:
                continue

            syntenic = syntenic[0]
            if syntenic is None:
                # nothing to extract; this used to be checked only in strict
                # mode, so not_strict=True failed on syntenic.members below
                continue

            if not not_strict and Counter(syntenic.get_species_set()) != species:
                # skipping, not all species had a 1to1 ortholog for this gene
                continue

            seqs = []
            for m in syntenic.members:
                records.append([gene, m.stableid, m.location, m.description])
                name = Species.get_common_name(m.genome.species)
                cds = m.canonical_transcript.cds.trim_stop_codon(allow_partial=True)
                cds.name = name
                seqs.append([name, cds])

            seqs = make_unaligned_seqs(data=seqs)
            if test:
                print()
                print(gene)
                print(seqs.to_fasta())
            else:
                with gzip.open(outfile_name, "wt") as outfile:
                    outfile.write(seqs.to_fasta() + "\n")
                LOGGER.output_file(outfile_name)

            written += 1

    if test:
        msg = "Would have written %d files to %s" % (written, outpath)
    else:
        msg = "Wrote %d files to %s" % (written, outpath)

    click.echo(msg)

    # NOTE(review): metadata.tsv is written even when test is True, and it
    # holds no rows for genes skipped because their file already existed --
    # confirm both are intended
    if written > 0:
        metadata = make_table(
            header=["refid", "stableid", "location", "description"], rows=records
        )
        metadata.write(os.path.join(outpath, "metadata.tsv"))
class FastSlowDistTests(TestCase):
    """tests of the fast_slow_dist distance app"""

    # unaligned collections shared by the tests; seqs5 is protein, rest DNA
    seqs1 = make_unaligned_seqs(_seqs1, moltype=DNA)
    seqs2 = make_unaligned_seqs(_seqs2, moltype=DNA)
    seqs3 = make_unaligned_seqs(_seqs3, moltype=DNA)
    seqs4 = make_unaligned_seqs(_seqs4, moltype=DNA)
    seqs5 = make_unaligned_seqs(_seqs5, moltype=PROTEIN)

    def test_init(self):
        """tests if fast_slow_dist can be initialised correctly"""
        calc = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        self.assertIsInstance(calc.fast_calc, HammingPair)
        self.assertIsNone(calc._sm)

        calc = dist_app.fast_slow_dist(distance="TN93")
        self.assertIsInstance(calc.fast_calc, TN93Pair)
        self.assertEqual(calc._sm.name, "TN93")

        calc = dist_app.fast_slow_dist(distance="GTR")
        self.assertEqual(calc._sm.name, "GTR")

        calc = dist_app.fast_slow_dist(slow_calc="TN93")
        self.assertEqual(calc._sm.name, "TN93")
        self.assertIsNone(calc.fast_calc)

        # argument combinations that must be rejected
        invalid = (
            dict(distance="TN93", fast_calc="TN93", slow_calc="TN93"),
            dict(fast_calc="GTR"),
            dict(slow_calc="hamming"),
        )
        for kwargs in invalid:
            with self.assertRaises(ValueError):
                dist_app.fast_slow_dist(**kwargs)

    def test_compatible_parameters(self):
        """tests if the input parameters are compatible with fast_slow_dist initialisation"""
        # these must all construct without raising
        valid = (
            dict(fast_calc="hamming", moltype="dna"),
            dict(fast_calc="TN93"),
            dict(slow_calc="GTR"),
            dict(fast_calc="TN93"),
        )
        for kwargs in valid:
            dist_app.fast_slow_dist(**kwargs)

        invalid = (
            # fails for paralinear or hamming if no moltype
            dict(slow_calc="hamming"),
            dict(slow_calc="paralinear"),
            # fails for hamming as slow_calc
            dict(slow_calc="hamming", moltype="dna"),
            dict(fast_calc="GTR"),
        )
        for kwargs in invalid:
            with self.assertRaises(ValueError):
                dist_app.fast_slow_dist(**kwargs)

    def test_composable_apps(self):
        """tests two composable apps"""
        dist_calc = dist_app.fast_slow_dist(fast_calc="hamming", moltype="dna")
        for app in _get_all_composable_apps():
            # Compose two composable applications, there should not be exceptions.
            composed = app + dist_calc
            self.assertIsInstance(composed, dist_app.fast_slow_dist)
            self.assertEqual(composed._type, "distance")
            self.assertIs(composed.input, app)
            self.assertIsNone(composed.output)
            self.assertIsInstance(composed._input_types, frozenset)
            self.assertIsInstance(composed._output_types, frozenset)
            self.assertIs(composed._in, app)
            self.assertIsNone(composed._out)
            # detach both ends so the next iteration can compose again
            app.disconnect()
            dist_calc.disconnect()

    def test_est_dist_pair_slow(self):
        """tests the distance between seq pairs in aln"""

        def slow_dists(aln, model):
            # pairwise distances keyed by (name, name) from the slow calculator
            calc = dist_app.fast_slow_dist(slow_calc=model)
            return calc(aln).to_dict()

        def check_symmetric(dists):
            assert_allclose(dists[("Human", "Mouse")], dists[("Mouse", "Human")])

        def check_non_negative(dists, pair):
            self.assertGreaterEqual(dists[pair], 0)

        mouse_human = ("Mouse", "Human")
        human_opossum = ("Human", "Opossum")

        # seqs3, default reference
        aln = align.align_to_ref()(self.seqs3)
        for model in ("GTR", "TN93"):
            dists = slow_dists(aln, model)
            check_symmetric(dists)
            check_non_negative(dists, mouse_human)

        # seqs3, Human as reference
        aln = align.align_to_ref(ref_seq="Human")(self.seqs3)
        dists = slow_dists(aln, "GTR")
        check_symmetric(dists)
        dists = slow_dists(aln, "TN93")
        check_symmetric(dists)
        check_non_negative(dists, mouse_human)

        # seqs3, Mouse as reference
        aln = align.align_to_ref(ref_seq="Mouse")(self.seqs3)
        for model in ("GTR", "TN93"):
            check_non_negative(slow_dists(aln, model), mouse_human)

        # seqs4, with the default reference and then each named reference
        for ref_kwargs in ({}, {"ref_seq": "Human"}, {"ref_seq": "Opossum"}):
            aln = align.align_to_ref(**ref_kwargs)(self.seqs4)
            for model in ("GTR", "TN93"):
                check_non_negative(slow_dists(aln, model), human_opossum)

        # protein progressive alignment is only exercised; result is unchecked
        protein_aligner = align.progressive_align(
            model="WG01", guide_tree="(Human:0.2,Bandicoot:0.2)"
        )
        _ = protein_aligner(self.seqs5)
def test_take_n_seqs(self):
    """select specified number of sequences from a collection"""
    all_seqs = make_unaligned_seqs(
        data={
            "a": "ACGT",
            "b": "ACG-",
            "c": "ACGN",
            "d": "ACGG",
            "e": "ACGG",
            "k": "ACGG",
            "f": "RAAA",
            "g": "YAAA",
            "h": "GGGG",
        }
    )
    subset = all_seqs.take_seqs(["a", "c", "e", "g", "h"])

    # by order, fixed
    picker = sample.take_n_seqs(3, fixed_choice=True)
    self.assertEqual(len(picker(all_seqs).names), 3)
    # this should return NotCompleted because the names selected from the
    # first collection are applied to the next one
    self.assertIsInstance(picker(subset), NotCompleted)

    # this should fail because too few seqs
    picker = sample.take_n_seqs(30)
    self.assertIsInstance(picker(all_seqs), NotCompleted)

    # by order, not fixed
    picker = sample.take_n_seqs(3, fixed_choice=False)
    from_all = picker(all_seqs)
    from_subset = picker(subset)
    self.assertNotEqual(set(from_all.names), set(from_subset.names))

    # random choice, fixed
    picker = sample.take_n_seqs(3, random=True, fixed_choice=True)
    self.assertEqual(picker._fixed_choice, True)
    from_subset = picker(subset)
    from_all = picker(all_seqs)
    self.assertEqual(from_subset.names, from_all.names)

    # random choice, not fixed
    picker = sample.take_n_seqs(2, random=True, fixed_choice=False)
    self.assertEqual(picker._fixed_choice, False)
    # testing this is hard, we simply expect the labels to differ on a
    # subsequent call. The probability of drawing a specific pair of names
    # on one call is 1/(9 choose 2) = 1/36, so the probability that 11
    # further draws all match the first is ~=0
    baseline = picker(all_seqs)
    different = any(
        baseline.names != picker(all_seqs).names for _ in range(11)
    )
    self.assertTrue(different, msg="failed to generate different random sample")

    # try setting the seed
    picker = sample.take_n_seqs(2, random=True, seed=123)
    self.assertNotIsInstance(picker(all_seqs), NotCompleted)