class TestSearchBadMAF(unittest.TestCase):
    """Check MafIndex.search behaviour on an incorrectly-formatted MAF file."""

    def setUp(self):
        """Open the index over the bad MAF file and check its length is 48."""
        self.idx = MafIndex(
            "MAF/ucsc_mm9_chr10_bad.mafindex",
            "MAF/ucsc_mm9_chr10_bad.maf",
            "mm9.chr10",
        )
        self.assertEqual(len(self.idx), 48)

    def test_incorrect_bundle_coords(self):
        """Requesting the first hit for this range must raise ValueError."""
        starts = (3013219,)
        ends = (3013319,)
        hits = self.idx.search(starts, ends)
        # The error is expected when the first result is requested,
        # not when search() itself is called.
        with self.assertRaises(ValueError):
            next(hits)
class TestSearchBadMAF(unittest.TestCase):
    """Index searching against a MAF file that is incorrectly formatted."""

    def setUp(self):
        """Build the index for mm9.chr10 and verify that its length is 48."""
        index_path = "MAF/ucsc_mm9_chr10_bad.mafindex"
        maf_path = "MAF/ucsc_mm9_chr10_bad.maf"
        self.idx = MafIndex(index_path, maf_path, "mm9.chr10")
        self.assertEqual(len(self.idx), 48)

    def test_incorrect_bundle_coords(self):
        """Fetching the first result for the range must raise ValueError."""
        results = self.idx.search((3013219,), (3013319,))
        with self.assertRaises(ValueError):
            # The failure is expected on the first next() call.
            next(results)
class TestSearchGoodMAF(unittest.TestCase):
    """Exercise MafIndex.search and get_spliced on a properly-formatted MAF."""

    def setUp(self):
        """Open the index for mm9.chr10 and verify that its length is 48."""
        self.idx = MafIndex(
            "MAF/ucsc_mm9_chr10.mafindex",
            "MAF/ucsc_mm9_chr10.maf",
            "mm9.chr10",
        )
        self.assertEqual(len(self.idx), 48)

    def test_invalid_type_1(self):
        """A string among the end coordinates must raise TypeError."""
        hits = self.idx.search((500, 1000), ("string", 1500))
        with self.assertRaises(TypeError):
            next(hits)

    def test_invalid_type_2(self):
        """A float among the end coordinates must raise TypeError."""
        hits = self.idx.search((500, 1000), (750, 1500.25))
        with self.assertRaises(TypeError):
            next(hits)

    def test_invalid_exon_count(self):
        """Three starts paired with two ends must raise ValueError."""
        hits = self.idx.search((0, 1000, 2000), (500, 1500))
        with self.assertRaises(ValueError):
            next(hits)

    def test_invalid_exon_schema(self):
        """This start/end layout must be rejected with ValueError."""
        hits = self.idx.search((0, 1000, 2000), (250, 500, 2500))
        with self.assertRaises(ValueError):
            next(hits)

    def test_correct_retrieval_1(self):
        """Correct retrieval of Cnksr3 in mouse."""
        blocks = list(self.idx.search((3014742, 3018161), (3015028, 3018644)))
        self.assertEqual(len(blocks), 4 + 4)

        block_sizes = {len(block) for block in blocks}
        self.assertEqual(block_sizes, {4, 1, 9, 10, 4, 3, 5, 1})

        # Code formatting note:
        # Expected start coordinates are grouped by alignment blocks
        found_starts = {
            record.annotations["start"] for block in blocks for record in block
        }
        expected_starts = {
            3014742, 6283, 184202, 1257,
            3014778,
            3014795, 184257, 6365, 15871286, 16389854, 16169492, 171521, 7816, 1309,
            3014842, 1371, 7842, 171548, 16169512, 16389874, 15871306, 6404, 184317, 14750994,
            3018161, 16390178, 15871611, 16169818,
            3018230, 15871676, 16390243,
            3018359, 16390338, 15871771, 184712, 16169976,
            3018482,
        }
        self.assertEqual(found_starts, expected_starts)

    def test_correct_retrieval_2(self):
        """Check block count, block sizes and starts for a second range pair."""
        blocks = list(self.idx.search((3009319, 3021421), (3012566, 3021536)))
        self.assertEqual(len(blocks), 6)

        block_sizes = {len(block) for block in blocks}
        self.assertEqual(block_sizes, {2, 4, 5, 14, 7, 6})

        # Code formatting note:
        # Expected start coordinates are grouped by alignment blocks
        found_starts = {
            record.annotations["start"] for block in blocks for record in block
        }
        expected_starts = {
            3009319, 11087,
            3012076, 16160203, 16379004, 15860456,
            3012441, 15860899, 16379447, 16160646, 180525,
            3021421, 9910, 996, 16173434, 16393782, 15875216, 11047, 175213, 3552, 677, 78072203, 3590, 95587, 14757054,
            3021465, 9957, 16173483, 16393831, 15875265, 78072243, 14757099,
            3021494, 16173516, 16393864, 15875298, 78072287, 14757144,
        }
        self.assertEqual(found_starts, expected_starts)

    def test_correct_retrieval_3(self):
        """Following issue 1083.

        https://github.com/biopython/biopython/issues/1083
        """
        origin = 3012076
        blocks = list(
            self.idx.search((origin, origin + 300), (origin + 100, origin + 400))
        )
        self.assertEqual(len(blocks), 2)

        block_sizes = {len(block) for block in blocks}
        self.assertEqual(block_sizes, {4, 5})

        # Code formatting note:
        # Expected start coordinates are grouped by alignment blocks
        found_starts = {
            record.annotations["start"] for block in blocks for record in block
        }
        expected_starts = {
            3012076, 16160203, 16379004, 15860456,
            3012441, 15860899, 16379447, 16160646, 180525,
        }
        self.assertEqual(found_starts, expected_starts)

    def test_correct_block_boundary(self):
        """Following issues 504 and 1086.

        https://github.com/biopython/biopython/pull/504
        https://github.com/biopython/biopython/pull/1086#issuecomment-285080702

        We test what happens at the boundary between these two MAF blocks:

        a score=19159.000000
        s mm9.chr10 3014644 45 + 129993255 CCTGTACC---CTTTGGTGAGAATTTTTGTTTCAGTGTTAAAAGTTTG
        s hg18.chr6 15870786 46 - 170899992 CCTATACCTTTCTTTTATGAGAA-TTTTGTTTTAATCCTAAAC-TTTT
        i hg18.chr6 I 9085 C 0
        s panTro2.chr6 16389355 46 - 173908612 CCTATACCTTTCTTTTATGAGAA-TTTTGTTTTAATCCTAAAC-TTTT
        q panTro2.chr6 99999999999999999999999-9999999999999999999-9999
        i panTro2.chr6 I 9106 C 0
        s calJac1.Contig6394 6182 46 + 133105 CCTATACCTTTCTTTCATGAGAA-TTTTGTTTGAATCCTAAAC-TTTT
        i calJac1.Contig6394 N 0 C 0
        s loxAfr1.scaffold_75566 1167 34 - 10574 ------------TTTGGTTAGAA-TTATGCTTTAATTCAAAAC-TTCC
        q loxAfr1.scaffold_75566 ------------99999699899-9999999999999869998-9997
        i loxAfr1.scaffold_75566 N 0 C 0
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e otoGar1.scaffold_334.1-359464 181217 2931 - 359464 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I

        a score=40840.000000
        s mm9.chr10 3014689 53 + 129993255 GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCT-TTGGAAAGAGTTG
        s hg18.chr6 15870832 53 - 170899992 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        i hg18.chr6 C 0 I 401
        s panTro2.chr6 16389401 53 - 173908612 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        q panTro2.chr6 9999999999999999999999999999999999999999-9999999999999
        i panTro2.chr6 C 0 I 400
        s calJac1.Contig6394 6228 53 + 133105 GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTT-TGGGAAACAGTGG
        i calJac1.Contig6394 C 0 I 2
        s otoGar1.scaffold_334.1-359464 184148 52 - 359464 GGAAGCATAAACT-TTTAATCTATGAAATATCAAATCACT-TGGGCAATAGCTG
        q otoGar1.scaffold_334.1-359464 7455455669566-99665699769895555689997599-9984787795599
        i otoGar1.scaffold_334.1-359464 I 2931 I 2
        s loxAfr1.scaffold_75566 1201 54 - 10574 GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG
        q loxAfr1.scaffold_75566 899989799999979999999999999999797999999999999999999999
        i loxAfr1.scaffold_75566 C 0 I 2
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I
        """

        def count_hits(start, end):
            # Number of alignments yielded for the single range [start, end).
            return len(list(self.idx.search([start], [end])))

        # Segments ending at the end of the first block
        self.assertEqual(count_hits(3014687, 3014689), 1)
        self.assertEqual(count_hits(3014688, 3014689), 1)

        # Segments starting at the beginning of the second block
        self.assertEqual(count_hits(3014689, 3014690), 1)
        self.assertEqual(count_hits(3014689, 3014691), 1)

        # Segments overlapping the 2 blocks
        self.assertEqual(count_hits(3014688, 3014690), 2)
        self.assertEqual(count_hits(3014687, 3014690), 2)
        self.assertEqual(count_hits(3014687, 3014691), 2)

    def test_correct_block_length(self):
        """Following issues 504 and 1086.

        https://github.com/biopython/biopython/pull/504
        https://github.com/biopython/biopython/pull/1086#issuecomment-285080702

        We get the alignment corresponding to the following whole MAF block
        and check that the lengths of its sequences are correct:

        a score=40840.000000
        s mm9.chr10 3014689 53 + 129993255 GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCT-TTGGAAAGAGTTG
        s hg18.chr6 15870832 53 - 170899992 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        i hg18.chr6 C 0 I 401
        s panTro2.chr6 16389401 53 - 173908612 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        q panTro2.chr6 9999999999999999999999999999999999999999-9999999999999
        i panTro2.chr6 C 0 I 400
        s calJac1.Contig6394 6228 53 + 133105 GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTT-TGGGAAACAGTGG
        i calJac1.Contig6394 C 0 I 2
        s otoGar1.scaffold_334.1-359464 184148 52 - 359464 GGAAGCATAAACT-TTTAATCTATGAAATATCAAATCACT-TGGGCAATAGCTG
        q otoGar1.scaffold_334.1-359464 7455455669566-99665699769895555689997599-9984787795599
        i otoGar1.scaffold_334.1-359464 I 2931 I 2
        s loxAfr1.scaffold_75566 1201 54 - 10574 GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG
        q loxAfr1.scaffold_75566 899989799999979999999999999999797999999999999999999999
        i loxAfr1.scaffold_75566 C 0 I 2
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I
        """
        block_start = 3014689
        spliced = self.idx.get_spliced([block_start], [block_start + 53])
        seq_by_id = {}
        for record in spliced:
            seq_by_id[record.id] = record.seq

        expected_lengths = {
            "mm9.chr10": 53,
            "hg18.chr6": 53,
            "panTro2.chr6": 53,
            "calJac1.Contig6394": 53,
            "otoGar1.scaffold_334.1-359464": 52,
            "loxAfr1.scaffold_75566": 54,
        }
        for name, expected in expected_lengths.items():
            ungapped = seq_by_id[name].ungap("-")
            self.assertEqual(len(ungapped), expected)

    def test_correct_spliced_sequences_1(self):
        """Checking that spliced sequences are correct.

        We get the alignment corresponding to the following whole MAF block
        and check that the sequences are correct:

        a score=40840.000000
        s mm9.chr10 3014689 53 + 129993255 GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCT-TTGGAAAGAGTTG
        s hg18.chr6 15870832 53 - 170899992 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        i hg18.chr6 C 0 I 401
        s panTro2.chr6 16389401 53 - 173908612 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        q panTro2.chr6 9999999999999999999999999999999999999999-9999999999999
        i panTro2.chr6 C 0 I 400
        s calJac1.Contig6394 6228 53 + 133105 GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTT-TGGGAAACAGTGG
        i calJac1.Contig6394 C 0 I 2
        s otoGar1.scaffold_334.1-359464 184148 52 - 359464 GGAAGCATAAACT-TTTAATCTATGAAATATCAAATCACT-TGGGCAATAGCTG
        q otoGar1.scaffold_334.1-359464 7455455669566-99665699769895555689997599-9984787795599
        i otoGar1.scaffold_334.1-359464 I 2931 I 2
        s loxAfr1.scaffold_75566 1201 54 - 10574 GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG
        q loxAfr1.scaffold_75566 899989799999979999999999999999797999999999999999999999
        i loxAfr1.scaffold_75566 C 0 I 2
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I
        """
        block_start = 3014689
        spliced = self.idx.get_spliced([block_start], [block_start + 53])
        seq_by_id = {}
        for record in spliced:
            seq_by_id[record.id] = record.seq

        expected_sequences = {
            "mm9.chr10": "GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCTTTGGAAAGAGTTG",
            "hg18.chr6": "GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTTTGGGAAATAGTGG",
            "panTro2.chr6": "GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTTTGGGAAATAGTGG",
            "calJac1.Contig6394": "GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTTTGGGAAACAGTGG",
            "otoGar1.scaffold_334.1-359464": "GGAAGCATAAACTTTTAATCTATGAAATATCAAATCACTTGGGCAATAGCTG",
            "loxAfr1.scaffold_75566": "GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG",
        }
        for name, expected in expected_sequences.items():
            ungapped = seq_by_id[name].ungap("-")
            self.assertEqual(ungapped, expected)

    def test_correct_spliced_sequences_2(self):
        """Checking that spliced sequences are correct.

        We get spliced alignments from following MAF blocks
        and check that the sequences are correct:

        a score=19159.000000
        s mm9.chr10 3014644 45 + 129993255 CCTGTACC---CTTTGGTGAGAATTTTTGTTTCAGTGTTAAAAGTTTG
        s hg18.chr6 15870786 46 - 170899992 CCTATACCTTTCTTTTATGAGAA-TTTTGTTTTAATCCTAAAC-TTTT
        i hg18.chr6 I 9085 C 0
        s panTro2.chr6 16389355 46 - 173908612 CCTATACCTTTCTTTTATGAGAA-TTTTGTTTTAATCCTAAAC-TTTT
        q panTro2.chr6 99999999999999999999999-9999999999999999999-9999
        i panTro2.chr6 I 9106 C 0
        s calJac1.Contig6394 6182 46 + 133105 CCTATACCTTTCTTTCATGAGAA-TTTTGTTTGAATCCTAAAC-TTTT
        i calJac1.Contig6394 N 0 C 0
        s loxAfr1.scaffold_75566 1167 34 - 10574 ------------TTTGGTTAGAA-TTATGCTTTAATTCAAAAC-TTCC
        q loxAfr1.scaffold_75566 ------------99999699899-9999999999999869998-9997
        i loxAfr1.scaffold_75566 N 0 C 0
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e otoGar1.scaffold_334.1-359464 181217 2931 - 359464 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I

        a score=40840.000000
        s mm9.chr10 3014689 53 + 129993255 GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCT-TTGGAAAGAGTTG
        s hg18.chr6 15870832 53 - 170899992 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        i hg18.chr6 C 0 I 401
        s panTro2.chr6 16389401 53 - 173908612 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        q panTro2.chr6 9999999999999999999999999999999999999999-9999999999999
        i panTro2.chr6 C 0 I 400
        s calJac1.Contig6394 6228 53 + 133105 GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTT-TGGGAAACAGTGG
        i calJac1.Contig6394 C 0 I 2
        s otoGar1.scaffold_334.1-359464 184148 52 - 359464 GGAAGCATAAACT-TTTAATCTATGAAATATCAAATCACT-TGGGCAATAGCTG
        q otoGar1.scaffold_334.1-359464 7455455669566-99665699769895555689997599-9984787795599
        i otoGar1.scaffold_334.1-359464 I 2931 I 2
        s loxAfr1.scaffold_75566 1201 54 - 10574 GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG
        q loxAfr1.scaffold_75566 899989799999979999999999999999797999999999999999999999
        i loxAfr1.scaffold_75566 C 0 I 2
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I
        """
        exon_starts = [3014644, 3014689]
        exon_ends = [3014644 + 45, 3014689 + 53]
        spliced = self.idx.get_spliced(exon_starts, exon_ends)
        seq_by_id = {}
        for record in spliced:
            seq_by_id[record.id] = record.seq

        expected_sequences = {
            "mm9.chr10": "CCTGTACCCTTTGGTGAGAATTTTTGTTTCAGTGTTAAAAGTTTGGGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCTTTGGAAAGAGTTG",
            "hg18.chr6": "CCTATACCTTTCTTTTATGAGAATTTTGTTTTAATCCTAAACTTTTGGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTTTGGGAAATAGTGG",
            "panTro2.chr6": "CCTATACCTTTCTTTTATGAGAATTTTGTTTTAATCCTAAACTTTTGGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTTTGGGAAATAGTGG",
            "calJac1.Contig6394": "CCTATACCTTTCTTTCATGAGAATTTTGTTTGAATCCTAAACTTTTGGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTTTGGGAAACAGTGG",
            "otoGar1.scaffold_334.1-359464": "GGAAGCATAAACTTTTAATCTATGAAATATCAAATCACTTGGGCAATAGCTG",
            "loxAfr1.scaffold_75566": "TTTGGTTAGAATTATGCTTTAATTCAAAACTTCCGGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG",
        }
        for name, expected in expected_sequences.items():
            ungapped = seq_by_id[name].ungap("-")
            self.assertEqual(ungapped, expected)
class TestSearchGoodMAF(unittest.TestCase):
    """Test index searching on a properly-formatted MAF."""

    def setUp(self):
        """Open the mm9.chr10 index and check that its length is 48."""
        self.idx = MafIndex(
            "MAF/ucsc_mm9_chr10.mafindex",
            "MAF/ucsc_mm9_chr10.maf",
            "mm9.chr10",
        )
        self.assertEqual(len(self.idx), 48)

    def test_invalid_type_1(self):
        """A string among the end coordinates raises TypeError on first next()."""
        search = self.idx.search((500, 1000), ("string", 1500))
        self.assertRaises(TypeError, next, search)

    def test_invalid_type_2(self):
        """A float among the end coordinates raises TypeError on first next()."""
        search = self.idx.search((500, 1000), (750, 1500.25))
        self.assertRaises(TypeError, next, search)

    def test_invalid_exon_count(self):
        """Three starts with only two ends raise ValueError on first next()."""
        search = self.idx.search((0, 1000, 2000), (500, 1500))
        self.assertRaises(ValueError, next, search)

    def test_invalid_exon_schema(self):
        """This start/end layout raises ValueError on first next()."""
        search = self.idx.search((0, 1000, 2000), (250, 500, 2500))
        self.assertRaises(ValueError, next, search)

    def test_correct_retrieval_1(self):
        """Check result count, result sizes and start coordinates."""
        search = self.idx.search((3014742, 3018161), (3015028, 3018644))
        results = list(search)

        self.assertEqual(len(results), 12)

        # Sizes are compared as a set, so repeated values only document
        # that several results share the same size.
        self.assertEqual(
            {len(x) for x in results},
            {5, 10, 7, 6, 3, 1, 1, 1, 2, 4, 4, 9},
        )

        self.assertEqual(
            {x.annotations["start"] for y in results for x in y},
            {
                3018359, 16390338, 15871771, 184712, 16169512, 16169976,
                3014842, 1371, 7842, 171548, 16389874, 15871306,
                6404, 184317, 14750994, 3015028, 1616, 8040,
                171763, 16169731, 6627, 184539, 3014689, 15870832,
                16389401, 6228, 184148, 1201, 3018230, 15871676,
                16390243, 3014778, 3018482, 3017743, 3018644, 78070420,
                3014742, 6283, 184202, 1257, 3018161, 16390178,
                15871611, 16169818, 3014795, 184257, 6365, 15871286,
                16389854, 16169492, 171521, 7816, 1309,
            },
        )

    def test_correct_retrieval_2(self):
        """Check result count, result sizes and start coordinates."""
        search = self.idx.search((3009319, 3021421), (3012566, 3021536))
        results = list(search)

        self.assertEqual(len(results), 8)

        # Sizes are compared as a set, so repeated values only document
        # that several results share the same size.
        self.assertEqual(
            {len(x) for x in results},
            {14, 5, 2, 6, 7, 15, 6, 4},
        )

        self.assertEqual(
            {x.annotations["start"] for y in results for x in y},
            {
                3021421, 9910, 996, 16173434, 16393782, 15875216,
                11047, 175213, 3552, 677, 78072203, 3590,
                95587, 14757054, 3012441, 15860899, 16379447, 16160646,
                180525, 3009319, 11087, 3012566, 15861013, 16379561,
                16160760, 180626, 310, 3021465, 9957, 16173483,
                16393831, 15875265, 78072243, 14757099, 3021275, 9741,
                838, 16173265, 16393613, 15875047, 10878, 175057,
                3382, 521, 78072035, 73556, 3422, 95418,
                14756885, 3021494, 16173516, 16393864, 15875298, 78072287,
                14757144, 3012076, 16160203, 16379004, 15860456,
            },
        )

    def test_correct_retrieval_3(self):
        """Check result count, result sizes and start coordinates."""
        search = self.idx.search((3012076, 3012076 + 300),
                                 (3012076 + 100, 3012076 + 400))
        results = list(search)

        self.assertEqual(len(results), 2)

        self.assertEqual({len(x) for x in results}, {4, 5})

        # Code formatting note:
        # Expected start coordinates are grouped by alignment blocks
        self.assertEqual(
            {x.annotations["start"] for y in results for x in y},
            {
                3012076, 16160203, 16379004, 15860456,
                3012441, 15860899, 16379447, 16160646, 180525,
            },
        )
class TestSearchGoodMAF(unittest.TestCase):
    """Test index searching on a properly-formatted MAF."""

    def setUp(self):
        """Open the mm9.chr10 index and check that its length is 48."""
        self.idx = MafIndex(
            "MAF/ucsc_mm9_chr10.mafindex",
            "MAF/ucsc_mm9_chr10.maf",
            "mm9.chr10",
        )
        self.assertEqual(len(self.idx), 48)

    def test_invalid_type_1(self):
        """A string among the end coordinates raises TypeError on first next()."""
        search = self.idx.search((500, 1000), ("string", 1500))
        self.assertRaises(TypeError, next, search)

    def test_invalid_type_2(self):
        """A float among the end coordinates raises TypeError on first next()."""
        search = self.idx.search((500, 1000), (750, 1500.25))
        self.assertRaises(TypeError, next, search)

    def test_invalid_exon_count(self):
        """Three starts with only two ends raise ValueError on first next()."""
        search = self.idx.search((0, 1000, 2000), (500, 1500))
        self.assertRaises(ValueError, next, search)

    def test_invalid_exon_schema(self):
        """This start/end layout raises ValueError on first next()."""
        search = self.idx.search((0, 1000, 2000), (250, 500, 2500))
        self.assertRaises(ValueError, next, search)

    def test_correct_retrieval_1(self):
        """Correct retrieval of Cnksr3 in mouse."""
        search = self.idx.search((3014742, 3018161), (3015028, 3018644))
        results = list(search)

        self.assertEqual(len(results), 4 + 4)

        # Sizes are compared as a set, so repeated values only document
        # that several results share the same size.
        self.assertEqual(
            {len(x) for x in results},
            {4, 1, 9, 10, 4, 3, 5, 1},
        )

        # Code formatting note:
        # Expected start coordinates are grouped by alignment blocks
        self.assertEqual(
            {x.annotations["start"] for y in results for x in y},
            {
                3014742, 6283, 184202, 1257,
                3014778,
                3014795, 184257, 6365, 15871286, 16389854, 16169492, 171521, 7816, 1309,
                3014842, 1371, 7842, 171548, 16169512, 16389874, 15871306, 6404, 184317, 14750994,
                3018161, 16390178, 15871611, 16169818,
                3018230, 15871676, 16390243,
                3018359, 16390338, 15871771, 184712, 16169976,
                3018482,
            },
        )

    def test_correct_retrieval_2(self):
        """Check result count, result sizes and start coordinates."""
        search = self.idx.search((3009319, 3021421), (3012566, 3021536))
        results = list(search)

        self.assertEqual(len(results), 6)

        self.assertEqual(
            {len(x) for x in results},
            {2, 4, 5, 14, 7, 6},
        )

        # Code formatting note:
        # Expected start coordinates are grouped by alignment blocks
        self.assertEqual(
            {x.annotations["start"] for y in results for x in y},
            {
                3009319, 11087,
                3012076, 16160203, 16379004, 15860456,
                3012441, 15860899, 16379447, 16160646, 180525,
                3021421, 9910, 996, 16173434, 16393782, 15875216, 11047, 175213, 3552, 677, 78072203, 3590, 95587, 14757054,
                3021465, 9957, 16173483, 16393831, 15875265, 78072243, 14757099,
                3021494, 16173516, 16393864, 15875298, 78072287, 14757144,
            },
        )

    def test_correct_retrieval_3(self):
        """Following issue 1083.

        https://github.com/biopython/biopython/issues/1083
        """
        search = self.idx.search((3012076, 3012076 + 300),
                                 (3012076 + 100, 3012076 + 400))
        results = list(search)

        self.assertEqual(len(results), 2)

        self.assertEqual({len(x) for x in results}, {4, 5})

        # Code formatting note:
        # Expected start coordinates are grouped by alignment blocks
        self.assertEqual(
            {x.annotations["start"] for y in results for x in y},
            {
                3012076, 16160203, 16379004, 15860456,
                3012441, 15860899, 16379447, 16160646, 180525,
            },
        )

    def test_correct_block_boundary(self):
        """Following issues 504 and 1086.

        https://github.com/biopython/biopython/pull/504
        https://github.com/biopython/biopython/pull/1086#issuecomment-285080702

        We test what happens at the boundary between these two MAF blocks:

        a score=19159.000000
        s mm9.chr10 3014644 45 + 129993255 CCTGTACC---CTTTGGTGAGAATTTTTGTTTCAGTGTTAAAAGTTTG
        s hg18.chr6 15870786 46 - 170899992 CCTATACCTTTCTTTTATGAGAA-TTTTGTTTTAATCCTAAAC-TTTT
        i hg18.chr6 I 9085 C 0
        s panTro2.chr6 16389355 46 - 173908612 CCTATACCTTTCTTTTATGAGAA-TTTTGTTTTAATCCTAAAC-TTTT
        q panTro2.chr6 99999999999999999999999-9999999999999999999-9999
        i panTro2.chr6 I 9106 C 0
        s calJac1.Contig6394 6182 46 + 133105 CCTATACCTTTCTTTCATGAGAA-TTTTGTTTGAATCCTAAAC-TTTT
        i calJac1.Contig6394 N 0 C 0
        s loxAfr1.scaffold_75566 1167 34 - 10574 ------------TTTGGTTAGAA-TTATGCTTTAATTCAAAAC-TTCC
        q loxAfr1.scaffold_75566 ------------99999699899-9999999999999869998-9997
        i loxAfr1.scaffold_75566 N 0 C 0
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e otoGar1.scaffold_334.1-359464 181217 2931 - 359464 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I

        a score=40840.000000
        s mm9.chr10 3014689 53 + 129993255 GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCT-TTGGAAAGAGTTG
        s hg18.chr6 15870832 53 - 170899992 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        i hg18.chr6 C 0 I 401
        s panTro2.chr6 16389401 53 - 173908612 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        q panTro2.chr6 9999999999999999999999999999999999999999-9999999999999
        i panTro2.chr6 C 0 I 400
        s calJac1.Contig6394 6228 53 + 133105 GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTT-TGGGAAACAGTGG
        i calJac1.Contig6394 C 0 I 2
        s otoGar1.scaffold_334.1-359464 184148 52 - 359464 GGAAGCATAAACT-TTTAATCTATGAAATATCAAATCACT-TGGGCAATAGCTG
        q otoGar1.scaffold_334.1-359464 7455455669566-99665699769895555689997599-9984787795599
        i otoGar1.scaffold_334.1-359464 I 2931 I 2
        s loxAfr1.scaffold_75566 1201 54 - 10574 GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG
        q loxAfr1.scaffold_75566 899989799999979999999999999999797999999999999999999999
        i loxAfr1.scaffold_75566 C 0 I 2
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I
        """
        # Segments ending at the end of the first block
        search = self.idx.search([3014687], [3014689])
        self.assertEqual(len(list(search)), 1)
        search = self.idx.search([3014688], [3014689])
        self.assertEqual(len(list(search)), 1)

        # Segments starting at the beginning of the second block
        search = self.idx.search([3014689], [3014690])
        self.assertEqual(len(list(search)), 1)
        search = self.idx.search([3014689], [3014691])
        self.assertEqual(len(list(search)), 1)

        # Segments overlapping the 2 blocks
        search = self.idx.search([3014688], [3014690])
        self.assertEqual(len(list(search)), 2)
        search = self.idx.search([3014687], [3014690])
        self.assertEqual(len(list(search)), 2)
        search = self.idx.search([3014687], [3014691])
        self.assertEqual(len(list(search)), 2)

    def test_correct_block_length(self):
        """Following issues 504 and 1086.

        https://github.com/biopython/biopython/pull/504
        https://github.com/biopython/biopython/pull/1086#issuecomment-285080702

        We get the alignment corresponding to the following whole MAF block
        and check that the lengths of its sequences are correct:

        a score=40840.000000
        s mm9.chr10 3014689 53 + 129993255 GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCT-TTGGAAAGAGTTG
        s hg18.chr6 15870832 53 - 170899992 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        i hg18.chr6 C 0 I 401
        s panTro2.chr6 16389401 53 - 173908612 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        q panTro2.chr6 9999999999999999999999999999999999999999-9999999999999
        i panTro2.chr6 C 0 I 400
        s calJac1.Contig6394 6228 53 + 133105 GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTT-TGGGAAACAGTGG
        i calJac1.Contig6394 C 0 I 2
        s otoGar1.scaffold_334.1-359464 184148 52 - 359464 GGAAGCATAAACT-TTTAATCTATGAAATATCAAATCACT-TGGGCAATAGCTG
        q otoGar1.scaffold_334.1-359464 7455455669566-99665699769895555689997599-9984787795599
        i otoGar1.scaffold_334.1-359464 I 2931 I 2
        s loxAfr1.scaffold_75566 1201 54 - 10574 GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG
        q loxAfr1.scaffold_75566 899989799999979999999999999999797999999999999999999999
        i loxAfr1.scaffold_75566 C 0 I 2
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I
        """
        ali = self.idx.get_spliced([3014689], [3014689 + 53])
        seq_dict = {seqrec.id: seqrec.seq for seqrec in ali}
        correct_lengths = {
            "mm9.chr10": 53,
            "hg18.chr6": 53,
            "panTro2.chr6": 53,
            "calJac1.Contig6394": 53,
            "otoGar1.scaffold_334.1-359464": 52,
            "loxAfr1.scaffold_75566": 54,
        }
        # NOTE(review): Seq.ungap appears to be deprecated in recent Biopython
        # releases in favour of seq.replace("-", "") - confirm against the
        # Biopython version this test suite targets before changing it.
        for seq_id, length in correct_lengths.items():
            self.assertEqual(len(seq_dict[seq_id].ungap('-')), length)

    def test_correct_spliced_sequences_1(self):
        """Checking that spliced sequences are correct.

        We get the alignment corresponding to the following whole MAF block
        and check that the sequences are correct:

        a score=40840.000000
        s mm9.chr10 3014689 53 + 129993255 GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCT-TTGGAAAGAGTTG
        s hg18.chr6 15870832 53 - 170899992 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        i hg18.chr6 C 0 I 401
        s panTro2.chr6 16389401 53 - 173908612 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        q panTro2.chr6 9999999999999999999999999999999999999999-9999999999999
        i panTro2.chr6 C 0 I 400
        s calJac1.Contig6394 6228 53 + 133105 GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTT-TGGGAAACAGTGG
        i calJac1.Contig6394 C 0 I 2
        s otoGar1.scaffold_334.1-359464 184148 52 - 359464 GGAAGCATAAACT-TTTAATCTATGAAATATCAAATCACT-TGGGCAATAGCTG
        q otoGar1.scaffold_334.1-359464 7455455669566-99665699769895555689997599-9984787795599
        i otoGar1.scaffold_334.1-359464 I 2931 I 2
        s loxAfr1.scaffold_75566 1201 54 - 10574 GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG
        q loxAfr1.scaffold_75566 899989799999979999999999999999797999999999999999999999
        i loxAfr1.scaffold_75566 C 0 I 2
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I
        """
        ali = self.idx.get_spliced([3014689], [3014689 + 53])
        seq_dict = {seqrec.id: seqrec.seq for seqrec in ali}
        correct_sequences = {
            "mm9.chr10": "GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCTTTGGAAAGAGTTG",
            "hg18.chr6": "GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTTTGGGAAATAGTGG",
            "panTro2.chr6": "GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTTTGGGAAATAGTGG",
            "calJac1.Contig6394": "GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTTTGGGAAACAGTGG",
            "otoGar1.scaffold_334.1-359464": "GGAAGCATAAACTTTTAATCTATGAAATATCAAATCACTTGGGCAATAGCTG",
            "loxAfr1.scaffold_75566": "GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG",
        }
        for seq_id, sequence in correct_sequences.items():
            self.assertEqual(seq_dict[seq_id].ungap('-'), sequence)

    def test_correct_spliced_sequences_2(self):
        """Checking that spliced sequences are correct.

        We get spliced alignments from following MAF blocks
        and check that the sequences are correct:

        a score=19159.000000
        s mm9.chr10 3014644 45 + 129993255 CCTGTACC---CTTTGGTGAGAATTTTTGTTTCAGTGTTAAAAGTTTG
        s hg18.chr6 15870786 46 - 170899992 CCTATACCTTTCTTTTATGAGAA-TTTTGTTTTAATCCTAAAC-TTTT
        i hg18.chr6 I 9085 C 0
        s panTro2.chr6 16389355 46 - 173908612 CCTATACCTTTCTTTTATGAGAA-TTTTGTTTTAATCCTAAAC-TTTT
        q panTro2.chr6 99999999999999999999999-9999999999999999999-9999
        i panTro2.chr6 I 9106 C 0
        s calJac1.Contig6394 6182 46 + 133105 CCTATACCTTTCTTTCATGAGAA-TTTTGTTTGAATCCTAAAC-TTTT
        i calJac1.Contig6394 N 0 C 0
        s loxAfr1.scaffold_75566 1167 34 - 10574 ------------TTTGGTTAGAA-TTATGCTTTAATTCAAAAC-TTCC
        q loxAfr1.scaffold_75566 ------------99999699899-9999999999999869998-9997
        i loxAfr1.scaffold_75566 N 0 C 0
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e otoGar1.scaffold_334.1-359464 181217 2931 - 359464 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I

        a score=40840.000000
        s mm9.chr10 3014689 53 + 129993255 GGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCT-TTGGAAAGAGTTG
        s hg18.chr6 15870832 53 - 170899992 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        i hg18.chr6 C 0 I 401
        s panTro2.chr6 16389401 53 - 173908612 GGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTT-TGGGAAATAGTGG
        q panTro2.chr6 9999999999999999999999999999999999999999-9999999999999
        i panTro2.chr6 C 0 I 400
        s calJac1.Contig6394 6228 53 + 133105 GGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTT-TGGGAAACAGTGG
        i calJac1.Contig6394 C 0 I 2
        s otoGar1.scaffold_334.1-359464 184148 52 - 359464 GGAAGCATAAACT-TTTAATCTATGAAATATCAAATCACT-TGGGCAATAGCTG
        q otoGar1.scaffold_334.1-359464 7455455669566-99665699769895555689997599-9984787795599
        i otoGar1.scaffold_334.1-359464 I 2931 I 2
        s loxAfr1.scaffold_75566 1201 54 - 10574 GGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG
        q loxAfr1.scaffold_75566 899989799999979999999999999999797999999999999999999999
        i loxAfr1.scaffold_75566 C 0 I 2
        e tupBel1.scaffold_114895.1-498454 167376 4145 - 498454 I
        e echTel1.scaffold_288249 87661 7564 + 100002 I
        e ponAbe2.chr6 16161448 8044 - 174210431 I
        """
        ali = self.idx.get_spliced([3014644, 3014689],
                                   [3014644 + 45, 3014689 + 53])
        seq_dict = {seqrec.id: seqrec.seq for seqrec in ali}
        correct_sequences = {
            "mm9.chr10": "CCTGTACCCTTTGGTGAGAATTTTTGTTTCAGTGTTAAAAGTTTGGGGAGCATAAAACTCTAAATCTGCTAAATGTCTTGTCCCTTTGGAAAGAGTTG",
            "hg18.chr6": "CCTATACCTTTCTTTTATGAGAATTTTGTTTTAATCCTAAACTTTTGGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTTTGGGAAATAGTGG",
            "panTro2.chr6": "CCTATACCTTTCTTTTATGAGAATTTTGTTTTAATCCTAAACTTTTGGGATCATAAACCATTTAATCTGTGAAATATCTAATCTTTTGGGAAATAGTGG",
            "calJac1.Contig6394": "CCTATACCTTTCTTTCATGAGAATTTTGTTTGAATCCTAAACTTTTGGGATCATAAGCCATTTAATCTGTGAAATGTGAAATCTTTTGGGAAACAGTGG",
            "otoGar1.scaffold_334.1-359464": "GGAAGCATAAACTTTTAATCTATGAAATATCAAATCACTTGGGCAATAGCTG",
            "loxAfr1.scaffold_75566": "TTTGGTTAGAATTATGCTTTAATTCAAAACTTCCGGGAGTATAAACCATTTAGTCTGCGAAATGCCAAATCTTCAGGGGAAAAAGCTG",
        }
        for seq_id, sequence in correct_sequences.items():
            self.assertEqual(seq_dict[seq_id].ungap('-'), sequence)