class TestINDELAlleleGenerator():
    """Panel-generation tests for single insertion/deletion variants.

    ``self.pg`` is an AlleleGenerator built on ``BX571856.1.fasta`` and
    ``self.pg2`` one built on ``NC_000962.3.fasta``; both files are read
    from ``DATA_DIR``.
    """

    # NOTE(review): ``setup``/``teardown`` are nose-style hook names; pytest
    # 8 removed support for them on plain classes -- confirm the runner in
    # use still invokes them.

    def setup(self):
        """Reset the test database and create the shared fixtures."""
        DB.drop_database('mykrobe-test')
        self.pg = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/BX571856.1.fasta")
        self.pg2 = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/NC_000962.3.fasta")
        self.reference_set = ReferenceSet().create_and_save(name="ref_set")
        self.variant_set = VariantSet.create_and_save(
            name="this_vcf_file",
            reference_set=self.reference_set)
        self.variant_sets = [self.variant_set]
        self.reference = Reference().create_and_save(
            name="ref",
            md5checksum="sre",
            reference_sets=[self.reference_set])

    def teardown(self):
        """Drop the test database so no documents outlive the test."""
        DB.drop_database('mykrobe-test')

    def _variant(self, reference_bases, start, alternate_bases):
        """Create a Variant bound to this test's variant sets and reference."""
        return Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases=reference_bases,
            start=start,
            alternate_bases=alternate_bases)

    def test_simple_deletion1(self):
        """AA -> A at position 31 is a deletion with a length delta of 1."""
        v = self._variant("AA", 31, ["A"])
        assert v.is_indel
        assert v.is_deletion
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs
        assert self.pg._calculate_length_delta_from_indels(v, []) == 1
        assert panel.alts == [
            "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG"
        ]

    def test_simple_deletion2(self):
        """AT -> A at position 32."""
        v = self._variant("AT", 32, ["A"])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
        assert panel.alts == [
            "ATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGAT"
        ]

    def test_simple_deletion3(self):
        """AT -> T at position 2902618; expected panels are padded with N."""
        v = self._variant("AT", 2902618, ["T"])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" in panel.refs
        assert panel.alts == [
            "TTTATACTACTGCTCAATTTTTTTACTTTTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        ]

    def test_simple_deletion4(self):
        """ATC -> A at position 32 (two-base deletion)."""
        v = self._variant("ATC", 32, ["A"])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
        assert panel.alts == [
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT"
        ]

    def test_simple_insertion1(self):
        """C -> TTTC at position 1 is an insertion."""
        v = self._variant("C", 1, ["TTTC"])
        panel = self.pg.create(v)
        # assert_no_overlapping_kmers(panel) is skipped for variants in the
        # first k bases of the reference.
        assert v.is_indel
        assert v.is_insertion
        assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs
        assert panel.alts == ["TTTCGATTAAAGATAGAAATACACGATGCGAGC"]

    def test_simple_insertion2(self):
        """C -> CTTT at position 1."""
        v = self._variant("C", 1, ["CTTT"])
        panel = self.pg.create(v)
        # assert_no_overlapping_kmers(panel) is skipped for variants in the
        # first k bases of the reference.
        assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs
        assert panel.alts == ["CTTTGATTAAAGATAGAAATACACGATGCGAGCA"]

    def test_simple_insertion3(self):
        """A -> ATTT at position 31."""
        v = self._variant("A", 31, ["ATTT"])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs
        assert panel.alts == [
            "GATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTG"
        ]

    def test_simple_insertion4(self):
        """A -> AGGGG at position 32."""
        v = self._variant("A", 32, ["AGGGG"])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
        assert panel.alts == [
            "ATTAAAGATAGAAATACACGATGCGAGCAAGGGGTCAAATTTCATAACATCACCATGAGTTTGA"
        ]

    def test_simple_insertion5(self):
        """A -> ATGC at position 2902618; expected panels are padded with N."""
        v = self._variant("A", 2902618, ["ATGC"])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" in panel.refs
        assert panel.alts == [
            "TATACTACTGCTCAATTTTTTTACTTTTATGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        ]

    def test_double_insertion(self):
        """A context variant that overlaps the insertion is discarded."""
        v = self._variant("A", 4021408, ["ACGCTGGCGGGCG"])
        v1 = self._variant("AGA", 4021406, ["CGG"])
        context = [v1]
        assert self.pg2._remove_overlapping_contexts(v, [v1]) == []
        panel = self.pg2.create(v, context=context)
        assert_no_overlapping_kmers(panel)
        assert "ATCTAGCCGCAAGGGCGCGAGCAGACGCAGAATCGCATGATTTGAGCTCAAATCATGCGAT" in panel.refs
        assert panel.alts == [
            "TCTAGCCGCAAGGGCGCGAGCAGACGCAGACGCTGGCGGGCGATCGCATGATTTGAGCTCAAATCATGCGAT"
        ]

    def test_double_indel_fail(self):
        """The reference panel must never be emitted as an alternate."""
        v = self._variant("CCA", 2288851, ["A"])
        v1 = self._variant("A", 2288850, ["ACC"])
        context = [v1]
        panel = self.pg2.create(v, context=context)
        ref_panel = "GGCGCACACAATGATCGGTGGCAATACCGACCACATCGACCTCATCGACGCCGCGTTGCCG"
        assert ref_panel in panel.refs
        assert ref_panel not in panel.alts

    def test_large_insertion(self):
        """A long multi-base replacement at position 1636155, no context."""
        v = self._variant(
            "CCGCCGGCCCCGCCGTTT",
            1636155,
            [
                "CTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCG"
            ])
        panel = self.pg2.create(v, context=[])
        assert_no_overlapping_kmers(panel)
        assert "AGACCTAGCAGGGTGCCGGCGCCGCCCTTGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCAT" in panel.refs
        assert panel.alts == [
            "GACCTAGCAGGGTGCCGGCGCCGCCCTTGCTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCGCCATCGCCGATGATGTTTTCC"
        ]
class TestINDELandSNPSAlleleGenerator():
    """Panel-generation tests for indels combined with context variants.

    ``self.pg`` is an AlleleGenerator built on ``BX571856.1.fasta`` and
    ``self.pg2`` one built on ``NC_000962.3.fasta``; both files are read
    from ``DATA_DIR``.
    """

    # NOTE(review): ``setup``/``teardown`` are nose-style hook names; pytest
    # 8 removed support for them on plain classes -- confirm the runner in
    # use still invokes them.

    def setup(self):
        """Reset the test database and create the shared fixtures."""
        DB.drop_database('mykrobe-test')
        self.pg = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/BX571856.1.fasta")
        self.pg2 = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/NC_000962.3.fasta")
        self.reference_set = ReferenceSet().create_and_save(name="ref_set")
        self.variant_set = VariantSet.create_and_save(
            name="this_vcf_file",
            reference_set=self.reference_set)
        self.variant_sets = [self.variant_set]
        self.reference = Reference().create_and_save(
            name="ref",
            md5checksum="sre",
            reference_sets=[self.reference_set])

    def teardown(self):
        """Drop the test database so no documents outlive the test."""
        DB.drop_database('mykrobe-test')

    def _variant(self, reference_bases, start, alternate_bases):
        """Create a Variant bound to this test's variant sets and reference."""
        return Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases=reference_bases,
            start=start,
            alternate_bases=alternate_bases)

    def test_ins_with_SNP_context(self):
        """Insertion at 31 with a SNP at 32 as context."""
        v = self._variant("A", 31, ["ATTT"])
        v2 = self._variant("A", 32, ["T"])
        panel = self.pg.create(v, context=[v2])
        # NOTE: assert_no_overlapping_kmers(panel) is disabled here because
        # it fails intermittently.
        assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs
        assert sorted(panel.alts) == sorted([
            "GATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTG",
            "TTAAAGATAGAAATACACGATGCGAGCATTTTTCAAATTTCATAACATCACCATGAGTTTG",
        ])

    def test_del_with_SNP_context1(self):
        """Deletion at 31 with a SNP at 33 as context."""
        v = self._variant("AA", 31, ["A"])
        v2 = self._variant("T", 33, ["A"])
        panel = self.pg.create(v, context=[v2])
        assert_no_overlapping_kmers(panel)
        assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs
        assert sorted(panel.alts) == sorted([
            "ATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGA",
            "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG",
        ])

    def test_del_with_SNP_context2(self):
        """A SNP at 32 overlaps the deletion at 31 and is removed as context."""
        v = self._variant("AA", 31, ["A"])
        v2 = self._variant("A", 32, ["T"])
        panel = self.pg.create(v, context=[v2])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(v, [v2]) == []
        assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs
        assert sorted(panel.alts) == sorted([
            "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG",
        ])

    def test_del_with_ins_context1(self):
        """Deletion at 31 with a non-overlapping insertion at 4 as context."""
        v = self._variant("AAT", 31, ["A"])
        v2 = self._variant("T", 4, ["TTTT"])
        panel = self.pg.create(v, context=[v2])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2]
        assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs
        assert sorted(panel.alts) == sorted([
            "GATTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTGAT",
            "TTTTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTG",
        ])

    def test_del_with_ins_context2(self):
        """An insertion at 1 does not overlap but is dropped as not within k."""
        v = self._variant("ATC", 32, ["A"])
        v2 = self._variant("C", 1, ["CTTT"])
        panel = self.pg.create(v, context=[v2])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2]
        assert self.pg._remove_contexts_not_within_k(v, [v2]) == []
        assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
        assert sorted(panel.alts) == sorted([
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ])

    def test_del_with_ins_context3(self):
        """Deletion at 32 with a single-base insertion at 5 as context."""
        v = self._variant("ATC", 32, ["A"])
        v2 = self._variant("T", 5, ["TT"])
        panel = self.pg.create(v, context=[v2])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2]
        assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
        assert sorted(panel.alts) == sorted([
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ])

    def test_del_with_ins_context4(self):
        """Two alternative insertions at the same position (5) as context."""
        v = self._variant("ATC", 32, ["A"])
        v2 = self._variant("T", 5, ["TT"])
        v3 = self._variant("T", 5, ["TG"])
        panel = self.pg.create(v, context=[v2, v3])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(v, [v2, v3]) == [v2, v3]
        assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
        assert sorted(panel.alts) == sorted([
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "TTGAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ])

    def test_del_with_ins_context5(self):
        """Two insertions at adjacent positions (5 and 6) as context."""
        v = self._variant("ATC", 32, ["A"])
        v2 = self._variant("T", 5, ["TT"])
        v3 = self._variant("A", 6, ["AG"])
        panel = self.pg.create(v, context=[v2, v3])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(v, [v2, v3]) == [v2, v3]
        assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
        assert sorted(panel.alts) == sorted([
            "TTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGA",
            "TTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ])

    def test_del_with_ins_context_where_base_is_deleted1(self):
        """A SNP on a base removed by the deletion is discarded as context."""
        v = self._variant("ATC", 32, ["A"])
        v2 = self._variant("T", 33, ["C"])
        panel = self.pg.create(v, context=[v2])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(v, [v2]) == []
        assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
        assert sorted(panel.alts) == sorted([
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ])

    def test_del_with_ins_context_where_base_is_deleted2(self):
        """The panel must not depend on the order of the context variants."""
        v = self._variant("ATC", 32, ["A"])
        v2 = self._variant("TAAA", 5, ["T"])
        v3 = self._variant("A", 7, ["AG"])
        expected_alts = sorted([
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATC",
            "TTAAGAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ])
        # Same expectations for both orderings of the context list.
        for context in ([v2, v3], [v3, v2]):
            panel = self.pg.create(v, context=context)
            assert_no_overlapping_kmers(panel)
            assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs
            assert sorted(panel.alts) == expected_alts

    def test_snp_with_replace_context(self):
        """SNP at 2338961 with a same-length replacement as context."""
        v = self._variant("G", 2338961, ["A"])
        v1 = self._variant("GGATG", 2338990, ["CGATA"])
        panel = self.pg2.create(v, context=[v1])
        assert_no_overlapping_kmers(panel)
        assert "CGACTAGCCACCATCGCGCATCAGTGCGAGGTCAAAAGCGACCAAAGCGAGCAAGTCGCGG" in panel.refs
        assert set(panel.alts) == {
            "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCCG",
            "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCGG",
        }

    def test_indel_snp_indel_context(self):
        """Replacement at 4021459 with an insertion and a SNP as context."""
        v = self._variant("TCGCGTGGC", 4021459, ["GCGAGCAGA"])
        v1 = self._variant("A", 4021455, ["ATCTAGCCGCAAG"])
        v2 = self._variant("T", 4021489, ["G"])
        # First build the panel without any context to check the reference.
        panel = self.pg2.create(v)
        assert_no_overlapping_kmers(panel)
        assert "ATCATGCGATTCTGCGTCTGCTCGCGAGGCTCGCGTGGCCGCCGGCGCTGGCGGGCGATCT" in panel.refs
        # Then rebuild with both context variants and check the alternates.
        panel = self.pg2.create(v, context=[v1, v2])
        assert_no_overlapping_kmers(panel)
        assert sorted(panel.alts) == sorted([
            "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCG",
            "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCT",
            "TGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCG",
            "TGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCT",
        ])

    def test_complex_context(self):
        """Deletion at 1503643 with a deletion and an insertion as context."""
        v = self._variant("ATTT", 1503643, ["A"])
        v1 = self._variant("CCT", 1503615, ["C"])
        v2 = self._variant("A", 1503655, ["ATGCCGCCGCC"])
        panel = self.pg2.create(v, context=[v1, v2])
        assert_no_overlapping_kmers(panel)
        assert "ATCCTGGAGCCCACCAGCGGAAACACCGGCATTTCGCTGGCGATGGCGGCCCGGTTGAAGG" in panel.refs
        assert set(panel.alts) == {
            "CCATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGGT",
            "TCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGG",
            "ATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGG",
            "TCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGG",
        }