def test_make_variant_panel6(self):
    """Check the probe built for the pncA DNA-level variant CAG28TAA.

    The variant name is expected to resolve to a single reference-space
    change (CTG -> TTA at 2289212), and the single alt probe must equal the
    reference window with those three bases substituted.
    """
    generator = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31)
    # Gene lookup kept from the original test; its result is not used below.
    self.gm.get_gene("pncA")
    names = list(
        self.gm.get_variant_names("pncA", "CAG28TAA", protein_coding_var=False)
    )
    assert len(names) == 1

    ref_bases, position, alt_bases = split_var_name(names[0])
    assert ref_bases == "CTG"
    assert position == 2289212
    assert alt_bases == "TTA"

    variant = Variant.create(
        variant_sets=self.variant_sets,
        reference=self.reference_id,
        reference_bases=ref_bases,
        start=position,
        alternate_bases=[alt_bases],
    )
    panel = generator.create(variant)
    assert len(panel.alts) == 1
    alt_probe = panel.alts[0]
    ref_probe = panel.refs[0]

    # The panel ref/alt sequences run past the end of the gene, so they
    # cannot be compared against the gene sequence. Locate the probe in the
    # reference sequence and compare against that window instead.
    window_start = self.reference_seq.find(ref_probe)
    window_end = window_start + len(ref_probe)
    assert window_start < position < window_end
    window = str(self.reference_seq[window_start:window_end])
    assert window == ref_probe
    assert alt_probe == window[:30] + "TTA" + window[33:]
    DB.drop_database("mykrobe-test")
def test_make_variant_panel8(self):
    """Check the probe built for the eis DNA-level variant TG-1T.

    The variant name is expected to resolve to a single reference-space
    deletion (CA -> A at 2715332). The single alt probe must equal the
    reference window with the base at index 30 removed.
    """
    ag = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
    # Gene lookup kept from the original test; its result is not used below.
    self.gm.get_gene("eis")
    variants = list(
        self.gm.get_variant_names("eis", "TG-1T", protein_coding_var=False)
    )
    assert len(variants) == 1

    ref, start, alt = split_var_name(variants[0])
    assert ref == "CA"
    assert start == 2715332
    assert alt == "A"

    v = Variant.create(
        variant_sets=self.variant_sets,
        reference=self.reference_id,
        reference_bases=ref,
        start=start,
        alternate_bases=[alt],
    )
    panel = ag.create(v)
    assert len(panel.alts) == 1
    alt_probe = panel.alts[0]
    ref_probe = panel.refs[0]

    # The panel ref/alt sequences run past the end of the gene, so they
    # cannot be compared against the gene sequence. Locate the probe in the
    # reference sequence and compare against that window instead.
    panel_ref_start = self.reference_seq.find(ref_probe)
    panel_ref_end = panel_ref_start + len(ref_probe)
    assert panel_ref_start < start < panel_ref_end
    seq = str(self.reference_seq[panel_ref_start:panel_ref_end])
    assert seq == ref_probe
    # A leftover debug print of the unmodified window used to sit here; it
    # carried no information and only added noise to the test output.
    assert alt_probe == seq[:30] + seq[31:]
    DB.drop_database("mykrobe-test")
def setup(self):
    """Reset the test database and build the fixtures shared by the tests.

    Creates an allele generator over the NC_000962.3 reference (kmer=31),
    plus a reference set, a variant set and a reference record.

    NOTE(review): this is a nose-style ``setup`` hook; confirm the pytest
    version in use still calls it (``setup_method`` is the native name).
    """
    DB.drop_database("mykrobe-test")
    fasta_path = f"{DATA_DIR}/NC_000962.3.fasta"
    self.pg = AlleleGenerator(reference_filepath=fasta_path, kmer=31)

    ref_set = ReferenceSet().create_and_save(name="ref_set")
    self.reference_set = ref_set

    var_set = VariantSet.create_and_save(
        name="this_vcf_file",
        reference_set=ref_set,
    )
    self.variant_set = var_set
    self.variant_sets = [var_set]

    self.reference = Reference().create_and_save(
        name="ref",
        md5checksum="sre",
        reference_sets=[ref_set],
    )
def setup(self):
    """Reset the test database and build the fixtures shared by the tests.

    Creates an allele generator over the BX571856.1 reference with the
    generator's default k-mer size, plus a reference set, a variant set and
    a reference record.

    NOTE(review): this is a nose-style ``setup`` hook; confirm the pytest
    version in use still calls it (``setup_method`` is the native name).
    """
    DB.drop_database("mykrobe-test")
    fasta_path = "src/mykrobe/data/BX571856.1.fasta"
    self.pg = AlleleGenerator(reference_filepath=fasta_path)

    ref_set = ReferenceSet().create_and_save(name="ref_set")
    self.reference_set = ref_set

    var_set = VariantSet.create_and_save(
        name="this_vcf_file",
        reference_set=ref_set,
    )
    self.variant_set = var_set
    self.variant_sets = [var_set]

    self.reference = Reference().create_and_save(
        name="ref",
        md5checksum="sre",
        reference_sets=[ref_set],
    )
def test_make_variant_panel5(self):
    """Every alt probe for gyrA D94X must change codon 94 away from D.

    For each DNA variant produced for the amino-acid change, the reference
    probe is replaced by each alt probe inside the gene sequence and the
    translated residue at index 93 is checked.
    """
    generator = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
    gene = self.gm.get_gene("gyrA")
    for name in self.gm.get_variant_names("gyrA", "D94X"):
        ref_bases, position, alt_bases = split_var_name(name)
        variant = Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference_id,
            reference_bases=ref_bases,
            start=position,
            alternate_bases=[alt_bases],
        )
        panel = generator.create(variant)
        for alt_probe in panel.alts:
            mutated = str(gene.seq).replace(panel.refs[0], alt_probe)
            assert Seq(mutated).translate()[93] != "D"
    DB.drop_database("mykrobe-test")
def run(parser, args):
    """Write reference and alternate probes for non-singleton SNPs to stdout.

    Connects to the database ``<DB_PREFIX>-<args.db_name>`` at
    ``args.db_uri``, pages through the SNP variants whose ids are returned by
    ``get_non_singelton_variants`` (ordered by ``start``, 100 per page) and
    emits one FASTA record per reference probe and per alternate probe.

    Args:
        parser: Unused; kept so the signature stays compatible with callers.
        args: Parsed command-line namespace. Reads ``db_name``, ``db_uri``,
            ``verbose``, ``reference_filepath`` and ``kmer``.
    """
    db_name = "%s-%s" % (DB_PREFIX, args.db_name)
    DB = connect(db_name, host=args.db_uri)
    logger.setLevel(level=logging.DEBUG if args.verbose else logging.INFO)

    al = AlleleGenerator(
        reference_filepath=args.reference_filepath, kmer=args.kmer)
    _variant_ids = get_non_singelton_variants(db_name)
    total = Variant.snps(id__in=_variant_ids).count()
    N = 100  # page size
    pages = math.ceil(total / N)
    for page in range(pages):
        offset = page * N
        logger.info(
            "%i of %i - %f%%", offset, total, round(100 * offset / total, 2))
        page_of_variants = (
            Variant.snps(id__in=_variant_ids)
            .order_by("start")
            .skip(offset)
            .limit(N)
        )
        for variant in page_of_variants:
            variant_panel = make_variant_probe(al, variant, args.kmer, DB=DB)
            var_hash = variant_panel.variant.var_hash
            var_name = variant.var_name[:100]
            for i, ref in enumerate(variant_panel.refs):
                # The '&' before num_alts was missing, which fused var_name
                # and num_alts into a single query-string field.
                sys.stdout.write(
                    ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i\n"
                    % (
                        var_hash,
                        var_name,
                        len(variant_panel.alts),
                        variant_panel.variant.reference.id,
                        i,
                    )
                )
                sys.stdout.write("%s\n" % ref)
            for i, a in enumerate(variant_panel.alts):
                sys.stdout.write(
                    ">alt-%s?var_name=%s&enum=%i\n" % (var_hash, var_name, i))
                sys.stdout.write("%s\n" % a)
def test_make_variant_panel4(self):
    """Every alt probe for katG W90R must turn residue 90 into R.

    The substitution is applied to the reverse complement of ``gene.seq``
    and the result is reverse-complemented again before translation
    (presumably because katG lies on the reverse strand; verify).
    """
    ag = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
    gene = self.gm.get_gene("katG")
    for var in self.gm.get_variant_names("katG", "W90R"):
        ref, start, alt = split_var_name(var)
        v = Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference_id,
            reference_bases=ref,
            start=start,
            alternate_bases=[alt],
        )
        panel = ag.create(v)
        unmodified = str(gene.seq.reverse_complement())
        for alt_probe in panel.alts:
            seq = unmodified.replace(panel.refs[0], alt_probe)
            # Compare against the same strand the replacement was made on.
            # The previous check compared with the forward strand and so
            # could never detect a replacement that did nothing.
            assert seq != unmodified
            assert Seq(seq).reverse_complement().translate()[89] == "R"
    DB.drop_database("mykrobe-test")
def test_make_variant_panel7(self):
    """Check the probe for a DNA change upstream of a reverse-strand gene.

    The variant G-10A is written in "gene space": 10 bases upstream of eis
    the nucleotide on the reverse strand is G. That position is 2715342 in
    the genome, where the forward strand carries C, so in "reference space"
    the variant is C2715342T. The last nucleotide of the gene is the C at
    2715332; both positions lie in the forward-strand stretch
    CACAGAATCCGACTGTGGCATATGCCGC.
    """
    generator = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31)
    # Gene lookup kept from the original test; its result is not used below.
    self.gm.get_gene("eis")
    names = list(
        self.gm.get_variant_names("eis", "G-10A", protein_coding_var=False)
    )
    assert len(names) == 1

    ref_base, position, alt_base = split_var_name(names[0])
    assert ref_base == "C"
    assert position == 2715342
    assert alt_base == "T"

    variant = Variant.create(
        variant_sets=self.variant_sets,
        reference=self.reference_id,
        reference_bases=ref_base,
        start=position,
        alternate_bases=[alt_base],
    )
    panel = generator.create(variant)
    assert len(panel.alts) == 1
    alt_probe = panel.alts[0]
    ref_probe = panel.refs[0]

    # The panel ref/alt sequences run past the end of the gene, so they
    # cannot be compared against the gene sequence. Locate the probe in the
    # reference sequence and compare against that window instead.
    window_start = self.reference_seq.find(ref_probe)
    window_end = window_start + len(ref_probe)
    assert window_start < position < window_end
    window = str(self.reference_seq[window_start:window_end])
    assert window == ref_probe
    assert alt_probe == window[:30] + "T" + window[31:]
    DB.drop_database("mykrobe-test")
def test_make_variant_panel1(self):
    """Every alt probe for rpoB D3A must turn residue 3 from D into A.

    Only the tail of each probe is substituted into the gene sequence
    (``refs[0][25:]`` replaced by ``alt[24:]``), as in the original test.
    """
    generator = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta")
    gene = self.gm.get_gene("rpoB")
    for name in self.gm.get_variant_names("rpoB", "D3A"):
        ref_bases, position, alt_bases = split_var_name(name)
        variant = Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference_id,
            reference_bases=ref_bases,
            start=position,
            alternate_bases=[alt_bases],
        )
        panel = generator.create(variant)
        for alt_probe in panel.alts:
            original = str(gene.seq)
            assert Seq(original).translate()[2] == "D"
            mutated = original.replace(panel.refs[0][25:], alt_probe[24:])
            assert mutated != str(gene.seq)
            assert Seq(mutated).translate()[2] == "A"
    DB.drop_database("mykrobe-test")
def test_make_variant_panel2(self):
    """Every alt probe for katG E3A must turn residue 3 into A.

    The substitution is applied to the reverse complement of ``gene.seq``
    and the result is reverse-complemented again before translation
    (presumably because katG lies on the reverse strand; verify). Only
    the leading part of each probe is substituted, with the alt slice
    adjusted by the length difference between the alt and ref probes.
    """
    ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31)
    gene = self.gm.get_gene("katG")
    for var in self.gm.get_variant_names("katG", "E3A"):
        ref, start, alt = split_var_name(var)
        v = Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference_id,
            reference_bases=ref,
            start=start,
            alternate_bases=[alt],
        )
        panel = ag.create(v)
        ref_probe = panel.refs[0]
        unmodified = str(gene.seq.reverse_complement())
        for alt_probe in panel.alts:
            alt_end = 39 + len(alt_probe) - len(ref_probe)
            seq = unmodified.replace(ref_probe[:39], alt_probe[:alt_end])
            # Compare against the same strand the replacement was made on.
            # The previous check compared with the forward strand and so
            # could never detect a replacement that did nothing.
            assert seq != unmodified
            assert Seq(seq).reverse_complement().translate()[2] == "A"
    DB.drop_database("mykrobe-test")
class TestSNPAlleleGenerator:
    """Probe-generation tests for SNPs on the BX571856.1 reference (kmer=31)."""

    def setup(self):
        """Reset the test database and create the shared fixtures."""
        DB.drop_database("mykrobe-test")
        self.pg = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/BX571856.1.fasta", kmer=31)
        self.reference_set = ReferenceSet().create_and_save(name="ref_set")
        self.variant_set = VariantSet.create_and_save(
            name="this_vcf_file", reference_set=self.reference_set)
        self.variant_sets = [self.variant_set]
        self.reference = Reference().create_and_save(
            name="ref", md5checksum="sre", reference_sets=[self.reference_set])

    def _variant(self, reference_bases, start, alternate_base):
        """Create a single-alt Variant against the fixture reference."""
        return Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases=reference_bases,
            start=start,
            alternate_bases=[alternate_base],
        )

    def test_panel_generator(self):
        pg = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/BX571856.1.fasta", kmer=31)
        assert pg.ref is not None

    def test_simple_snp_variant(self):
        v = self._variant("A", 31, "T")
        panel = self.pg.create(v)
        assert panel.refs[0][:31] != panel.alts[0][:31]
        assert panel.refs[0][-32:] != panel.alts[0][-32:]
        assert panel.refs[0][-31:] != panel.alts[0][-31:]
        assert_no_overlapping_kmers(panel)
        assert panel.refs == [
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG"
        ]
        assert panel.alts == [
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG"
        ]
        assert self.pg._calculate_length_delta_from_indels(v, []) == 0
        assert v.is_indel is False

    def test_simple_variant2(self):
        v = self._variant("A", 32, "T")
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert panel.refs == [
            "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA"
        ]
        assert panel.alts == [
            "GATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGA"
        ]

    def test_simple_variant_invalid(self):
        # ref == alt is not a variant; a ValueError is expected from either
        # the Variant construction or the probe generation.
        with pytest.raises(ValueError):
            v = self._variant("T", 31, "T")
            self.pg.create(v)

    def test_simple_variant_start(self):
        v = self._variant("C", 1, "T")
        panel = self.pg.create(v)
        # assert_no_overlapping_kmers(panel) is deliberately not called here:
        # per the original note, a SNP this close to the start of the
        # reference can produce overlapping kmers.
        assert panel.refs == [
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG"
        ]
        assert panel.alts == [
            "TGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG"
        ]

    def test_simple_variant_end(self):
        v = self._variant("A", 2902618, "T")
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert panel.refs == [
            "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        ]
        assert panel.alts == [
            "TTTATACTACTGCTCAATTTTTTTACTTTTTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        ]
        v = self._variant("T", 2902616, "C")
        panel = self.pg.create(v)
        assert panel.refs == [
            "ATTTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        ]
        assert panel.alts == [
            "ATTTTATACTACTGCTCAATTTTTTTACTTCTATNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        ]

    def test_simple_variant_with_nearby_snp(self):
        v = self._variant("A", 31, "T")
        v2 = self._variant("A", 32, "T")
        panel = self.pg.create(v, context=[v2])
        assert_no_overlapping_kmers(panel)
        assert set(panel.refs) == {
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG",
        }
        assert set(panel.alts) == {
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG",
        }

    def test_simple_variant_with_two_nearby_snps(self):
        # This test used to share its name with a later method in this class,
        # so Python silently discarded it and it never ran. It has been given
        # a unique name.
        # NOTE(review): its expectations were 63-base probes, while every
        # other test in this class (kmer=31) expects 61-base probes; they
        # were trimmed to 61 bases and made order-insensitive. Confirm
        # against a real run.
        v = self._variant("A", 31, "T")
        v2 = self._variant("A", 32, "T")
        v3 = self._variant("C", 30, "G")
        panel = self.pg.create(v, context=[v2, v3])
        assert_no_overlapping_kmers(panel)
        assert sorted(panel.refs) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTG",
        ])
        assert sorted(panel.alts) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTG",
        ])

    def test_simple_variant_with_multiple_nearby_snps2(self):
        v = self._variant("A", 31, "T")
        v2 = self._variant("A", 32, "T")
        v3 = self._variant("C", 30, "G")
        v4 = self._variant("C", 30, "T")
        v5 = self._variant("C", 30, "A")
        assert sorted(self.pg._split_context([v, v3, v4])) == sorted(
            [[v, v4], [v, v3]])
        assert self.pg._split_context([v3, v4]) == [[v4], [v3]]
        assert self.pg._split_context([v, v3, v4, v5]) == [
            [v, v4, v5], [v, v3, v5]]
        panel = self.pg.create(v, context=[v2, v3, v4, v5])
        assert_no_overlapping_kmers(panel)
        assert sorted(panel.refs) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGAAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGAATTCAAATTTCATAACATCACCATGAGTTTG",
        ])
        assert sorted(panel.alts) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGATATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGATTTCAAATTTCATAACATCACCATGAGTTTG",
        ])

    def test_simple_variant_with_multiple_nearby_snps(self):
        v = self._variant("A", 31, "T")
        v2 = self._variant("A", 32, "T")
        v5 = self._variant("A", 32, "G")
        v3 = self._variant("C", 30, "G")
        v4 = self._variant("C", 30, "T")
        panel = self.pg.create(v, context=[v2, v3, v4, v5])
        assert_no_overlapping_kmers(panel)
        assert sorted(panel.refs) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCAGTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGAGTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTAGTCAAATTTCATAACATCACCATGAGTTTG",
        ])
        assert sorted(panel.alts) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGCTGTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGGTGTCAAATTTCATAACATCACCATGAGTTTG",
            "CGATTAAAGATAGAAATACACGATGCGAGTTGTCAAATTTCATAACATCACCATGAGTTTG",
        ])
class TestINDELAlleleGenerator:
    """Probe-generation tests for insertions and deletions.

    ``self.pg`` works on the BX571856.1 reference and ``self.pg2`` on the
    NC_000962.3 reference, both with the generator's default k-mer size.
    """

    # Reference probes that several tests expect to find in ``panel.refs``.
    _PROBE_CGATT = "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG"
    _PROBE_GATT = "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA"
    _PROBE_N_PADDED = "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"

    def setup(self):
        """Reset the test database and create the shared fixtures."""
        DB.drop_database("mykrobe-test")
        self.pg = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/BX571856.1.fasta")
        self.pg2 = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/NC_000962.3.fasta")
        ref_set = ReferenceSet().create_and_save(name="ref_set")
        self.reference_set = ref_set
        var_set = VariantSet.create_and_save(
            name="this_vcf_file", reference_set=ref_set)
        self.variant_set = var_set
        self.variant_sets = [var_set]
        self.reference = Reference().create_and_save(
            name="ref", md5checksum="sre", reference_sets=[ref_set])

    def _variant(self, reference_bases, start, alternate_bases):
        """Create a single-alt Variant against the fixture reference."""
        return Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases=reference_bases,
            start=start,
            alternate_bases=[alternate_bases],
        )

    def test_simple_deletion1(self):
        deletion = self._variant("AA", 31, "A")
        assert deletion.is_indel
        assert deletion.is_deletion
        panel = self.pg.create(deletion)
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_CGATT in panel.refs
        assert self.pg._calculate_length_delta_from_indels(deletion, []) == 1
        assert panel.alts == [
            "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG"
        ]

    def test_simple_deletion2(self):
        deletion = self._variant("AT", 32, "A")
        panel = self.pg.create(deletion)
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_GATT in panel.refs
        assert panel.alts == [
            "ATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGAT"
        ]

    def test_simple_deletion3(self):
        deletion = self._variant("AT", 2902618, "T")
        panel = self.pg.create(deletion)
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_N_PADDED in panel.refs
        assert panel.alts == [
            "TTTATACTACTGCTCAATTTTTTTACTTTTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        ]

    def test_simple_deletion4(self):
        deletion = self._variant("ATC", 32, "A")
        panel = self.pg.create(deletion)
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_GATT in panel.refs
        assert panel.alts == [
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT"
        ]

    def test_simple_insertion1(self):
        insertion = self._variant("C", 1, "TTTC")
        panel = self.pg.create(insertion)
        # The overlapping-kmer check is skipped for variants in the first
        # k bases of the reference.
        assert insertion.is_indel
        assert insertion.is_insertion
        assert self._PROBE_CGATT in panel.refs
        assert panel.alts == ["TTTCGATTAAAGATAGAAATACACGATGCGAGC"]

    def test_simple_insertion2(self):
        insertion = self._variant("C", 1, "CTTT")
        panel = self.pg.create(insertion)
        # The overlapping-kmer check is skipped for variants in the first
        # k bases of the reference.
        assert self._PROBE_CGATT in panel.refs
        assert panel.alts == ["CTTTGATTAAAGATAGAAATACACGATGCGAGCA"]

    def test_simple_insertion3(self):
        insertion = self._variant("A", 31, "ATTT")
        panel = self.pg.create(insertion)
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_CGATT in panel.refs
        assert panel.alts == [
            "GATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTG"
        ]

    def test_simple_insertion4(self):
        insertion = self._variant("A", 32, "AGGGG")
        panel = self.pg.create(insertion)
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_GATT in panel.refs
        assert panel.alts == [
            "ATTAAAGATAGAAATACACGATGCGAGCAAGGGGTCAAATTTCATAACATCACCATGAGTTTGA"
        ]

    def test_simple_insertion5(self):
        insertion = self._variant("A", 2902618, "ATGC")
        panel = self.pg.create(insertion)
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_N_PADDED in panel.refs
        assert panel.alts == [
            "TATACTACTGCTCAATTTTTTTACTTTTATGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        ]

    def test_double_insertion(self):
        insertion = self._variant("A", 4021408, "ACGCTGGCGGGCG")
        neighbour = self._variant("AGA", 4021406, "CGG")
        context = [neighbour]
        # The neighbour overlaps the insertion, so it is dropped from context.
        assert self.pg2._remove_overlapping_contexts(insertion, [neighbour]) == []
        panel = self.pg2.create(insertion, context=context)
        assert_no_overlapping_kmers(panel)
        assert (
            "ATCTAGCCGCAAGGGCGCGAGCAGACGCAGAATCGCATGATTTGAGCTCAAATCATGCGAT"
            in panel.refs
        )
        assert panel.alts == [
            "TCTAGCCGCAAGGGCGCGAGCAGACGCAGACGCTGGCGGGCGATCGCATGATTTGAGCTCAAATCATGCGAT"
        ]

    def test_double_indel_fail(self):
        deletion = self._variant("CCA", 2288851, "A")
        neighbour = self._variant("A", 2288850, "ACC")
        context = [neighbour]
        panel = self.pg2.create(deletion, context=context)
        ref_probe = "GGCGCACACAATGATCGGTGGCAATACCGACCACATCGACCTCATCGACGCCGCGTTGCCG"
        # The reference probe must never be emitted as an alternate.
        assert ref_probe in panel.refs
        assert ref_probe not in panel.alts

    def test_large_insertion(self):
        insertion = self._variant(
            "CCGCCGGCCCCGCCGTTT",
            1636155,
            "CTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCG",
        )
        panel = self.pg2.create(insertion, context=[])
        assert_no_overlapping_kmers(panel)
        assert (
            "AGACCTAGCAGGGTGCCGGCGCCGCCCTTGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCAT"
            in panel.refs
        )
        assert panel.alts == [
            "GACCTAGCAGGGTGCCGGCGCCGCCCTTGCTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCGCCATCGCCGATGATGTTTTCC"
        ]
class TestINDELandSNPSAlleleGenerator:
    """Probe-generation tests for indels and SNPs combined with context variants.

    ``self.pg`` works on the BX571856.1 reference and ``self.pg2`` on the
    NC_000962.3 reference, both with the generator's default k-mer size.
    """

    # Reference probes that several tests expect to find in ``panel.refs``.
    _PROBE_CGATT = "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG"
    _PROBE_GATT = "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA"

    def setup(self):
        """Reset the test database and create the shared fixtures."""
        DB.drop_database("mykrobe-test")
        self.pg = AlleleGenerator(
            reference_filepath="src/mykrobe/data/BX571856.1.fasta")
        self.pg2 = AlleleGenerator(
            reference_filepath="src/mykrobe/data/NC_000962.3.fasta")
        ref_set = ReferenceSet().create_and_save(name="ref_set")
        self.reference_set = ref_set
        var_set = VariantSet.create_and_save(
            name="this_vcf_file", reference_set=ref_set)
        self.variant_set = var_set
        self.variant_sets = [var_set]
        self.reference = Reference().create_and_save(
            name="ref", md5checksum="sre", reference_sets=[ref_set])

    def teardown(self):
        """Drop the test database after each test."""
        DB.drop_database("mykrobe-test")

    def _variant(self, reference_bases, start, alternate_bases):
        """Create a single-alt Variant against the fixture reference."""
        return Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases=reference_bases,
            start=start,
            alternate_bases=[alternate_bases],
        )

    def test_ins_with_SNP_context(self):
        insertion = self._variant("A", 31, "ATTT")
        snp = self._variant("A", 32, "T")
        panel = self.pg.create(insertion, context=[snp])
        # assert_no_overlapping_kmers(panel) is deliberately not called: the
        # original author noted that it fails intermittently for this case.
        assert self._PROBE_CGATT in panel.refs
        expected_alts = [
            "GATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTG",
            "TTAAAGATAGAAATACACGATGCGAGCATTTTTCAAATTTCATAACATCACCATGAGTTTG",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_SNP_context1(self):
        deletion = self._variant("AA", 31, "A")
        snp = self._variant("T", 33, "A")
        panel = self.pg.create(deletion, context=[snp])
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_CGATT in panel.refs
        expected_alts = [
            "ATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGA",
            "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_SNP_context2(self):
        deletion = self._variant("AA", 31, "A")
        snp = self._variant("A", 32, "T")
        panel = self.pg.create(deletion, context=[snp])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(deletion, [snp]) == []
        assert self._PROBE_CGATT in panel.refs
        expected_alts = [
            "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_ins_context1(self):
        deletion = self._variant("AAT", 31, "A")
        insertion = self._variant("T", 4, "TTTT")
        panel = self.pg.create(deletion, context=[insertion])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(
            deletion, [insertion]) == [insertion]
        assert self._PROBE_CGATT in panel.refs
        expected_alts = [
            "GATTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTGAT",
            "TTTTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTG",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_ins_context2(self):
        deletion = self._variant("ATC", 32, "A")
        insertion = self._variant("C", 1, "CTTT")
        panel = self.pg.create(deletion, context=[insertion])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(
            deletion, [insertion]) == [insertion]
        assert self.pg._remove_contexts_not_within_k(
            deletion, [insertion]) == []
        assert self._PROBE_GATT in panel.refs
        expected_alts = [
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_ins_context3(self):
        deletion = self._variant("ATC", 32, "A")
        insertion = self._variant("T", 5, "TT")
        panel = self.pg.create(deletion, context=[insertion])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(
            deletion, [insertion]) == [insertion]
        assert self._PROBE_GATT in panel.refs
        expected_alts = [
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_ins_context4(self):
        deletion = self._variant("ATC", 32, "A")
        ins_tt = self._variant("T", 5, "TT")
        ins_tg = self._variant("T", 5, "TG")
        panel = self.pg.create(deletion, context=[ins_tt, ins_tg])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(
            deletion, [ins_tt, ins_tg]) == [ins_tt, ins_tg]
        assert self._PROBE_GATT in panel.refs
        expected_alts = [
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "TTGAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_ins_context5(self):
        deletion = self._variant("ATC", 32, "A")
        ins_at_5 = self._variant("T", 5, "TT")
        ins_at_6 = self._variant("A", 6, "AG")
        panel = self.pg.create(deletion, context=[ins_at_5, ins_at_6])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(
            deletion, [ins_at_5, ins_at_6]) == [ins_at_5, ins_at_6]
        assert self._PROBE_GATT in panel.refs
        expected_alts = [
            "TTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGA",
            "TTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_ins_context_where_base_is_deleted1(self):
        deletion = self._variant("ATC", 32, "A")
        snp = self._variant("T", 33, "C")
        panel = self.pg.create(deletion, context=[snp])
        assert_no_overlapping_kmers(panel)
        assert self.pg._remove_overlapping_contexts(deletion, [snp]) == []
        assert self._PROBE_GATT in panel.refs
        expected_alts = [
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_del_with_ins_context_where_base_is_deleted2(self):
        deletion = self._variant("ATC", 32, "A")
        ctx_deletion = self._variant("TAAA", 5, "T")
        ctx_insertion = self._variant("A", 7, "AG")
        expected_alts = [
            "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATC",
            "TTAAGAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT",
        ]

        panel = self.pg.create(deletion, context=[ctx_deletion, ctx_insertion])
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_GATT in panel.refs
        assert sorted(panel.alts) == sorted(expected_alts)

        # The same result is expected with the context order reversed.
        panel = self.pg.create(deletion, context=[ctx_insertion, ctx_deletion])
        assert_no_overlapping_kmers(panel)
        assert self._PROBE_GATT in panel.refs
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_snp_with_replace_context(self):
        snp = self._variant("G", 2338961, "A")
        replacement = self._variant("GGATG", 2338990, "CGATA")
        panel = self.pg2.create(snp, context=[replacement])
        assert_no_overlapping_kmers(panel)
        assert (
            "CGACTAGCCACCATCGCGCATCAGTGCGAGGTCAAAAGCGACCAAAGCGAGCAAGTCGCGG"
            in panel.refs
        )
        assert set(panel.alts) == {
            "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCCG",
            "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCGG",
        }

    def test_indel_snp_indel_context(self):
        replacement = self._variant("TCGCGTGGC", 4021459, "GCGAGCAGA")
        ctx_insertion = self._variant("A", 4021455, "ATCTAGCCGCAAG")
        ctx_snp = self._variant("T", 4021489, "G")

        # First without any context: only the reference probe is checked.
        panel = self.pg2.create(replacement)
        assert_no_overlapping_kmers(panel)
        assert (
            "ATCATGCGATTCTGCGTCTGCTCGCGAGGCTCGCGTGGCCGCCGGCGCTGGCGGGCGATCT"
            in panel.refs
        )

        # Then with both context variants: check the full set of alternates.
        panel = self.pg2.create(replacement, context=[ctx_insertion, ctx_snp])
        assert_no_overlapping_kmers(panel)
        expected_alts = [
            "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCG",
            "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCT",
            "TGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCG",
            "TGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCT",
        ]
        assert sorted(panel.alts) == sorted(expected_alts)

    def test_complex_context(self):
        deletion = self._variant("ATTT", 1503643, "A")
        ctx_deletion = self._variant("CCT", 1503615, "C")
        ctx_insertion = self._variant("A", 1503655, "ATGCCGCCGCC")
        panel = self.pg2.create(deletion, context=[ctx_deletion, ctx_insertion])
        assert_no_overlapping_kmers(panel)
        assert (
            "ATCCTGGAGCCCACCAGCGGAAACACCGGCATTTCGCTGGCGATGGCGGCCCGGTTGAAGG"
            in panel.refs
        )
        assert set(panel.alts) == {
            "CCATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGGT",
            "TCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGG",
            "ATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGG",
            "TCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGG",
        }
def test_panel_generator(self):
    """An AlleleGenerator built from the BX571856.1 FASTA exposes a non-None ``ref``."""
    generator = AlleleGenerator(
        reference_filepath="src/mykrobe/data/BX571856.1.fasta",
    )
    loaded_reference = generator.ref
    assert loaded_reference is not None
class TestSNPAlleleGenerator():
    """SNP probe-generation tests against the BX571856.1 reference FASTA.

    Each test builds one or more single-base ``Variant`` objects and checks
    the reference/alternate probe sequences that ``AlleleGenerator.create``
    returns, optionally with neighbouring variants supplied as ``context``.
    """

    def setup(self):
        """Reset the test database and create the fixtures shared by all tests."""
        DB.drop_database('mykrobe-test')
        self.pg = AlleleGenerator(
            reference_filepath="src/mykrobe/data/BX571856.1.fasta")
        self.reference_set = ReferenceSet().create_and_save(name="ref_set")
        self.variant_set = VariantSet.create_and_save(
            name="this_vcf_file",
            reference_set=self.reference_set)
        self.variant_sets = [self.variant_set]
        self.reference = Reference().create_and_save(
            name="ref",
            md5checksum="sre",
            reference_sets=[self.reference_set])

    def test_panel_generator(self):
        """The generator exposes a non-None ``ref`` after construction."""
        pg = AlleleGenerator(
            reference_filepath="src/mykrobe/data/BX571856.1.fasta")
        assert pg.ref is not None

    def test_simple_variant(self):
        """A single A->T SNP at position 31 yields one ref and one alt probe."""
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="A",
                           start=31,
                           alternate_bases=["T"])
        panel = self.pg.create(v)
        assert panel.refs == [
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT"
        ]
        assert panel.alts == [
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT"
        ]
        assert self.pg._calculate_length_delta_from_indels(v, []) == 0
        assert v.is_indel is False

    def test_simple_variant2(self):
        """A single A->T SNP at position 32 yields one ref and one alt probe."""
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="A",
                           start=32,
                           alternate_bases=["T"])
        panel = self.pg.create(v)
        assert panel.refs == [
            "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC"
        ]
        assert panel.alts == [
            "GATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGATC"
        ]

    def test_simple_variant_invalid(self):
        """A variant whose ref and alt base are both ``T`` at position 31 raises ValueError."""
        # NOTE(review): both calls stay inside the context manager because it
        # is not visible from this file which of them raises.
        with pytest.raises(ValueError):
            v = Variant.create(variant_sets=self.variant_sets,
                               reference=self.reference,
                               reference_bases="T",
                               start=31,
                               alternate_bases=["T"])
            self.pg.create(v)

    def test_simple_variant_start(self):
        """A SNP at position 1 still yields full-length probes."""
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="C",
                           start=1,
                           alternate_bases=["T"])
        panel = self.pg.create(v)
        assert panel.refs == [
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT"
        ]
        assert panel.alts == [
            "TGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT"
        ]

    def test_simple_variant_end(self):
        """SNPs at positions 2902618 and 2902616 share the same reference probe."""
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="A",
                           start=2902618,
                           alternate_bases=["T"])
        panel = self.pg.create(v)
        assert panel.refs == [
            "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTAT"
        ]
        assert panel.alts == [
            "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTTT"
        ]
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="T",
                           start=2902616,
                           alternate_bases=["C"])
        panel = self.pg.create(v)
        assert panel.refs == [
            "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTAT"
        ]
        assert panel.alts == [
            "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTCTAT"
        ]

    def test_simple_variant_with_nearby_snp(self):
        """One context SNP at the adjacent position doubles the ref and alt probes."""
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="A",
                           start=31,
                           alternate_bases=["T"])
        v2 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="A",
                            start=32,
                            alternate_bases=["T"])
        panel = self.pg.create(v, context=[v2])
        assert panel.refs == [
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT"
        ]
        assert panel.alts == [
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT"
        ]

    def test_simple_variant_with_multiple_nearby_snps(self):
        """Two context SNPs on either side give four ref and four alt probes."""
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="A",
                           start=31,
                           alternate_bases=["T"])
        v2 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="A",
                            start=32,
                            alternate_bases=["T"])
        v3 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="C",
                            start=30,
                            alternate_bases=["G"])
        panel = self.pg.create(v, context=[v2, v3])
        assert panel.refs == [
            'CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT',
            'CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT',
            'CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTGAT',
            'CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTGAT'
        ]
        assert panel.alts == [
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTGAT"
        ]

    def test_simple_variant_with_multiple_nearby_snps2(self):
        """Three alternative SNPs at position 30 plus one at 32 give eight probes each.

        Also checks how ``_split_context`` separates variants that share a
        start position into distinct context lists.
        """
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="A",
                           start=31,
                           alternate_bases=["T"])
        v2 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="A",
                            start=32,
                            alternate_bases=["T"])
        v3 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="C",
                            start=30,
                            alternate_bases=["G"])
        v4 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="C",
                            start=30,
                            alternate_bases=["T"])
        v5 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="C",
                            start=30,
                            alternate_bases=["A"])
        assert sorted(self.pg._split_context([v, v3, v4])) == sorted([[v, v4], [v, v3]])
        assert (self.pg._split_context([v3, v4])) == [[v4], [v3]]
        assert (self.pg._split_context([v, v3, v4, v5])) == [[v, v4, v5], [v, v3, v5]]
        panel = self.pg.create(v, context=[v2, v3, v4, v5])
        assert sorted(panel.refs) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGAAATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGAATTCAAATTTCATAACATCACCATGAGTTTGAT"
        ])
        assert sorted(panel.alts) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGATATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGATTTCAAATTTCATAACATCACCATGAGTTTGAT"
        ])

    # Previously this method reused the name
    # ``test_simple_variant_with_multiple_nearby_snps``; the later definition
    # replaced the earlier one in the class namespace, so the earlier test
    # never ran. It has a distinct name now so both are collected.
    def test_simple_variant_with_multiple_nearby_snps3(self):
        """Two alternative SNPs at position 30 and two at 32 give nine probes each."""
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference,
                           reference_bases="A",
                           start=31,
                           alternate_bases=["T"])
        v2 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="A",
                            start=32,
                            alternate_bases=["T"])
        v5 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="A",
                            start=32,
                            alternate_bases=["G"])
        v3 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="C",
                            start=30,
                            alternate_bases=["G"])
        v4 = Variant.create(variant_sets=self.variant_sets,
                            reference=self.reference,
                            reference_bases="C",
                            start=30,
                            alternate_bases=["T"])
        panel = self.pg.create(v, context=[v2, v3, v4, v5])
        assert sorted(panel.refs) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCAGTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGAGTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTAGTCAAATTTCATAACATCACCATGAGTTTGAT"
        ])
        assert sorted(panel.alts) == sorted([
            "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGCTGTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGGTGTCAAATTTCATAACATCACCATGAGTTTGAT",
            "CGATTAAAGATAGAAATACACGATGCGAGTTGTCAAATTTCATAACATCACCATGAGTTTGAT"
        ])
def test_panel_generator(self):
    """An AlleleGenerator built with ``kmer=31`` from the BX571856.1 FASTA exposes a non-None ``ref``."""
    fasta_path = f"{DATA_DIR}/BX571856.1.fasta"
    generator = AlleleGenerator(reference_filepath=fasta_path, kmer=31)
    loaded_reference = generator.ref
    assert loaded_reference is not None
def run(parser, args):
    """Build probes for the requested variants and write them to stdout.

    Mutations are gathered from exactly one source, checked in this order:

    * ``args.vcf`` - delegated entirely to ``run_make_probes_from_vcf_file``;
      no mutations are collected here, so nothing further is printed.
    * ``args.genbank`` - gene-level mutations (rows of ``args.text_file`` as
      ``gene<TAB>mutation<TAB>alphabet``, or ``GENE_MUTATION`` strings in
      ``args.variants``) are expanded through
      ``GeneAminoAcidChangeToDNAVariants``.
    * otherwise - ``args.text_file`` is loaded with
      ``load_dna_vars_txt_file`` (and the lineage data it returns is dumped
      as JSON to ``args.lineage`` when given), or ``args.variants`` are used
      as variant names directly.

    For every mutation whose probe panel could be built, each reference
    probe is written as a ``>ref-...`` header line followed by its sequence,
    then each alternate probe as an ``>alt-...`` record. Mutations for which
    ``make_variant_probe`` returns ``None`` are reported with a warning.

    Args:
        parser: Not used by this function.
        args: Parsed command-line namespace. Attributes read:
            ``no_backgrounds``, ``db_name``, ``reference_filepath``, ``vcf``,
            ``genbank``, ``text_file``, ``variants``, ``lineage``, ``kmer``.
    """
    # There's no need to try to connect to database if we're not doing backgrounds
    if args.no_backgrounds:
        logger.info(
            "Not connecting to database, because --no-backgrounds option used")
        DB = None
    else:
        DB = connect("%s-%s" % (DB_PREFIX, args.db_name))

    if DB is not None:
        try:
            Variant.objects()
            logger.info("Connected to %s-%s", DB_PREFIX, args.db_name)
        except ServerSelectionTimeoutError:
            DB = None
            logger.warning(
                "Could not connect to database. Continuing without using genetic backgrounds"
            )

    mutations = []
    reference = os.path.basename(args.reference_filepath).split(".fa")[0]
    if args.vcf:
        run_make_probes_from_vcf_file(args)
    elif args.genbank:
        aa2dna = GeneAminoAcidChangeToDNAVariants(args.reference_filepath,
                                                  args.genbank)
        if args.text_file:
            with open(args.text_file, "r") as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    gene, mutation_string, alphabet = row
                    # Anything not flagged as "DNA" is treated as protein-coding.
                    protein_coding_var = alphabet != "DNA"
                    for var_name in aa2dna.get_variant_names(
                            gene, mutation_string, protein_coding_var):
                        mutations.append(
                            Mutation(
                                reference=reference,
                                var_name=var_name,
                                gene=aa2dna.get_gene(gene),
                                mut=mutation_string,
                                protein_coding_var=protein_coding_var,
                            ))
        else:
            for variant in args.variants:
                gene, mutation = variant.split("_")
                for var_name in aa2dna.get_variant_names(gene, mutation):
                    mutations.append(
                        Mutation(
                            reference=reference,
                            var_name=var_name,
                            gene=gene,
                            mut=mutation,
                        ))
    elif args.text_file:
        mutations, lineages = load_dna_vars_txt_file(
            args.text_file, reference)
        if args.lineage:
            with open(args.lineage, "w") as f:
                json.dump(lineages, f, sort_keys=True, indent=2)
    else:
        mutations.extend(
            Mutation(reference=reference, var_name=v) for v in args.variants)

    al = AlleleGenerator(reference_filepath=args.reference_filepath,
                         kmer=args.kmer)
    total = len(mutations)
    for enum, mut in enumerate(mutations):
        if enum % 100 == 0:
            logger.info("%i of %i - %f%%", enum, total,
                        round(100 * enum / total, 2))
        variant_panel = make_variant_probe(al,
                                           mut.variant,
                                           args.kmer,
                                           DB=DB,
                                           no_backgrounds=args.no_backgrounds)
        if variant_panel is None:
            logger.warning("All variants failed for %s_%s - %s", mut.gene,
                           mut.mutation_output_name, mut.variant)
            continue

        # Resolve the gene name once per mutation, before either loop. It used
        # to be assigned inside the refs loop only, so a panel with alts but no
        # refs would hit a NameError or reuse the previous mutation's gene.
        try:
            gene_name = mut.gene.name
        except AttributeError:
            gene_name = "NA"

        for i, ref in enumerate(variant_panel.refs):
            sys.stdout.write(
                ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n"
                % (
                    mut.mutation_output_name,
                    mut.variant.var_name,
                    len(variant_panel.alts),
                    mut.reference,
                    i,
                    gene_name,
                    mut.mutation_output_name,
                ))
            sys.stdout.write("%s\n" % ref)
        for i, a in enumerate(variant_panel.alts):
            sys.stdout.write(
                ">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" % (
                    mut.mutation_output_name,
                    mut.variant.var_name,
                    i,
                    gene_name,
                    mut.mutation_output_name,
                ))
            sys.stdout.write("%s\n" % a)
class TestLargeINDELAlleleGenerator():
    """Probe-generation tests for long variants against the NC_000962.3 FASTA.

    Each test creates one ``Variant`` with long reference and/or alternate
    sequences, builds its panel with ``AlleleGenerator.create`` and checks
    the expected reference probe is present and the alternate probes match.
    """

    def setup(self):
        """Reset the test database and create the fixtures shared by all tests."""
        DB.drop_database('mykrobe-test')
        self.pg = AlleleGenerator(
            reference_filepath=f"{DATA_DIR}/NC_000962.3.fasta")
        self.reference_set = ReferenceSet().create_and_save(name="ref_set")
        self.variant_set = VariantSet.create_and_save(
            name="this_vcf_file",
            reference_set=self.reference_set)
        self.variant_sets = [self.variant_set]
        self.reference = Reference().create_and_save(
            name="ref",
            md5checksum="sre",
            reference_sets=[self.reference_set])

    def test_large_variant1(self):
        """A long multi-base replacement at 1355983 yields a single alternate probe."""
        v = Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases=
            "AACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACT",
            start=1355983,
            alternate_bases=[
                "ACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCG"
            ])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGA" in panel.refs
        assert panel.alts == [
            "TCGTCACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCGGGA"
        ]

    def test_large_variant2(self):
        """Same start as ``test_large_variant1`` with ref/alt one base shorter."""
        v = Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases=
            "AACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCAC",
            start=1355983,
            alternate_bases=[
                "ACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGC"
            ])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGA" in panel.refs
        assert panel.alts == [
            "TCGTCACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCTGGA"
        ]

    # NOTE(review): the two tests below are disabled in the original file; the
    # reason is not recorded here. Kept verbatim for reference.
    # def test_large_variant3(self):
    #     v = Variant.create(
    #         variant_sets=self.variant_sets,
    #         reference=self.reference,
    #         reference_bases="TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGA",
    #         start=1355978,
    #         alternate_bases=["TCGTCAACGCCCGGTATCTGAGGATCGGTGTTCTCACCCAATACAAGTCGCATTCACTGGA"])
    #     panel = self.pg.create(v)
    #     assert_no_overlapping_kmers(panel)
    #     assert "CAAACCTCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACCGCCA" in panel.refs
    #     assert panel.alts == [
    #         "CAAACCTCGTCAACGCCCGGTATCTGAGGATCGGTGTTCTCACCCAATACAAGTCGCATTCACTGGACCGCCA"]

    # def test_very_large_variant3(self):
    #     v = Variant.create(
    #         variant_sets=self.variant_sets,
    #         reference=self.reference,
    #         reference_bases="TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACCGCCAT",
    #         start=1355978,
    #         alternate_bases=["TCGTCAACGCCCGGTATCTGAGGATCGGTGTTCACCCAATACAAGTCGCATTCACTGGACCGCCAT"])
    #     panel = self.pg.create(v)
    #     assert "AAACCTCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACCGCCATATCTCG" in panel.refs
    #     assert panel.alts == [
    #         "CAAACCTCGTCAACGCCCGGTATCTGAGGATCGGTGTTCACCCAATACAAGTCGCATTCACTGGACCGCCATATCTCGC"]

    def test_large_insertion(self):
        """A single reference base replaced by a long inserted sequence at 2352065."""
        v = Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases="C",
            start=2352065,
            alternate_bases=[
                "CCTCGCCTGGGCTGGCGAGCAGACGCAAAATCCCCCGCACGCCCGGCGTGTCGGGGGATTTTGCGTCTG"
            ])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        assert "AGCTCGGCCAGCTCAGTCACGTCGCCGCCGCCTCGCCAGTTGACCGCGCCCGCTCGCGGCT" in panel.refs
        assert panel.alts == [
            "CCAGCTCAGTCACGTCGCCGCCGCCTCGCCTGGGCTGGCGAGCAGACGCAAAATCCCCCGCACGCCCGGCGTGTCGGGGGATTTTGCGTCTGCTCGCCAGTTGACCGCGCCCGCTCGCGGCT"
        ]

    def test_large_var1(self):
        """A very long reference sequence replaced by a very long alternate at 2266659.

        The expected reference and alternate probes embed the full variant
        sequences with flanking bases on both sides.
        """
        v = Variant.create(
            variant_sets=self.variant_sets,
            reference=self.reference,
            reference_bases=
            "CGCGGGAGTAGAACGATCGCCAAGTGGTCGGTCTTGGCTGCCCACTTCATCCCCGGCGCCACCGGCAGGTCTCGCGGTCATCTCGACCAACGGAGGGCCGTCGGTGGTTCGTATCCGGCCAAGAACGGCGAGAACGGTTTGTGCCTCTATGCCAGGGTGAATGTCTCATCTCCCAGGCGGACGGTGATATCCAGTTCTCCGCCAAGAGCGGACACGTATTTGCGCAGTGTGTTGACCTGTGCGGAGCCGATGTCGCCGTTCTCGATGCTGGATACCCGGCTCTGCCGGATGTGCGCCAGCGCAGCCACCTGGACCTGGGTGAGTGACTGAGCCGCGCGCAGCTCCCGGAGCCGGAATGCCCGCACTTCATCGCGCATTCGTGCCTTGTGCCGGTCCACCGCCTCCCGGTTAACGGGACGTACGGCGTCCATGTCCCGTAGTGTCATCGCCATCGTGCCACTTACCCTTTCTTGCGCTTGCGCCTCTTTGGCTTCGTGTCCTCGAACTGTGCGAGATGTTCGGCAAACATCTCATCGGCCGCTTTGATCTTCTCGTCGTACCACTGGGTCCACCGCCCGGCCTTGTTACCGGCGGCCAGCATGATCGCCTGCCGCGCCGGGTCGAAGGCGAACAGAATGCGGACCTCGGACCGCCCTTGTGATCCTGGACGCAGCTCCTTCATGTTCTTGTGGCGCGACCCACGCACCGTGTCCACCAGAGGACAGCCAAGTGCGGGGCCCTCTTCCTCGAGAACCTCGATAGCTGCGAACACCAATTCGTAGGTCTCTCGGTCCAAGCCGTTGAGCCAGGCGGAGATGCGCTCCACATCCGCCGTCCACCCCACAGAGTCGCAGAGTAGCGCGATACGCGATATCACACAAGGGTGATATTCCTCCGGGTAAGAGCAGCGGGCGACGGGGCTACCGTCGAGGAAATGCCGGCAGGCGAGGACGGACTCTGCGCACCCGGGCCGTTGAAACAGTAGCCTGTGCCAGGCCGAGAATTCATCCCCACGTATGAGGCAGTACAGTGCGCCGCCGTGCGCGTTCTCCCATGGAACGTTCACGGGCTCCCGTGGATGACAGGCGTTTCATGAACGCCAGCGCCGCCGCAACCCGACCGAAAGCGGTTGACCCCAAGGAGAGCTGGAAGTCGAGGCCACCACCTTCGCCGCGGAGTTGCTCATGCCCGAGAGCGAGACTCGTCCCGAAATACGCCGGCTCGATTTCGGCAAGTTGCTCGAACTGAAGCGGGAATGGGCGTCGACCCGCTCGACCAGCCCCAGCCGGGTGACCAGCCCCAGCCGGGTGACCAGCCGATGCACCGCGGCGATCCCACCGAAGCCGGTGGCATCGATGTTGGCGCCGACCTCGTAGCGCACCGCGCCCGAACCCAGCATCGGCCTGGGCTGCGCCGCCCAGCGTCCAGCCCGCGCGTGCCGCGCCGCCACCCTGCGCCCTCGGCGTGTGATGTTTCGCCGACTCTGTTCATGGGTTATCTTCTTCACCACAAAGGCCTTTCCTGCTGGGCTGTGTTGAGGTCGCAAACCCAGCCAGGGTAAGGCCTTTGGCCTCTCCTACCCGGCCGACACGCTTACTGAAGGCCTAGTCTAGGCAGGCCATTCAATCTGCGGAATCGAAAAATTCGGTTCCAGCCTGCTCGTTTCCTTTCCGACAGCGATCTGACGTTGCGTAACGTCATTTGTACGGACTCTTTTAGCGGCATTGATTTCAGATGCCAACGCCGTCTGTGCTGTAGCGCCGATTGGCCGAAACTGTAAATTTGTATGATTATTTAAATCTTTGACGAACACGCGCCACAAACGTACTATCTCTTTGGCAAAGTCCACCGGCATCTCATTCAACGGTTTTGTTTGCGCGTGGTCGTCATATGTTGGTAACTGTGTAACCGGCCGCCTATCTTGCGCGTGCATCATATGACTATGAATCGGCCTTCTCCAGTGAAATTGATACAAGATCGATCCGATAAGCGGTACCTTGTACACAGTGCAATTGTAGTAATTCGCGTTTTGTCCTACGCTTGTATTCTGCGTGAAGAATTCA",
            start=2266659,
            alternate_bases=[
                "CACGCGAGTTGTAGATGATCGTTGAGTGGTCTTGCTTGGACTTCCATTTCATCTTTTCGACGCGCCAGGTCTCGCGGTCCTCCGGATCTGCGCCCGGTTTGAGTTGCACATCAAGGGGATACGGCTTGACCGACTCGTAGCCGACATGTAAGTCGGCTAGTTTCCGGCCGGCGCTGGCGAGCTGGTCGAAGCGTTCGCGGGTCTCCGGTGTTGGGATGTGCGGGAGCATCTTCTTGAGGTCAGCGGCGTATTTTGTGCGGTAGGCGGGGTCATGCAGCAGGCCGTAGACGTAGTAGAAGATGTCGTCTTTGGTGACTTGGTCGCCGATCGTGTCGCGGTAGAGCTTGAGGATGACGCCGGTGATGTTGTCGACGCGGCGGTAGCCGTGGTCGTCTACTTCGGCGTTGGTGGTGGACTCGAAATCGAGTTCGCCGTCACGTGGTTCGGTCTTCTCGTAGGTCCAGCGCGGGAAGAATTGACCGTTGCTTGAGCCCCAGAATGCGAGATCGGGGATAGCGTTTAGCATCAGACACGAGAAGGGCTTGTCTGAGCCCATGCCAACCACGTAGTAACCGACATTCCCGTGCTCCGGCGTCGGAAACATCGACGGAAGCTGGTAGGTACAGTTGTTGAGCTGCTGGTTGGGGTCGAGGTAGGCGTGCTCTTTCGTAAATGGTCGGTACGTGCCGAGCCGCATTCCCGCGGGAGCGAATTCGATGCGAATGCCTTGTGCCACTTGCCGCTTGTTGATGCGGTCCCAGCTGAACTTGGCCGAGTCCACGGTAATGAGGGCGTCAACCGGCGGGGTCTTGGCGTCCCTTCCGCGGATCTCGTTGATCCGGTCGACCTCCGAGTTGTAGAAGTCGATCGTGCGTCCGATGTTGGCCTCGAGCGCACCACGTGAAAAGTTGTAACACCACGCATCCCGGCTGGTCTTCAAGCCCGCGGAATAGTTCGCGAAGACACGTGTCACGTCAAGAGCAGCCTTCTTGTCGCCGATAACCGGCCACGCGCTGAACGCGTCGTCGCGTTGGTTGACCCAGTCACCGTGCAAGTTGGGTGTGACTGTCTGCCATTCCACCGTGTCGAGGTAGCCGTCGCCGACGATCCGCAACTTCTCCTCGCGACTCAGGTAATCGCCGATGTCGCGGTAAAGGACATCGCATGGCCCGCTGTGCTTCGGATCCTTGATGCCAAGGAAGATCGCCACCGTGTTGCGACTCCCCCCGCCAAAGACCTTGCCGCCTTCCTGGCGTGAGAGTTCCCCAGCTGTGCGCTGGTTCCCCCGCAGGTTGTACACATATACCGCCGCGTAGTCGTCGGCGAGCGACAACCGCATGCCGTCTGCCGTGTTGCCGTCTATGTACCCACCATTGGAGACGAATCCGACAACACCGTTGTCACCAATGCGGTCGGTCGCCCACCGGAACGCGCGAATATACGAGTCGTACAGGCTGTTCTTCAGCTGCGCCGTCGACCGCTTCGCGTACGTCTGCTCAATCCGCCCGTCCAACGTCGGATACTTCACGTTGGCGTTCAGGTCGTTCGCGCTGCTCTGCCCCACCGAGTACGGCGGATTCCCGATGATCACGCTGATCGGCGTCGCCAGCTGTCGCAAGATCCGAGCGTTGTTGTACGGGAACATGATCGCGTCCATCGAGTCCCCGGCTTCGGAAATCTGGAACGTGTCGGCCAGCGCCATCCCGGGGAACGGCTCATAGGCGTCGGCGTCGGCGGTCTTGCCCGCCAAAGCATGGTAGGTCGACTCGATGTTCACCGCGGCGATGTAGTACGCCAGCAGCATGATCTCGTTGGCGTGCAGCTCTTGCGAGTACTTTCGGGTGAGGTCGGCGGCCGTGATCAGGTCGGACTGCAGCAGCCGGGTAATGAATGTGCCCGTCCCGGCGAAGCCGTCCAGAATATGCACGCCCTCGTCGGTCAGCCCGCGCCCGAAATGCTTGCGCGACACGAAATCAGCCGCCCGCACAATGAAGTCCACGACCTCGACCGGCGTGTACACGATCCCCAGCGCCTCGGCCTGCTTCTTGAAGCCGATGCGGAAGAACTTCTCGTACAGCTCGGCGATCACCTGCTGCTTGCCCTCGGCGCTGGTGACCTCGCCGGCGCGCCGTCGCACCGATTCGTAAAAGCCTTCCAACCGAGCGGTTTCGGCCTCCAGGCCGGCACCCCCGACGGTGTCGACCATCTTCTGCATGGCCCGCGACACCGGGTTGTGCGACGCGAAGTCATGCCCGGCGAACAGCGCGTCGAACACCGGCTTGGTGATCAGGTGCTGCGAGAGCATGCTGATCGCGTCATCGGGGGTGATCGAGTCATTGAGGTTATCGCGCAGCCCGGCCAGGAACTGCTCGAACGCCGCCGCCGCCGTAGCGTCGGCGCCGCCGAGCAGGGCGTGGATACGGGTGGTCAGCGTCGCGGCGATGTCGGCGACATCGGCGGCCCACTGCTCCCAATAGGTCCGGGTGCCAACCTTGTCGACGATGCGCGCGTAGATCGCTTCCTGCCACTGCGACAACGAGAACATCGCCAACTGCTCCGCGACGGCGGGTCCCGCCTCGTCGGAGGTCGGCCCGATGTGACCGCCCAACAGCTTGTCGCTGCCTTCACCGGTCTTCGTCGGCTTCACGTTCAGCGCAATGCTGTTCACCATCGCGTCGAAGCGCTCGTCGTGCGACCGCAACGCGTTGAGGACCTGCCACACCACCTTGAACCGTTTGTTGTCGGCCAACGCGGCAGACGGCTCGACACCCTCGGGCACCGCCACCGGCAAGATGACGTACCCGTAGTCCTTGCCGGGCGACTTGCGCATCACCCGACCGACCGACTGCACCACGTCGACGATGGAATTGCGCGGATTCAGGAACAGCACCGCGTCCAGCGCGGGCACGTCGACCCCTTCGGAGAGGCAGCGGGCGTTGGACAGGATGCGGCATTCATCCTCGGCGACCACGCCTTTGAGCCAGGCCAGCTGTTCGTTGCGGACCAGCGCGTTGAACGTCCCGTCCACGTGGCGCACCG"
            ])
        panel = self.pg.create(v)
        assert_no_overlapping_kmers(panel)
        # Expected reference probe: the reference_bases above with flanking bases.
        assert "TGGTGACGCGGGAGTAGAACGATCGCCAAGTGGTCGGTCTTGGCTGCCCACTTCATCCCCGGCGCCACCGGCAGGTCTCGCGGTCATCTCGACCAACGGAGGGCCGTCGGTGGTTCGTATCCGGCCAAGAACGGCGAGAACGGTTTGTGCCTCTATGCCAGGGTGAATGTCTCATCTCCCAGGCGGACGGTGATATCCAGTTCTCCGCCAAGAGCGGACACGTATTTGCGCAGTGTGTTGACCTGTGCGGAGCCGATGTCGCCGTTCTCGATGCTGGATACCCGGCTCTGCCGGATGTGCGCCAGCGCAGCCACCTGGACCTGGGTGAGTGACTGAGCCGCGCGCAGCTCCCGGAGCCGGAATGCCCGCACTTCATCGCGCATTCGTGCCTTGTGCCGGTCCACCGCCTCCCGGTTAACGGGACGTACGGCGTCCATGTCCCGTAGTGTCATCGCCATCGTGCCACTTACCCTTTCTTGCGCTTGCGCCTCTTTGGCTTCGTGTCCTCGAACTGTGCGAGATGTTCGGCAAACATCTCATCGGCCGCTTTGATCTTCTCGTCGTACCACTGGGTCCACCGCCCGGCCTTGTTACCGGCGGCCAGCATGATCGCCTGCCGCGCCGGGTCGAAGGCGAACAGAATGCGGACCTCGGACCGCCCTTGTGATCCTGGACGCAGCTCCTTCATGTTCTTGTGGCGCGACCCACGCACCGTGTCCACCAGAGGACAGCCAAGTGCGGGGCCCTCTTCCTCGAGAACCTCGATAGCTGCGAACACCAATTCGTAGGTCTCTCGGTCCAAGCCGTTGAGCCAGGCGGAGATGCGCTCCACATCCGCCGTCCACCCCACAGAGTCGCAGAGTAGCGCGATACGCGATATCACACAAGGGTGATATTCCTCCGGGTAAGAGCAGCGGGCGACGGGGCTACCGTCGAGGAAATGCCGGCAGGCGAGGACGGACTCTGCGCACCCGGGCCGTTGAAACAGTAGCCTGTGCCAGGCCGAGAATTCATCCCCACGTATGAGGCAGTACAGTGCGCCGCCGTGCGCGTTCTCCCATGGAACGTTCACGGGCTCCCGTGGATGACAGGCGTTTCATGAACGCCAGCGCCGCCGCAACCCGACCGAAAGCGGTTGACCCCAAGGAGAGCTGGAAGTCGAGGCCACCACCTTCGCCGCGGAGTTGCTCATGCCCGAGAGCGAGACTCGTCCCGAAATACGCCGGCTCGATTTCGGCAAGTTGCTCGAACTGAAGCGGGAATGGGCGTCGACCCGCTCGACCAGCCCCAGCCGGGTGACCAGCCCCAGCCGGGTGACCAGCCGATGCACCGCGGCGATCCCACCGAAGCCGGTGGCATCGATGTTGGCGCCGACCTCGTAGCGCACCGCGCCCGAACCCAGCATCGGCCTGGGCTGCGCCGCCCAGCGTCCAGCCCGCGCGTGCCGCGCCGCCACCCTGCGCCCTCGGCGTGTGATGTTTCGCCGACTCTGTTCATGGGTTATCTTCTTCACCACAAAGGCCTTTCCTGCTGGGCTGTGTTGAGGTCGCAAACCCAGCCAGGGTAAGGCCTTTGGCCTCTCCTACCCGGCCGACACGCTTACTGAAGGCCTAGTCTAGGCAGGCCATTCAATCTGCGGAATCGAAAAATTCGGTTCCAGCCTGCTCGTTTCCTTTCCGACAGCGATCTGACGTTGCGTAACGTCATTTGTACGGACTCTTTTAGCGGCATTGATTTCAGATGCCAACGCCGTCTGTGCTGTAGCGCCGATTGGCCGAAACTGTAAATTTGTATGATTATTTAAATCTTTGACGAACACGCGCCACAAACGTACTATCTCTTTGGCAAAGTCCACCGGCATCTCATTCAACGGTTTTGTTTGCGCGTGGTCGTCATATGTTGGTAACTGTGTAACCGGCCGCCTATCTTGCGCGTGCATCATATGACTATGAATCGGCCTTCTCCAGTGAAATTGATACAAGATCGATCCGATAAGCGGTACCTTGTACACAGTGCAATTGTAGTAATTCGCGTTTTGTCCTACGCTTGTATTCTGCGTGAAGAATTCAAACA" in panel.refs
        # Expected alternate probe: the alternate_bases above with longer flanks.
        assert panel.alts == [
            "GGCCTCGTCGGGAATGCCGGCGATGGTGACACGCGAGTTGTAGATGATCGTTGAGTGGTCTTGCTTGGACTTCCATTTCATCTTTTCGACGCGCCAGGTCTCGCGGTCCTCCGGATCTGCGCCCGGTTTGAGTTGCACATCAAGGGGATACGGCTTGACCGACTCGTAGCCGACATGTAAGTCGGCTAGTTTCCGGCCGGCGCTGGCGAGCTGGTCGAAGCGTTCGCGGGTCTCCGGTGTTGGGATGTGCGGGAGCATCTTCTTGAGGTCAGCGGCGTATTTTGTGCGGTAGGCGGGGTCATGCAGCAGGCCGTAGACGTAGTAGAAGATGTCGTCTTTGGTGACTTGGTCGCCGATCGTGTCGCGGTAGAGCTTGAGGATGACGCCGGTGATGTTGTCGACGCGGCGGTAGCCGTGGTCGTCTACTTCGGCGTTGGTGGTGGACTCGAAATCGAGTTCGCCGTCACGTGGTTCGGTCTTCTCGTAGGTCCAGCGCGGGAAGAATTGACCGTTGCTTGAGCCCCAGAATGCGAGATCGGGGATAGCGTTTAGCATCAGACACGAGAAGGGCTTGTCTGAGCCCATGCCAACCACGTAGTAACCGACATTCCCGTGCTCCGGCGTCGGAAACATCGACGGAAGCTGGTAGGTACAGTTGTTGAGCTGCTGGTTGGGGTCGAGGTAGGCGTGCTCTTTCGTAAATGGTCGGTACGTGCCGAGCCGCATTCCCGCGGGAGCGAATTCGATGCGAATGCCTTGTGCCACTTGCCGCTTGTTGATGCGGTCCCAGCTGAACTTGGCCGAGTCCACGGTAATGAGGGCGTCAACCGGCGGGGTCTTGGCGTCCCTTCCGCGGATCTCGTTGATCCGGTCGACCTCCGAGTTGTAGAAGTCGATCGTGCGTCCGATGTTGGCCTCGAGCGCACCACGTGAAAAGTTGTAACACCACGCATCCCGGCTGGTCTTCAAGCCCGCGGAATAGTTCGCGAAGACACGTGTCACGTCAAGAGCAGCCTTCTTGTCGCCGATAACCGGCCACGCGCTGAACGCGTCGTCGCGTTGGTTGACCCAGTCACCGTGCAAGTTGGGTGTGACTGTCTGCCATTCCACCGTGTCGAGGTAGCCGTCGCCGACGATCCGCAACTTCTCCTCGCGACTCAGGTAATCGCCGATGTCGCGGTAAAGGACATCGCATGGCCCGCTGTGCTTCGGATCCTTGATGCCAAGGAAGATCGCCACCGTGTTGCGACTCCCCCCGCCAAAGACCTTGCCGCCTTCCTGGCGTGAGAGTTCCCCAGCTGTGCGCTGGTTCCCCCGCAGGTTGTACACATATACCGCCGCGTAGTCGTCGGCGAGCGACAACCGCATGCCGTCTGCCGTGTTGCCGTCTATGTACCCACCATTGGAGACGAATCCGACAACACCGTTGTCACCAATGCGGTCGGTCGCCCACCGGAACGCGCGAATATACGAGTCGTACAGGCTGTTCTTCAGCTGCGCCGTCGACCGCTTCGCGTACGTCTGCTCAATCCGCCCGTCCAACGTCGGATACTTCACGTTGGCGTTCAGGTCGTTCGCGCTGCTCTGCCCCACCGAGTACGGCGGATTCCCGATGATCACGCTGATCGGCGTCGCCAGCTGTCGCAAGATCCGAGCGTTGTTGTACGGGAACATGATCGCGTCCATCGAGTCCCCGGCTTCGGAAATCTGGAACGTGTCGGCCAGCGCCATCCCGGGGAACGGCTCATAGGCGTCGGCGTCGGCGGTCTTGCCCGCCAAAGCATGGTAGGTCGACTCGATGTTCACCGCGGCGATGTAGTACGCCAGCAGCATGATCTCGTTGGCGTGCAGCTCTTGCGAGTACTTTCGGGTGAGGTCGGCGGCCGTGATCAGGTCGGACTGCAGCAGCCGGGTAATGAATGTGCCCGTCCCGGCGAAGCCGTCCAGAATATGCACGCCCTCGTCGGTCAGCCCGCGCCCGAAATGCTTGCGCGACACGAAATCAGCCGCCCGCACAATGAAGTCCACGACCTCGACCGGCGTGTACACGATCCCCAGCGCCTCGGCCTGCTTCTTGAAGCCGATGCGGAAGAACTTCTCGTACAGCTCGGCGATCACCTGCTGCTTGCCCTCGGCGCTGGTGACCTCGCCGGCGCGCCGTCGCACCGATTCGTAAAAGCCTTCCAACCGAGCGGTTTCGGCCTCCAGGCCGGCACCCCCGACGGTGTCGACCATCTTCTGCATGGCCCGCGACACCGGGTTGTGCGACGCGAAGTCATGCCCGGCGAACAGCGCGTCGAACACCGGCTTGGTGATCAGGTGCTGCGAGAGCATGCTGATCGCGTCATCGGGGGTGATCGAGTCATTGAGGTTATCGCGCAGCCCGGCCAGGAACTGCTCGAACGCCGCCGCCGCCGTAGCGTCGGCGCCGCCGAGCAGGGCGTGGATACGGGTGGTCAGCGTCGCGGCGATGTCGGCGACATCGGCGGCCCACTGCTCCCAATAGGTCCGGGTGCCAACCTTGTCGACGATGCGCGCGTAGATCGCTTCCTGCCACTGCGACAACGAGAACATCGCCAACTGCTCCGCGACGGCGGGTCCCGCCTCGTCGGAGGTCGGCCCGATGTGACCGCCCAACAGCTTGTCGCTGCCTTCACCGGTCTTCGTCGGCTTCACGTTCAGCGCAATGCTGTTCACCATCGCGTCGAAGCGCTCGTCGTGCGACCGCAACGCGTTGAGGACCTGCCACACCACCTTGAACCGTTTGTTGTCGGCCAACGCGGCAGACGGCTCGACACCCTCGGGCACCGCCACCGGCAAGATGACGTACCCGTAGTCCTTGCCGGGCGACTTGCGCATCACCCGACCGACCGACTGCACCACGTCGACGATGGAATTGCGCGGATTCAGGAACAGCACCGCGTCCAGCGCGGGCACGTCGACCCCTTCGGAGAGGCAGCGGGCGTTGGACAGGATGCGGCATTCATCCTCGGCGACCACGCCTTTGAGCCAGGCCAGCTGTTCGTTGCGGACCAGCGCGTTGAACGTCCCGTCCACGTGGCGCACCGAACACGCCAGGCCCGGGCCGTCGTCAACCA"
        ]
def run(parser, args):
    """Build probes for the requested variants and write them to stdout.

    Mutations are gathered from exactly one source, checked in this order:

    * ``args.vcf`` - delegated entirely to ``run_make_probes_from_vcf_file``;
      no mutations are collected here, so nothing further is printed.
    * ``args.genbank`` - gene-level mutations (rows of ``args.text_file`` as
      ``gene<TAB>mutation<TAB>alphabet``, or ``GENE_MUTATION`` strings in
      ``args.variants``) are expanded through
      ``GeneAminoAcidChangeToDNAVariants``.
    * otherwise - five-column rows of ``args.text_file``
      (``gene<TAB>pos<TAB>ref<TAB>alt<TAB>alphabet``), or ``args.variants``
      used as variant names directly.

    For every mutation whose probe panel could be built, each reference
    probe is written as a ``>ref-...`` header line followed by its sequence,
    then each alternate probe as an ``>alt-...`` record. Mutations for which
    ``make_variant_probe`` returns ``None`` are reported with a warning.

    Args:
        parser: Not used by this function.
        args: Parsed command-line namespace. Attributes read: ``db_name``,
            ``reference_filepath``, ``vcf``, ``genbank``, ``text_file``,
            ``variants``, ``kmer``, ``no_backgrounds``.
    """
    DB = connect('mykrobe-%s' % (args.db_name))
    if DB is not None:
        try:
            Variant.objects()
            logger.info("Connected to mykrobe-%s", args.db_name)
        except ServerSelectionTimeoutError:
            DB = None
            logger.warning(
                "Could not connect to database. Continuing without using genetic backgrounds")

    mutations = []
    reference = os.path.basename(args.reference_filepath).split('.fa')[0]
    if args.vcf:
        run_make_probes_from_vcf_file(args)
    elif args.genbank:
        aa2dna = GeneAminoAcidChangeToDNAVariants(
            args.reference_filepath,
            args.genbank)
        if args.text_file:
            with open(args.text_file, 'r') as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    gene, mutation_string, alphabet = row
                    # Anything not flagged as "DNA" is treated as protein-coding.
                    protein_coding_var = alphabet != "DNA"
                    for var_name in aa2dna.get_variant_names(
                            gene, mutation_string, protein_coding_var):
                        mutations.append(
                            Mutation(reference=reference,
                                     var_name=var_name,
                                     gene=aa2dna.get_gene(gene),
                                     mut=mutation_string))
        else:
            for variant in args.variants:
                gene, mutation = variant.split("_")
                for var_name in aa2dna.get_variant_names(gene, mutation):
                    mutations.append(
                        Mutation(reference=reference,
                                 var_name=var_name,
                                 gene=gene,
                                 mut=mutation))
    elif args.text_file:
        with open(args.text_file, 'r') as infile:
            reader = csv.reader(infile, delimiter="\t")
            for row in reader:
                # Named ``row_gene`` so it cannot be confused with the
                # per-mutation ``gene_name`` used when writing output below.
                row_gene, pos, ref, alt, alphabet = row
                if row_gene == "ref":
                    var_name = "".join([ref, pos, alt])
                else:
                    var_name = row[0]
                mutations.append(
                    Mutation(reference=reference, var_name=var_name))
    else:
        mutations.extend(Mutation(reference=reference, var_name=v)
                         for v in args.variants)

    al = AlleleGenerator(
        reference_filepath=args.reference_filepath,
        kmer=args.kmer)
    total = len(mutations)
    for enum, mut in enumerate(mutations):
        if enum % 100 == 0:
            logger.info("%i of %i - %f%%", enum, total,
                        round(100 * enum / total, 2))
        variant_panel = make_variant_probe(
            al, mut.variant, args.kmer, DB=DB,
            no_backgrounds=args.no_backgrounds)
        if variant_panel is None:
            logger.warning("All variants failed for %s_%s - %s",
                           mut.gene, mut.mut, mut.variant)
            continue

        # Resolve the gene name once per mutation, before either loop. It used
        # to be assigned inside the refs loop only, so a panel with alts but no
        # refs would reuse a stale value (from the previous mutation or from
        # the text-file parsing loop above).
        try:
            gene_name = mut.gene.name
        except AttributeError:
            gene_name = "NA"

        for i, ref in enumerate(variant_panel.refs):
            sys.stdout.write(
                ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n" %
                (mut.mut, mut.variant.var_name, len(
                    variant_panel.alts), mut.reference, i, gene_name, mut.mut))
            sys.stdout.write("%s\n" % ref)
        for i, a in enumerate(variant_panel.alts):
            sys.stdout.write(">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" %
                             (mut.mut, mut.variant.var_name, i, gene_name, mut.mut))
            sys.stdout.write("%s\n" % a)