def setUp(self): DB.drop_database('atlas-test') self.pg = AlleleGenerator( reference_filepath="atlasvar/data/NC_000962.2.fasta") self.reference_set = ReferenceSet().create_and_save(name="ref_set") self.variant_set = VariantSet.create_and_save( name="this_vcf_file", reference_set=self.reference_set) self.variant_sets = [self.variant_set] self.reference = Reference().create_and_save( name="ref", md5checksum="sre", reference_sets=[self.reference_set])
def run(parser, args): db_name = 'atlas-%s' % (args.db_name) DB = connect(db_name) if args.verbose: logger.setLevel(level=logging.DEBUG) else: logger.setLevel(level=logging.INFO) al = AlleleGenerator( reference_filepath=args.reference_filepath, kmer=args.kmer) _variant_ids = get_non_singelton_variants(db_name) total = Variant.snps(id__in=_variant_ids).count() N = 100 pages = math.ceil(total / N) for page in range(pages): logger.info("%i of %i - %f%%" % (page*N, total, round(100*(page*N)/total, 2))) for variant in Variant.snps(id__in=_variant_ids).order_by("start").skip(N*page).limit(N): # for variant in Variant.snps().order_by("start"): variant_panel = make_variant_probe(al, variant, args.kmer, DB=DB) for i, ref in enumerate(variant_panel.refs): sys.stdout.write( ">ref-%s?var_name=%snum_alts=%i&ref=%s&enum=%i\n" % (variant_panel.variant.var_hash, variant.var_name[:100], len( variant_panel.alts), variant_panel.variant.reference.id, i)) sys.stdout.write("%s\n" % ref) for i, a in enumerate(variant_panel.alts): sys.stdout.write(">alt-%s?var_name=%s&enum=%i\n" % (variant_panel.variant.var_hash, variant.var_name[:100], i)) sys.stdout.write("%s\n" % a)
def __init__(self, bigsi, reference): self.bigsi = bigsi self.reference = reference self.al = AlleleGenerator(reference_filepath=self.reference, kmer=self.bigsi.kmer_size)
def __init__(self, cbg, reference): self.cbg = cbg self.reference = reference self.al = AlleleGenerator(reference_filepath=self.reference, kmer=self.cbg.kmer_size)
def test_panel_generator(self): pg = AlleleGenerator( reference_filepath="atlasvar/data/BX571856.1.fasta") assert pg.ref is not None
class TestSNPAlleleGenerator(): def setUp(self): DB.drop_database('atlas-test') self.pg = AlleleGenerator( reference_filepath="atlasvar/data/BX571856.1.fasta") self.reference_set = ReferenceSet().create_and_save(name="ref_set") self.variant_set = VariantSet.create_and_save( name="this_vcf_file", reference_set=self.reference_set) self.variant_sets = [self.variant_set] self.reference = Reference().create_and_save( name="ref", md5checksum="sre", reference_sets=[ self.reference_set]) def test_panel_generator(self): pg = AlleleGenerator( reference_filepath="atlasvar/data/BX571856.1.fasta") assert pg.ref is not None def test_simple_variant(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=31, alternate_bases=["T"]) panel = self.pg.create(v) assert panel.refs == [ "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT"] assert panel.alts == [ "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT"] assert self.pg._calculate_length_delta_from_indels(v, []) == 0 assert v.is_indel is False def test_simple_variant2(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["T"]) panel = self.pg.create(v) assert panel.refs == [ "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC"] assert panel.alts == [ "GATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGATC"] def test_simple_variant_invalid(self): with assert_raises(ValueError) as cm: v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=31, alternate_bases=["T"]) panel = self.pg.create(v) def test_simple_variant_start(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=1, alternate_bases=["T"]) panel = self.pg.create(v) assert panel.refs == [ "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT"] assert panel.alts == [ "TGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT"] def test_simple_variant_end(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=2902618, alternate_bases=["T"]) panel = self.pg.create(v) assert panel.refs == [ "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTAT"] assert panel.alts == [ "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTTT"] v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=2902616, alternate_bases=["C"]) panel = self.pg.create(v) assert panel.refs == [ "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTAT"] assert panel.alts == [ "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTCTAT"] def test_simple_variant_with_nearby_snp(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=31, alternate_bases=["T"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["T"]) panel = self.pg.create(v, context=[v2]) assert panel.refs == ["CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT"] assert panel.alts == [ "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT"] def test_simple_variant_with_multiple_nearby_snps(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=31, alternate_bases=["T"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["T"]) v3 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=30, alternate_bases=["G"]) panel = self.pg.create(v, context=[v2, v3]) assert panel.refs == ['CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT', 'CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT', 'CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTGAT', 'CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTGAT'] assert panel.alts == [ "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTGAT"] def test_simple_variant_with_multiple_nearby_snps2(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=31, alternate_bases=["T"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["T"]) v3 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=30, alternate_bases=["G"]) v4 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=30, alternate_bases=["T"]) v5 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=30, alternate_bases=["A"]) assert sorted(self.pg._split_context([v, v3, v4])) == sorted( [[v, v4], [v, v3]]) assert (self.pg._split_context([v3, v4])) == [[v4], [v3]] assert (self.pg._split_context([v, v3, v4, v5])) == [ [v, v4, v5], [v, v3, v5]] panel = self.pg.create(v, context=[v2, v3, v4, v5]) assert sorted(panel.refs) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGAAATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGAATTCAAATTTCATAACATCACCATGAGTTTGAT"]) assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGATATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGATTTCAAATTTCATAACATCACCATGAGTTTGAT"]) def test_simple_variant_with_multiple_nearby_snps(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=31, alternate_bases=["T"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["T"]) v5 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["G"]) v3 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=30, alternate_bases=["G"]) v4 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=30, alternate_bases=["T"]) panel = self.pg.create(v, context=[v2, v3, v4, v5]) assert sorted(panel.refs) == sorted([ "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCAGTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGAGTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTAGTCAAATTTCATAACATCACCATGAGTTTGAT"]) assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCTGTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGGTGTCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGTTGTCAAATTTCATAACATCACCATGAGTTTGAT"])
class TestLargeINDELAlleleGenerator(): def setUp(self): DB.drop_database('atlas-test') self.pg = AlleleGenerator( reference_filepath="atlasvar/data/NC_000962.2.fasta") self.reference_set = ReferenceSet().create_and_save(name="ref_set") self.variant_set = VariantSet.create_and_save( name="this_vcf_file", reference_set=self.reference_set) self.variant_sets = [self.variant_set] self.reference = Reference().create_and_save( name="ref", md5checksum="sre", reference_sets=[self.reference_set]) def test_large_variant(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases= "AACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACT", start=1355983, alternate_bases=[ "ACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCG" ]) panel = self.pg.create(v) assert "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACC" in panel.refs assert panel.alts == [ "TCGTCACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCGGGACC" ] def test_large_variant2(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases= "AACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCAC", start=1355983, alternate_bases=[ "ACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGC" ]) panel = self.pg.create(v) assert "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACC" in panel.refs assert panel.alts == [ "TCGTCACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCTGGACC" ] def test_large_variant3(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases= "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACC", start=1355978, alternate_bases=[ "TCGTCAACGCCCGGTATCTGAGGATCGGTGTTCTCACCCAATACAAGTCGCATTCACTGGACC" ]) panel = self.pg.create(v) assert "CAAACCTCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACCGCCATA" in panel.refs assert panel.alts == [ "CAAACCTCGTCAACGCCCGGTATCTGAGGATCGGTGTTCTCACCCAATACAAGTCGCATTCACTGGACCGCCATA" ] def test_very_large_variant3(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases= "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACCGCCAT", start=1355978, alternate_bases=[ "TCGTCAACGCCCGGTATCTGAGGATCGGTGTTCACCCAATACAAGTCGCATTCACTGGACCGCCAT" ]) panel = self.pg.create(v) assert "AAACCTCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGACCGCCATATCTCG" in panel.refs assert panel.alts == [ "CAAACCTCGTCAACGCCCGGTATCTGAGGATCGGTGTTCACCCAATACAAGTCGCATTCACTGGACCGCCATATCTCGC" ] def test_large_insertion(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=2352065, alternate_bases=[ "CCTCGCCTGGGCTGGCGAGCAGACGCAAAATCCCCCGCACGCCCGGCGTGTCGGGGGATTTTGCGTCTG" ]) panel = self.pg.create(v) assert "AGCTCGGCCAGCTCAGTCACGTCGCCGCCGCCTCGCCAGTTGACCGCGCCCGCTCGCGGCTAG" in panel.refs assert panel.alts == [ "AAGTCGTCGAGCGAGAACGGTAGTTCCGCGGTGAACCGGTCCAGCTCGGCCAGCTCAGTCACGTCGCCGCCGCCTCGCCTGGGCTGGCGAGCAGACGCAAAATCCCCCGCACGCCCGGCGTGTCGGGGGATTTTGCGTCTGCTCGCCAGTTGACCGCGCCCGCTCGCGGCTAGCGGGCCTACGTGACGTCGTCATGAGATCCGATGACCGATGGC" ] def test_large_var1(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases= "CGCGGGAGTAGAACGATCGCCAAGTGGTCGGTCTTGGCTGCCCACTTCATCCCCGGCGCCACCGGCAGGTCTCGCGGTCATCTCGACCAACGGAGGGCCGTCGGTGGTTCGTATCCGGCCAAGAACGGCGAGAACGGTTTGTGCCTCTATGCCAGGGTGAATGTCTCATCTCCCAGGCGGACGGTGATATCCAGTTCTCCGCCAAGAGCGGACACGTATTTGCGCAGTGTGTTGACCTGTGCGGAGCCGATGTCGCCGTTCTCGATGCTGGATACCCGGCTCTGCCGGATGTGCGCCAGCGCAGCCACCTGGACCTGGGTGAGTGACTGAGCCGCGCGCAGCTCCCGGAGCCGGAATGCCCGCACTTCATCGCGCATTCGTGCCTTGTGCCGGTCCACCGCCTCCCGGTTAACGGGACGTACGGCGTCCATGTCCCGTAGTGTCATCGCCATCGTGCCACTTACCCTTTCTTGCGCTTGCGCCTCTTTGGCTTCGTGTCCTCGAACTGTGCGAGATGTTCGGCAAACATCTCATCGGCCGCTTTGATCTTCTCGTCGTACCACTGGGTCCACCGCCCGGCCTTGTTACCGGCGGCCAGCATGATCGCCTGCCGCGCCGGGTCGAAGGCGAACAGAATGCGGACCTCGGACCGCCCTTGTGATCCTGGACGCAGCTCCTTCATGTTCTTGTGGCGCGACCCACGCACCGTGTCCACCAGAGGACAGCCAAGTGCGGGGCCCTCTTCCTCGAGAACCTCGATAGCTGCGAACACCAATTCGTAGGTCTCTCGGTCCAAGCCGTTGAGCCAGGCGGAGATGCGCTCCACATCCGCCGTCCACCCCACAGAGTCGCAGAGTAGCGCGATACGCGATATCACACAAGGGTGATATTCCTCCGGGTAAGAGCAGCGGGCGACGGGGCTACCGTCGAGGAAATGCCGGCAGGCGAGGACGGACTCTGCGCACCCGGGCCGTTGAAACAGTAGCCTGTGCCAGGCCGAGAATTCATCCCCACGTATGAGGCAGTACAGTGCGCCGCCGTGCGCGTTCTCCCATGGAACGTTCACGGGCTCCCGTGGATGACAGGCGTTTCATGAACGCCAGCGCCGCCGCAACCCGACCGAAAGCGGTTGACCCCAAGGAGAGCTGGAAGTCGAGGCCACCACCTTCGCCGCGGAGTTGCTCATGCCCGAGAGCGAGACTCGTCCCGAAATACGCCGGCTCGATTTCGGCAAGTTGCTCGAACTGAAGCGGGAATGGGCGTCGACCCGCTCGACCAGCCCCAGCCGGGTGACCAGCCCCAGCCGGGTGACCAGCCGATGCACCGCGGCGATCCCACCGAAGCCGGTGGCATCGATGTTGGCGCCGACCTCGTAGCGCACCGCGCCCGAACCCAGCATCGGCCTGGGCTGCGCCGCCCAGCGTCCAGCCCGCGCGTGCCGCGCCGCCACCCTGCGCCCTCGGCGTGTGATGTTTCGCCGACTCTGTTCATGGGTTATCTTCTTCACCACAAAGGCCTTTCCTGCTGGGCTGTGTTGAGGTCGCAAACCCAGCCAGGGTAAGGCCTTTGGCCTCTCCTACCCGGCCGACACGCTTACTGAAGGCCTAGTCTAGGCAGGCCATTCAATCTGCGGAATCGAAAAATTCGGTTCCAGCCTGCTCGTTTCCTTTCCGACAGCGATCTGACGTTGCGTAACGTCATTTGTACGGACTCTTTTAGCGGCATTGATTTCAGATGCCAACGCCGTCTGTGCTGTAGCGCCGATTGGCCGAAACTGTAAATTTGTATGATTATTTAAATCTTTGACGAACACGCGCCACAAACGTACTATCTCTTTGGCAAAGTCCACCGGCATCTCATTCAACGGTTTTGTTTGCGCGTGGTCGTCATATGTTGGTAACTGTGTAACCGGCCGCCTATCTTGCGCGTGCATCATATGACTATGAATCGGCCTTCTCCAGTGAAATTGATACAAGATCGATCCGATAAGCGGTACCTTGTACACAGTGCAATTGTAGTAATTCGCGTTTTGTCCTACGCTTGTATTCTGCGTGAAGAATTCA", start=2266659, alternate_bases=[ "CACGCGAGTTGTAGATGATCGTTGAGTGGTCTTGCTTGGACTTCCATTTCATCTTTTCGACGCGCCAGGTCTCGCGGTCCTCCGGATCTGCGCCCGGTTTGAGTTGCACATCAAGGGGATACGGCTTGACCGACTCGTAGCCGACATGTAAGTCGGCTAGTTTCCGGCCGGCGCTGGCGAGCTGGTCGAAGCGTTCGCGGGTCTCCGGTGTTGGGATGTGCGGGAGCATCTTCTTGAGGTCAGCGGCGTATTTTGTGCGGTAGGCGGGGTCATGCAGCAGGCCGTAGACGTAGTAGAAGATGTCGTCTTTGGTGACTTGGTCGCCGATCGTGTCGCGGTAGAGCTTGAGGATGACGCCGGTGATGTTGTCGACGCGGCGGTAGCCGTGGTCGTCTACTTCGGCGTTGGTGGTGGACTCGAAATCGAGTTCGCCGTCACGTGGTTCGGTCTTCTCGTAGGTCCAGCGCGGGAAGAATTGACCGTTGCTTGAGCCCCAGAATGCGAGATCGGGGATAGCGTTTAGCATCAGACACGAGAAGGGCTTGTCTGAGCCCATGCCAACCACGTAGTAACCGACATTCCCGTGCTCCGGCGTCGGAAACATCGACGGAAGCTGGTAGGTACAGTTGTTGAGCTGCTGGTTGGGGTCGAGGTAGGCGTGCTCTTTCGTAAATGGTCGGTACGTGCCGAGCCGCATTCCCGCGGGAGCGAATTCGATGCGAATGCCTTGTGCCACTTGCCGCTTGTTGATGCGGTCCCAGCTGAACTTGGCCGAGTCCACGGTAATGAGGGCGTCAACCGGCGGGGTCTTGGCGTCCCTTCCGCGGATCTCGTTGATCCGGTCGACCTCCGAGTTGTAGAAGTCGATCGTGCGTCCGATGTTGGCCTCGAGCGCACCACGTGAAAAGTTGTAACACCACGCATCCCGGCTGGTCTTCAAGCCCGCGGAATAGTTCGCGAAGACACGTGTCACGTCAAGAGCAGCCTTCTTGTCGCCGATAACCGGCCACGCGCTGAACGCGTCGTCGCGTTGGTTGACCCAGTCACCGTGCAAGTTGGGTGTGACTGTCTGCCATTCCACCGTGTCGAGGTAGCCGTCGCCGACGATCCGCAACTTCTCCTCGCGACTCAGGTAATCGCCGATGTCGCGGTAAAGGACATCGCATGGCCCGCTGTGCTTCGGATCCTTGATGCCAAGGAAGATCGCCACCGTGTTGCGACTCCCCCCGCCAAAGACCTTGCCGCCTTCCTGGCGTGAGAGTTCCCCAGCTGTGCGCTGGTTCCCCCGCAGGTTGTACACATATACCGCCGCGTAGTCGTCGGCGAGCGACAACCGCATGCCGTCTGCCGTGTTGCCGTCTATGTACCCACCATTGGAGACGAATCCGACAACACCGTTGTCACCAATGCGGTCGGTCGCCCACCGGAACGCGCGAATATACGAGTCGTACAGGCTGTTCTTCAGCTGCGCCGTCGACCGCTTCGCGTACGTCTGCTCAATCCGCCCGTCCAACGTCGGATACTTCACGTTGGCGTTCAGGTCGTTCGCGCTGCTCTGCCCCACCGAGTACGGCGGATTCCCGATGATCACGCTGATCGGCGTCGCCAGCTGTCGCAAGATCCGAGCGTTGTTGTACGGGAACATGATCGCGTCCATCGAGTCCCCGGCTTCGGAAATCTGGAACGTGTCGGCCAGCGCCATCCCGGGGAACGGCTCATAGGCGTCGGCGTCGGCGGTCTTGCCCGCCAAAGCATGGTAGGTCGACTCGATGTTCACCGCGGCGATGTAGTACGCCAGCAGCATGATCTCGTTGGCGTGCAGCTCTTGCGAGTACTTTCGGGTGAGGTCGGCGGCCGTGATCAGGTCGGACTGCAGCAGCCGGGTAATGAATGTGCCCGTCCCGGCGAAGCCGTCCAGAATATGCACGCCCTCGTCGGTCAGCCCGCGCCCGAAATGCTTGCGCGACACGAAATCAGCCGCCCGCACAATGAAGTCCACGACCTCGACCGGCGTGTACACGATCCCCAGCGCCTCGGCCTGCTTCTTGAAGCCGATGCGGAAGAACTTCTCGTACAGCTCGGCGATCACCTGCTGCTTGCCCTCGGCGCTGGTGACCTCGCCGGCGCGCCGTCGCACCGATTCGTAAAAGCCTTCCAACCGAGCGGTTTCGGCCTCCAGGCCGGCACCCCCGACGGTGTCGACCATCTTCTGCATGGCCCGCGACACCGGGTTGTGCGACGCGAAGTCATGCCCGGCGAACAGCGCGTCGAACACCGGCTTGGTGATCAGGTGCTGCGAGAGCATGCTGATCGCGTCATCGGGGGTGATCGAGTCATTGAGGTTATCGCGCAGCCCGGCCAGGAACTGCTCGAACGCCGCCGCCGCCGTAGCGTCGGCGCCGCCGAGCAGGGCGTGGATACGGGTGGTCAGCGTCGCGGCGATGTCGGCGACATCGGCGGCCCACTGCTCCCAATAGGTCCGGGTGCCAACCTTGTCGACGATGCGCGCGTAGATCGCTTCCTGCCACTGCGACAACGAGAACATCGCCAACTGCTCCGCGACGGCGGGTCCCGCCTCGTCGGAGGTCGGCCCGATGTGACCGCCCAACAGCTTGTCGCTGCCTTCACCGGTCTTCGTCGGCTTCACGTTCAGCGCAATGCTGTTCACCATCGCGTCGAAGCGCTCGTCGTGCGACCGCAACGCGTTGAGGACCTGCCACACCACCTTGAACCGTTTGTTGTCGGCCAACGCGGCAGACGGCTCGACACCCTCGGGCACCGCCACCGGCAAGATGACGTACCCGTAGTCCTTGCCGGGCGACTTGCGCATCACCCGACCGACCGACTGCACCACGTCGACGATGGAATTGCGCGGATTCAGGAACAGCACCGCGTCCAGCGCGGGCACGTCGACCCCTTCGGAGAGGCAGCGGGCGTTGGACAGGATGCGGCATTCATCCTCGGCGACCACGCCTTTGAGCCAGGCCAGCTGTTCGTTGCGGACCAGCGCGTTGAACGTCCCGTCCACGTGGCGCACCG" ]) panel = self.pg.create(v) assert "TGGTGACGCGGGAGTAGAACGATCGCCAAGTGGTCGGTCTTGGCTGCCCACTTCATCCCCGGCGCCACCGGCAGGTCTCGCGGTCATCTCGACCAACGGAGGGCCGTCGGTGGTTCGTATCCGGCCAAGAACGGCGAGAACGGTTTGTGCCTCTATGCCAGGGTGAATGTCTCATCTCCCAGGCGGACGGTGATATCCAGTTCTCCGCCAAGAGCGGACACGTATTTGCGCAGTGTGTTGACCTGTGCGGAGCCGATGTCGCCGTTCTCGATGCTGGATACCCGGCTCTGCCGGATGTGCGCCAGCGCAGCCACCTGGACCTGGGTGAGTGACTGAGCCGCGCGCAGCTCCCGGAGCCGGAATGCCCGCACTTCATCGCGCATTCGTGCCTTGTGCCGGTCCACCGCCTCCCGGTTAACGGGACGTACGGCGTCCATGTCCCGTAGTGTCATCGCCATCGTGCCACTTACCCTTTCTTGCGCTTGCGCCTCTTTGGCTTCGTGTCCTCGAACTGTGCGAGATGTTCGGCAAACATCTCATCGGCCGCTTTGATCTTCTCGTCGTACCACTGGGTCCACCGCCCGGCCTTGTTACCGGCGGCCAGCATGATCGCCTGCCGCGCCGGGTCGAAGGCGAACAGAATGCGGACCTCGGACCGCCCTTGTGATCCTGGACGCAGCTCCTTCATGTTCTTGTGGCGCGACCCACGCACCGTGTCCACCAGAGGACAGCCAAGTGCGGGGCCCTCTTCCTCGAGAACCTCGATAGCTGCGAACACCAATTCGTAGGTCTCTCGGTCCAAGCCGTTGAGCCAGGCGGAGATGCGCTCCACATCCGCCGTCCACCCCACAGAGTCGCAGAGTAGCGCGATACGCGATATCACACAAGGGTGATATTCCTCCGGGTAAGAGCAGCGGGCGACGGGGCTACCGTCGAGGAAATGCCGGCAGGCGAGGACGGACTCTGCGCACCCGGGCCGTTGAAACAGTAGCCTGTGCCAGGCCGAGAATTCATCCCCACGTATGAGGCAGTACAGTGCGCCGCCGTGCGCGTTCTCCCATGGAACGTTCACGGGCTCCCGTGGATGACAGGCGTTTCATGAACGCCAGCGCCGCCGCAACCCGACCGAAAGCGGTTGACCCCAAGGAGAGCTGGAAGTCGAGGCCACCACCTTCGCCGCGGAGTTGCTCATGCCCGAGAGCGAGACTCGTCCCGAAATACGCCGGCTCGATTTCGGCAAGTTGCTCGAACTGAAGCGGGAATGGGCGTCGACCCGCTCGACCAGCCCCAGCCGGGTGACCAGCCCCAGCCGGGTGACCAGCCGATGCACCGCGGCGATCCCACCGAAGCCGGTGGCATCGATGTTGGCGCCGACCTCGTAGCGCACCGCGCCCGAACCCAGCATCGGCCTGGGCTGCGCCGCCCAGCGTCCAGCCCGCGCGTGCCGCGCCGCCACCCTGCGCCCTCGGCGTGTGATGTTTCGCCGACTCTGTTCATGGGTTATCTTCTTCACCACAAAGGCCTTTCCTGCTGGGCTGTGTTGAGGTCGCAAACCCAGCCAGGGTAAGGCCTTTGGCCTCTCCTACCCGGCCGACACGCTTACTGAAGGCCTAGTCTAGGCAGGCCATTCAATCTGCGGAATCGAAAAATTCGGTTCCAGCCTGCTCGTTTCCTTTCCGACAGCGATCTGACGTTGCGTAACGTCATTTGTACGGACTCTTTTAGCGGCATTGATTTCAGATGCCAACGCCGTCTGTGCTGTAGCGCCGATTGGCCGAAACTGTAAATTTGTATGATTATTTAAATCTTTGACGAACACGCGCCACAAACGTACTATCTCTTTGGCAAAGTCCACCGGCATCTCATTCAACGGTTTTGTTTGCGCGTGGTCGTCATATGTTGGTAACTGTGTAACCGGCCGCCTATCTTGCGCGTGCATCATATGACTATGAATCGGCCTTCTCCAGTGAAATTGATACAAGATCGATCCGATAAGCGGTACCTTGTACACAGTGCAATTGTAGTAATTCGCGTTTTGTCCTACGCTTGTATTCTGCGTGAAGAATTCAAACACG" in panel.refs assert panel.alts == [ "GACCGCCGAGTGCGGCTGGATTGGATTTCACAAGGATGCCAATATCCGGCGCAACGCCGTCGAGCGACGGACGGTGCTCGACACGGGAGCCCGGCTATTCTGTGTGCCGCGGGCCGACATCCTGGCAGAGCAAGTCGCGGCACGGTATATTGCGTCCCTTGCGGCGATTGCCCGTGCCGCACGATTTCCGGGACCATTCATCTACACGGTTCACCCGAGCAAGATCGTTCGCGTGCTCTAGTCGTTCATCGCTCCGTTAACCGCCGGCGAGGCCGTCGACGATCTTCATGGTCTCGACGCTGACGGTGGTCACCTTCTTGATGAGGTCGACGATGTAGGTGGGATCGTCGTGTTCGTCGCACCAGTCGTTGGGGTCGTTGACGATGCCCGACGCTTTGTCGGTGGTGACGCGGTAGCGCTCGATGATCCAGCCGAGCGCCGAGCGGGAGCGAGCAGGTAGCGCTCGGCCTCGTCGGGAATGCCGGCGATGGTGACACGCGAGTTGTAGATGATCGTTGAGTGGTCTTGCTTGGACTTCCATTTCATCTTTTCGACGCGCCAGGTCTCGCGGTCCTCCGGATCTGCGCCCGGTTTGAGTTGCACATCAAGGGGATACGGCTTGACCGACTCGTAGCCGACATGTAAGTCGGCTAGTTTCCGGCCGGCGCTGGCGAGCTGGTCGAAGCGTTCGCGGGTCTCCGGTGTTGGGATGTGCGGGAGCATCTTCTTGAGGTCAGCGGCGTATTTTGTGCGGTAGGCGGGGTCATGCAGCAGGCCGTAGACGTAGTAGAAGATGTCGTCTTTGGTGACTTGGTCGCCGATCGTGTCGCGGTAGAGCTTGAGGATGACGCCGGTGATGTTGTCGACGCGGCGGTAGCCGTGGTCGTCTACTTCGGCGTTGGTGGTGGACTCGAAATCGAGTTCGCCGTCACGTGGTTCGGTCTTCTCGTAGGTCCAGCGCGGGAAGAATTGACCGTTGCTTGAGCCCCAGAATGCGAGATCGGGGATAGCGTTTAGCATCAGACACGAGAAGGGCTTGTCTGAGCCCATGCCAACCACGTAGTAACCGACATTCCCGTGCTCCGGCGTCGGAAACATCGACGGAAGCTGGTAGGTACAGTTGTTGAGCTGCTGGTTGGGGTCGAGGTAGGCGTGCTCTTTCGTAAATGGTCGGTACGTGCCGAGCCGCATTCCCGCGGGAGCGAATTCGATGCGAATGCCTTGTGCCACTTGCCGCTTGTTGATGCGGTCCCAGCTGAACTTGGCCGAGTCCACGGTAATGAGGGCGTCAACCGGCGGGGTCTTGGCGTCCCTTCCGCGGATCTCGTTGATCCGGTCGACCTCCGAGTTGTAGAAGTCGATCGTGCGTCCGATGTTGGCCTCGAGCGCACCACGTGAAAAGTTGTAACACCACGCATCCCGGCTGGTCTTCAAGCCCGCGGAATAGTTCGCGAAGACACGTGTCACGTCAAGAGCAGCCTTCTTGTCGCCGATAACCGGCCACGCGCTGAACGCGTCGTCGCGTTGGTTGACCCAGTCACCGTGCAAGTTGGGTGTGACTGTCTGCCATTCCACCGTGTCGAGGTAGCCGTCGCCGACGATCCGCAACTTCTCCTCGCGACTCAGGTAATCGCCGATGTCGCGGTAAAGGACATCGCATGGCCCGCTGTGCTTCGGATCCTTGATGCCAAGGAAGATCGCCACCGTGTTGCGACTCCCCCCGCCAAAGACCTTGCCGCCTTCCTGGCGTGAGAGTTCCCCAGCTGTGCGCTGGTTCCCCCGCAGGTTGTACACATATACCGCCGCGTAGTCGTCGGCGAGCGACAACCGCATGCCGTCTGCCGTGTTGCCGTCTATGTACCCACCATTGGAGACGAATCCGACAACACCGTTGTCACCAATGCGGTCGGTCGCCCACCGGAACGCGCGAATATACGAGTCGTACAGGCTGTTCTTCAGCTGCGCCGTCGACCGCTTCGCGTACGTCTGCTCAATCCGCCCGTCCAACGTCGGATACTTCACGTTGGCGTTCAGGTCGTTCGCGCTGCTCTGCCCCACCGAGTACGGCGGATTCCCGATGATCACGCTGATCGGCGTCGCCAGCTGTCGCAAGATCCGAGCGTTGTTGTACGGGAACATGATCGCGTCCATCGAGTCCCCGGCTTCGGAAATCTGGAACGTGTCGGCCAGCGCCATCCCGGGGAACGGCTCATAGGCGTCGGCGTCGGCGGTCTTGCCCGCCAAAGCATGGTAGGTCGACTCGATGTTCACCGCGGCGATGTAGTACGCCAGCAGCATGATCTCGTTGGCGTGCAGCTCTTGCGAGTACTTTCGGGTGAGGTCGGCGGCCGTGATCAGGTCGGACTGCAGCAGCCGGGTAATGAATGTGCCCGTCCCGGCGAAGCCGTCCAGAATATGCACGCCCTCGTCGGTCAGCCCGCGCCCGAAATGCTTGCGCGACACGAAATCAGCCGCCCGCACAATGAAGTCCACGACCTCGACCGGCGTGTACACGATCCCCAGCGCCTCGGCCTGCTTCTTGAAGCCGATGCGGAAGAACTTCTCGTACAGCTCGGCGATCACCTGCTGCTTGCCCTCGGCGCTGGTGACCTCGCCGGCGCGCCGTCGCACCGATTCGTAAAAGCCTTCCAACCGAGCGGTTTCGGCCTCCAGGCCGGCACCCCCGACGGTGTCGACCATCTTCTGCATGGCCCGCGACACCGGGTTGTGCGACGCGAAGTCATGCCCGGCGAACAGCGCGTCGAACACCGGCTTGGTGATCAGGTGCTGCGAGAGCATGCTGATCGCGTCATCGGGGGTGATCGAGTCATTGAGGTTATCGCGCAGCCCGGCCAGGAACTGCTCGAACGCCGCCGCCGCCGTAGCGTCGGCGCCGCCGAGCAGGGCGTGGATACGGGTGGTCAGCGTCGCGGCGATGTCGGCGACATCGGCGGCCCACTGCTCCCAATAGGTCCGGGTGCCAACCTTGTCGACGATGCGCGCGTAGATCGCTTCCTGCCACTGCGACAACGAGAACATCGCCAACTGCTCCGCGACGGCGGGTCCCGCCTCGTCGGAGGTCGGCCCGATGTGACCGCCCAACAGCTTGTCGCTGCCTTCACCGGTCTTCGTCGGCTTCACGTTCAGCGCAATGCTGTTCACCATCGCGTCGAAGCGCTCGTCGTGCGACCGCAACGCGTTGAGGACCTGCCACACCACCTTGAACCGTTTGTTGTCGGCCAACGCGGCAGACGGCTCGACACCCTCGGGCACCGCCACCGGCAAGATGACGTACCCGTAGTCCTTGCCGGGCGACTTGCGCATCACCCGACCGACCGACTGCACCACGTCGACGATGGAATTGCGCGGATTCAGGAACAGCACCGCGTCCAGCGCGGGCACGTCGACCCCTTCGGAGAGGCAGCGGGCGTTGGACAGGATGCGGCATTCATCCTCGGCGACCACGCCTTTGAGCCAGGCCAGCTGTTCGTTGCGGACCAGCGCGTTGAACGTCCCGTCCACGTGGCGCACCGAACACGCCAGGCCCGGGCCGTCGTCAACCAATTCGCGGTATGCCTCAACCACTTTCGGGAACAGCTCGGCAACCTGCTTGGACGTCTTGATGTCCTTGGCGAACGCCACCGCCCGACGCATCGGCGGCTCACCGGCGACAATGCCGGTACCGGACCGCTTGGCCAGGCCATTCCAGCAGCCGACGATCTTGGAGGCGTCGTCGAGCATCAGCTCGCCGGAAACCCCGGAGAGTTCCTGCTGCAACCGGGGCGCGATCACGCCCTGATCGACGGTGAGCACCATCACCTTGTAGTCGGTGAGCAGCCCGCGCTCCACCGCCTCGCCGAACGACAGCCGGTGAAACTCCGGCCCGAACGTCAGCTCGTCGTCCATCGACACCAACTCGGCGGAGTGCTGGTCGGCCCTGTCCTTGATGCTCTCGGTGAAAATCCTTGGCGTGGCGGTCATATACAGCCGCCGGGCCGCCTTCAGATACTGACCGTCGTGCACCCGC" ]
def run(parser, args): DB = connect('atlas-%s' % (args.db_name)) if DB is not None: try: Variant.objects() logging.info("Connected to atlas-%s" % (args.db_name)) except (ServerSelectionTimeoutError, ConnectionError): DB = None logging.warning( "Could not connect to database. Continuing without using genetic backgrounds" ) mutations = [] reference = os.path.basename(args.reference_filepath).split('.fa')[0] if args.vcf: run_make_probes_from_vcf_file(args) elif args.genbank: aa2dna = GeneAminoAcidChangeToDNAVariants(args.reference_filepath, args.genbank) if args.text_file: with open(args.text_file, 'r') as infile: reader = csv.reader(infile, delimiter="\t") for row in reader: gene, mutation, alphabet = row if alphabet == "DNA": protein_coding_var = False else: protein_coding_var = True for var_name in aa2dna.get_variant_names( gene, mutation, protein_coding_var): mutations.append( Mutation(reference=reference, var_name=var_name, gene=aa2dna.get_gene(gene), mut=mutation)) else: for variant in args.variants: gene, mutation = variant.split("_") for var_name in aa2dna.get_variant_names(gene, mutation): mutations.append( Mutation(reference=reference, var_name=var_name, gene=gene, mut=mutation)) else: if args.text_file: with open(args.text_file, 'r') as infile: reader = csv.reader(infile, delimiter="\t") for row in reader: gene_name, pos, ref, alt, alphabet = row if gene_name == "ref": mutations.append( Mutation(reference=reference, var_name="".join([ref, pos, alt]))) else: mutations.append( Mutation(reference=reference, var_name=row[0])) else: mutations.extend( Mutation(reference=reference, var_name=v) for v in args.variants) al = AlleleGenerator(reference_filepath=args.reference_filepath, kmer=args.kmer) for enum, mut in enumerate(mutations): if enum % 100 == 0: logger.info( "%i of %i - %f%%" % (enum, len(mutations), round(100 * enum / len(mutations), 2))) variant_panel = make_variant_probe(al, mut.variant, args.kmer, DB=DB, no_backgrounds=args.no_backgrounds) if variant_panel is not None: for i, ref in enumerate(variant_panel.refs): try: gene_name = mut.gene.name except AttributeError: gene_name = "NA" sys.stdout.write( ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n" % (mut.mut, mut.variant.var_name, len(variant_panel.alts), mut.reference, i, gene_name, mut.mut)) sys.stdout.write("%s\n" % ref) for i, a in enumerate(variant_panel.alts): sys.stdout.write( ">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" % (mut.mut, mut.variant.var_name, i, gene_name, mut.mut)) sys.stdout.write("%s\n" % a) else: logging.warning("All variants failed for %s_%s - %s" % (mut.gene, mut.mut, mut.variant))
class TestINDELandSNPSAlleleGenerator(): def setUp(self): DB.drop_database('atlas-test') self.pg = AlleleGenerator( reference_filepath="atlasvar/data/BX571856.1.fasta") self.pg2 = AlleleGenerator( reference_filepath="atlasvar/data/NC_000962.2.fasta") self.reference_set = ReferenceSet().create_and_save(name="ref_set") self.variant_set = VariantSet.create_and_save( name="this_vcf_file", reference_set=self.reference_set) self.variant_sets = [self.variant_set] self.reference = Reference().create_and_save( name="ref", md5checksum="sre", reference_sets=[ self.reference_set]) def test_ins_with_SNP_context(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=31, alternate_bases=["ATTT"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["T"]) panel = self.pg.create(v, context=[v2]) assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" in panel.refs assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTGAT", "CGATTAAAGATAGAAATACACGATGCGAGCATTTTTCAAATTTCATAACATCACCATGAGTTTGAT"]) def test_del_with_SNP_context1(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="AA", start=31, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=33, alternate_bases=["A"]) panel = self.pg.create(v, context=[v2]) assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" in panel.refs assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTGATC", "CGATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGATC"]) def test_del_with_SNP_context2(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="AA", start=31, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["T"]) panel = self.pg.create(v, context=[v2]) assert self.pg._remove_overlapping_contexts(v, [v2]) == [] assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" in panel.refs assert sorted( panel.alts) == sorted( ["CGATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTGATC"]) def test_del_with_ins_context1(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="AAT", start=31, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=4, alternate_bases=["TTTT"]) panel = self.pg.create(v, context=[v2]) assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2] assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" in panel.refs assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTGATCC", "CGATTTTTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTGAT"]) def test_del_with_ins_context2(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="ATC", start=32, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=1, alternate_bases=["CTTT"]) panel = self.pg.create(v, context=[v2]) assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2] assert self.pg._remove_contexts_not_within_k(v, [v2]) == [] assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert sorted( panel.alts) == sorted( ["CGATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC"]) def test_del_with_ins_context3(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="ATC", start=32, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=5, alternate_bases=["TT"]) panel = self.pg.create(v, context=[v2]) assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2] assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC", "GATTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC"]) def test_del_with_ins_context4(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="ATC", start=32, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=5, alternate_bases=["TT"]) v3 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=5, alternate_bases=["TG"]) panel = self.pg.create(v, context=[v2, v3]) assert self.pg._remove_overlapping_contexts(v, [v2, v3]) == [v2, v3] assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC", "GATTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC", "GATTGAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC"]) def test_del_with_ins_context5(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="ATC", start=32, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=5, alternate_bases=["TT"]) v3 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=6, alternate_bases=["AG"]) panel = self.pg.create(v, context=[v2, v3]) assert self.pg._remove_overlapping_contexts(v, [v2, v3]) == [v2, v3] assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC", "GATTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC", "GATTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC", "GATTTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATC"]) def test_del_with_ins_context_where_base_is_deleted(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="ATC", start=32, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=33, alternate_bases=["C"]) panel = self.pg.create(v, context=[v2]) assert self.pg._remove_overlapping_contexts(v, [v2]) == [] assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert sorted( panel.alts) == sorted( ["CGATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC"]) def test_del_with_ins_context_where_base_is_deleted2(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="ATC", start=32, alternate_bases=["A"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="TAAA", start=5, alternate_bases=["T"]) v3 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=7, alternate_bases=["AG"]) panel = self.pg.create(v, context=[v2, v3]) assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC", "CGATTGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCCAAA", "GATTAAGAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC"]) panel = self.pg.create(v, context=[v3, v2]) assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert sorted( panel.alts) == sorted( [ "CGATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC", "CGATTGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCCAAA", "GATTAAGAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC"]) def test_snp_with_replace_context(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="G", start=2338961, alternate_bases=["A"]) v1 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="GGATG", start=2338990, alternate_bases=["CGATA"]) panel = self.pg2.create(v, context=[v1]) assert "CGACTAGCCACCATCGCGCATCAGTGCGAGGTCAAAAGCGACCAAAGCGAGCAAGTCGCGGAT" in panel.refs assert panel.alts == \ ["CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCGGAT", "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCCGAT"] def test_indel_snp_indel_context(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="TCGCGTGGC", start=4021459, alternate_bases=["GCGAGCAGA"]) v1 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=4021455, alternate_bases=["ATCTAGCCGCAAG"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="T", start=4021489, alternate_bases=["G"]) panel = self.pg2.create(v) # , context = [v1, v2]) assert "ATCATGCGATTCTGCGTCTGCTCGCGAGGCTCGCGTGGCCGCCGGCGCTGGCGGGCGATCTCG" in panel.refs panel = self.pg2.create(v, context=[v1, v2]) assert sorted( panel.alts) == sorted( [ "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCTCG", "ATCATGCGATTCTGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCTCG", "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCGCG", "ATCATGCGATTCTGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCGCG"]) def test_complex_context(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="ATTT", start=1503643, alternate_bases=["A"]) v1 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="CCT", start=1503615, alternate_bases=["C"]) v2 = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=1503655, alternate_bases=["ATGCCGCCGCC"]) panel = self.pg2.create(v, context=[v1, v2]) assert "ATCCTGGAGCCCACCAGCGGAAACACCGGCATTTCGCTGGCGATGGCGGCCCGGTTGAAGGGG" in panel.refs assert panel.alts == [ "CATCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGGTA", "CCATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGGTAC", "ATCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGGTTGAAGGGG", "ATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGGTTGAAGGGG"]
class TestINDELAlleleGenerator(): def setUp(self): DB.drop_database('atlas-test') self.pg = AlleleGenerator( reference_filepath="atlasvar/data/BX571856.1.fasta") self.pg2 = AlleleGenerator( reference_filepath="atlasvar/data/NC_000962.2.fasta") self.reference_set = ReferenceSet().create_and_save(name="ref_set") self.variant_set = VariantSet.create_and_save( name="this_vcf_file", reference_set=self.reference_set) self.variant_sets = [self.variant_set] self.reference = Reference().create_and_save( name="ref", md5checksum="sre", reference_sets=[self.reference_set]) def test_simple_deletion1(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="AA", start=31, alternate_bases=["A"]) assert v.is_indel assert v.is_deletion panel = self.pg.create(v) assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" in panel.refs assert self.pg._calculate_length_delta_from_indels(v, []) == 1 assert panel.alts == [ "CGATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTGATC" ] def test_simple_deletion2(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="AT", start=32, alternate_bases=["A"]) panel = self.pg.create(v) assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert panel.alts == [ "GATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGATCC" ] def test_simple_deletion3(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="AT", start=2902618, alternate_bases=["T"]) panel = self.pg.create(v) assert "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTAT" in panel.refs assert panel.alts == [ "ATAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTT" ] def test_simple_deletion4(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="ATC", start=32, alternate_bases=["A"]) panel = self.pg.create(v) assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert panel.alts == [ "CGATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATCC" ] def test_simple_insertion1(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=1, alternate_bases=["TTTC"]) panel = self.pg.create(v) assert v.is_indel assert v.is_insertion assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" in panel.refs assert panel.alts == [ "TTTCGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" ] def test_simple_insertion2(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="C", start=1, alternate_bases=["CTTT"]) panel = self.pg.create(v) assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" in panel.refs assert panel.alts == [ "CTTTGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" ] def test_simple_insertion3(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=31, alternate_bases=["ATTT"]) panel = self.pg.create(v) assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT" in panel.refs assert panel.alts == [ "CGATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTGAT" ] def test_simple_insertion4(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=32, alternate_bases=["AGGGG"]) panel = self.pg.create(v) assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGATC" in panel.refs assert panel.alts == [ "GATTAAAGATAGAAATACACGATGCGAGCAAGGGGTCAAATTTCATAACATCACCATGAGTTTGATC" ] def test_simple_insertion5(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=2902618, alternate_bases=["ATGC"]) panel = self.pg.create(v) assert "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTAT" in panel.refs assert panel.alts == [ "TAACAAAATCCTTTTTATAACGCAAGTTCATTTTATACTACTGCTCAATTTTTTTACTTTTATGCT" ] def test_double_insertion(self): v = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="A", start=4021408, alternate_bases=["ACGCTGGCGGGCG"]) v1 = Variant.create(variant_sets=self.variant_sets, reference=self.reference, reference_bases="AGA", start=4021406, alternate_bases=["CGG"]) context = [v1] assert self.pg2._remove_overlapping_contexts(v, [v1]) == [] panel = self.pg2.create(v, context=context) assert "ATCTAGCCGCAAGGGCGCGAGCAGACGCAGAATCGCATGATTTGAGCTCAAATCATGCGATTC" in panel.refs assert panel.alts == [ "ATCTAGCCGCAAGGGCGCGAGCAGACGCAGACGCTGGCGGGCGATCGCATGATTTGAGCTCAAATCATGCGATTC" ] def test_large_insertion(self): v = Variant.create( variant_sets=self.variant_sets, reference=self.reference, reference_bases="CCGCCGGCCCCGCCGTTT", start=1636155, alternate_bases=[ "CTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCG" ]) panel = self.pg2.create(v, context=[]) assert "AGACCTAGCAGGGTGCCGGCGCCGCCCTTGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCATCG" in panel.refs assert panel.alts == [ "GGTTGGATCGCCACCGGCGCCACCGGCGCCGCCCGCGCCACCAGCACCGCCGCTGCCATCTGGGTCCGTCGAGTCGCCGAGGACGCCGGCGCCGCCATTGTCGCCAAATACCGTGAGACCTAGCAGGGTGCCGGCGCCGCCCTTGCTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCGCCATCGCCGATGATGTTTTCCCCGCCCTTGCCGCCAGCCCCAGCGTTCCCGCCGGCTCCGCCACTGGCGCCGGTGCCGCCGGGTGCAACGGCGTTGGCGCCGTTACCGCCGTTGCCGCCTTT" ]