def test_write_vars_plus_flanks_to_fasta(self):
    '''test _write_vars_plus_flanks_to_fasta

    Builds three reference sequences and three VCF records (two on ref1,
    one on ref3), writes them to a temporary FASTA file and checks that
    file is byte-identical to the expected file stored in data_dir.
    '''
    named_seqs = (
        #        12345678901234567890
        ('ref1', 'AGTGGATAGCTAGCTAGAGA'),
        ('ref2', 'AGGAGAGAGAGAGAGAA'),
        ('ref3', 'AGCTTCATAGAGAGGTTTA'),
    )
    ref_seqs = {
        name: pyfastaq.sequences.Fasta(name, bases)
        for name, bases in named_seqs
    }

    lines_by_ref = {
        'ref1': [
            'ref1\t3\tid_1\tT\tC,AG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            'ref1\t10\tid_2\tCT\tA\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        ],
        'ref3': [
            'ref3\t4\tid_3\tT\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        ],
    }
    vcf_records = {
        ref_name: [vcf_record.VcfRecord(line) for line in lines]
        for ref_name, lines in lines_by_ref.items()
    }

    out_fasta = 'tmp.mapping_based_verifier.write_vars_plus_flanks_to_fasta.fa'
    # The final argument (3) is presumably the flank length -- confirm
    # against MappingBasedVerifier.
    write_fasta = mapping_based_verifier.MappingBasedVerifier._write_vars_plus_flanks_to_fasta
    write_fasta(out_fasta, vcf_records, ref_seqs, 3)

    wanted_fasta = os.path.join(data_dir, 'write_vars_plus_flanks_to_fasta.fa')
    # shallow=False forces a comparison of file contents, not just os.stat.
    contents_match = filecmp.cmp(wanted_fasta, out_fasta, shallow=False)
    self.assertTrue(contents_match)
    os.unlink(out_fasta)
def test_load_vcf_files(self):
    '''test _load_vcf_files

    Loads a single VCF file and checks the returned sample name, headers
    and records; then loads two files together and checks the combined
    result, including the record from file 2 that lands between the two
    ref.1 records of file 1.
    '''
    make_record = vcf_record.VcfRecord
    load = vcf_clusterer.VcfClusterer._load_vcf_files

    # --- one input file ---
    first_vcf = os.path.join(data_dir, 'load_vcf_files.1.vcf')
    want_headers = {first_vcf: ['#file1 header1', '#file1 header2']}
    want_records = {
        'ref.1': [
            make_record('ref.1\t5\tid3\tG\tA\tPASS\tSVTYPE=SNP\tGT\t1/1'),
            make_record('ref.1\t10\tid1\tA\tT\tPASS\tSVTYPE=SNP\tGT\t1/1'),
        ],
        'ref.2': [
            make_record('ref.2\t42\tid2\tG\tC\tPASS\tSVTYPE=SNP\tGT\t1/1'),
        ],
    }
    want_sample = 'sample'
    sample, headers, records = load([first_vcf])
    self.assertEqual(want_sample, sample)
    self.assertEqual(want_headers, headers)
    self.assertEqual(want_records, records)

    # --- two input files: extend the expectations in place ---
    second_vcf = os.path.join(data_dir, 'load_vcf_files.2.vcf')
    want_headers[second_vcf] = [
        '#file2 header',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_from_vcf_2',
    ]
    want_records['ref.3'] = [
        make_record('ref.3\t8\tid5\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1'),
    ]
    # Position 8 belongs between the existing position 5 and 10 records.
    want_records['ref.1'].insert(
        1, make_record('ref.1\t8\tid4\tC\tG\tPASS\tSVTYPE=SNP\tGT\t1/1'))
    want_sample = 'sample_from_vcf_2'
    sample, headers, records = load([first_vcf, second_vcf])
    self.assertEqual(want_sample, sample)
    self.assertEqual(want_headers, headers)
    self.assertEqual(want_records, records)
def test_update_vcf_record_using_gramtools_allele_depths_homozygous(self):
    """test update_using_gramtools_allele_depths homozygous

    The input record is updated in place and a second, filtered record is
    returned; both are compared against hand-written expectations.
    """
    make_record = vcf_record.VcfRecord
    input_record = make_record(
        "ref\t4\t.\tT\tTC,G\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0"
    )
    combination_cov = {"1": 1, "2": 80}
    groups = {"1": {0}, "2": {2}, "3": {1, 2}}
    per_base_cov = [[1], [0, 0], [80]]
    want_updated = make_record(
        "ref\t4\t.\tT\tTC,G\t.\t.\t.\tGT:DP:COV:GT_CONF\t2/2:81:1,0,80:87.29"
    )
    depth = 85
    err_rate = 0.001

    filtered = gramtools.update_vcf_record_using_gramtools_allele_depths(
        input_record, combination_cov, per_base_cov, groups, depth, err_rate
    )

    # The record passed in is modified in place.
    self.assertEqual(want_updated, input_record)
    want_filtered = make_record(
        "ref\t4\t.\tT\tG\t.\t.\t.\tGT:DP:COV:GT_CONF\t1/1:81:1,80:87.29")
    self.assertEqual(want_filtered, filtered)
def test_vcf_file_to_dict(self):
    '''test vcf_file_to_dict

    Reads the same VCF content from a plain and a gzipped file and checks
    both give the same header and records; then re-reads the plain file
    with remove_asterisk_alts=True and checks the adjusted expectation.
    '''
    want_header = ['# header1', '# header2']
    vcf_lines = [
        'ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        'ref_42\t12\tid_foo\tC\tG\t42.43\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,53:39.81',
        'ref_43\t42\tid_foo\tT\tG\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.82',
        'ref_43\t43\tid_foo\tT\tG,*\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.83',
        'ref_43\t44\tid_foo\tT\t*\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.84',
    ]
    want_records = {
        'ref_42': [vcf_record.VcfRecord(text) for text in vcf_lines[:2]],
        'ref_43': [vcf_record.VcfRecord(text) for text in vcf_lines[2:]],
    }

    # Plain and gzipped inputs must parse to the same result.
    for basename in ('vcf_file_to_dict.vcf', 'vcf_file_to_dict.vcf.gz'):
        path = os.path.join(data_dir, basename)
        header, records = vcf_file_read.vcf_file_to_dict(path)
        self.assertEqual(want_records, records)
        self.assertEqual(want_header, header)

    # With asterisk removal the last ref_43 record (ALT '*' only) is
    # expected to be absent and the 'G,*' record to lose its '*'.
    want_records['ref_43'].pop()
    want_records['ref_43'][-1].remove_asterisk_alts()
    path = os.path.join(data_dir, 'vcf_file_to_dict.vcf')
    header, records = vcf_file_read.vcf_file_to_dict(
        path, remove_asterisk_alts=True)
    self.assertEqual(want_records, records)
    self.assertEqual(want_header, header)
def test_VcfRecord_constructor(self):
    '''test VcfRecord constructor

    Parses a full VCF line and checks every parsed attribute, then parses
    a line with "." as QUAL and two ALT alleles.
    '''
    vcf_line = 'ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n'
    parsed = vcf_record.VcfRecord(vcf_line)
    # (attribute name, expected value); POS 11 in the line is expected to
    # be stored as 10.
    wanted_attributes = (
        ('CHROM', 'ref_42'),
        ('POS', 10),
        ('ID', 'id_foo'),
        ('REF', 'A'),
        ('ALT', ['G']),
        ('QUAL', 42.42),
        ('FILTER', 'PASS'),
        ('INFO', {'KMER': '31', 'SVLEN': '0', 'SVTYPE': 'SNP'}),
        ('FORMAT', {'GT': '1/1', 'COV': '0,52', 'GT_CONF': '39.80'}),
    )
    for attribute, wanted in wanted_attributes:
        self.assertEqual(getattr(parsed, attribute), wanted)

    vcf_line = 'ref_42\t11\tid_foo\tA\tG,TC\t.\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n'
    parsed = vcf_record.VcfRecord(vcf_line)
    self.assertEqual(parsed.QUAL, None)
    self.assertEqual(parsed.ALT, ['G', 'TC'])
def test_make_all_variants_intervals(self):
    """test _make_all_variants_intervals

    Combines per-sequence variant records with pre-existing "big variant"
    intervals and checks the resulting interval lists per sequence.
    """
    Interval = pyfastaq.intervals.Interval
    make_record = vcf_record.VcfRecord

    small_variants = {
        "seq.1": [
            make_record("seq.1\t15\t.\tAGTTGTC\tA\t.\t.\tSVTYPE=DEL"),
            make_record("seq.1\t100\t.\tT\tA\t.\t.\tSVTYPE=SNP"),
        ],
        "seq.2": [make_record("seq.1\t43\t.\tA\tACGTA\t.\t.\tSVTYPE=INS")],
    }
    big_intervals = {
        "seq.1": [Interval(9, 19), Interval(50, 60)],
        "seq.3": [Interval(42, 45)],
    }

    merged = dnadiff.Dnadiff._make_all_variants_intervals(
        small_variants, big_intervals)

    wanted = {
        "seq.1": [Interval(9, 20), Interval(50, 60), Interval(99, 99)],
        "seq.2": [Interval(42, 42)],
        "seq.3": [Interval(42, 45)],
    }
    self.assertEqual(wanted, merged)
def test_update_vcf_record_using_gramtools_allele_depths_heterozygous(self):
    """test update_using_gramtools_allele_depths heterozygous

    The input record is updated in place and a second, filtered record is
    returned; both are compared against hand-written expectations.
    """
    make_record = vcf_record.VcfRecord
    input_record = make_record(
        "ref\t4\t.\tT\tA,G,TC\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0"
    )
    combination_cov = {"1": 9, "2": 7, "3": 1}
    groups = {"1": {0}, "2": {2}, "3": {2, 3}}
    per_base_cov = [[0], [9], [7], [1, 0]]
    want_updated = make_record(
        "ref\t4\t.\tT\tA,G,TC\t.\t.\t.\tGT:DP:COV:GT_CONF\t0/2:17:9,0,7,0:54.46"
    )
    depth = 15
    err_rate = 0.001

    filtered = gramtools.update_vcf_record_using_gramtools_allele_depths(
        input_record, combination_cov, per_base_cov, groups, depth, err_rate
    )

    # The record passed in is modified in place.
    self.assertEqual(want_updated, input_record)
    want_filtered = make_record(
        "ref\t4\t.\tT\tG\t.\t.\t.\tGT:DP:COV:GT_CONF\t0/1:17:9,7:54.46")
    self.assertEqual(want_filtered, filtered)
def test_write_vars_plus_flanks_to_fasta(self):
    """test _write_vars_plus_flanks_to_fasta

    Builds three reference sequences and three VCF records (two on ref1,
    one on ref3), writes them to a temporary FASTA file and checks that
    file is byte-identical to the expected file stored in data_dir.
    """
    named_seqs = (
        #        12345678901234567890
        ("ref1", "AGTGGATAGCTAGCTAGAGA"),
        ("ref2", "AGGAGAGAGAGAGAGAA"),
        ("ref3", "AGCTTCATAGAGAGGTTTA"),
    )
    ref_seqs = {
        name: pyfastaq.sequences.Fasta(name, bases)
        for name, bases in named_seqs
    }

    lines_by_ref = {
        "ref1": [
            "ref1\t3\tid_1\tT\tC,AG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
            "ref1\t10\tid_2\tCT\tA\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
        ],
        "ref3": [
            "ref3\t4\tid_3\tT\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
        ],
    }
    vcf_records = {
        ref_name: [vcf_record.VcfRecord(line) for line in lines]
        for ref_name, lines in lines_by_ref.items()
    }

    out_fasta = "tmp.mapping_based_verifier.write_vars_plus_flanks_to_fasta.fa"
    # The final argument (3) is presumably the flank length -- confirm
    # against MappingBasedVerifier.
    write_fasta = mapping_based_verifier.MappingBasedVerifier._write_vars_plus_flanks_to_fasta
    write_fasta(out_fasta, vcf_records, ref_seqs, 3)

    wanted_fasta = os.path.join(data_dir, "write_vars_plus_flanks_to_fasta.fa")
    # shallow=False forces a comparison of file contents, not just os.stat.
    contents_match = filecmp.cmp(wanted_fasta, out_fasta, shallow=False)
    self.assertTrue(contents_match)
    os.unlink(out_fasta)
def test_VcfRecord_constructor(self):
    """test VcfRecord constructor

    Parses a full VCF line and checks every parsed attribute, then checks
    QUAL "." / multiple ALTs / multiple filters, and finally that a "."
    FILTER column gives an empty set.
    """
    vcf_line = "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n"
    parsed = vcf_record.VcfRecord(vcf_line)
    # (attribute name, expected value); POS 11 in the line is expected to
    # be stored as 10, and FILTER as a set of filter names.
    wanted_attributes = (
        ("CHROM", "ref_42"),
        ("POS", 10),
        ("ID", "id_foo"),
        ("REF", "A"),
        ("ALT", ["G"]),
        ("QUAL", 42.42),
        ("FILTER", {"PASS"}),
        ("INFO", {"KMER": "31", "SVLEN": "0", "SVTYPE": "SNP"}),
        ("FORMAT", {"GT": "1/1", "COV": "0,52", "GT_CONF": "39.80"}),
    )
    for attribute, wanted in wanted_attributes:
        self.assertEqual(getattr(parsed, attribute), wanted)

    vcf_line = "ref_42\t11\tid_foo\tA\tG,TC\t.\tFilter1;Filter2\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n"
    parsed = vcf_record.VcfRecord(vcf_line)
    self.assertEqual(parsed.QUAL, None)
    self.assertEqual(parsed.ALT, ["G", "TC"])
    self.assertEqual(parsed.FILTER, {"Filter1", "Filter2"})

    vcf_line = "ref_42\t11\tid_foo\tA\tG,TC\t.\t.\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n"
    parsed = vcf_record.VcfRecord(vcf_line)
    self.assertEqual(parsed.FILTER, set())
def test_add_vcf_record_and_len(self):
    '''test add_vcf_record and len

    Adds five records to a cluster created with
    max_distance_between_variants=3. The fourth is expected to be
    rejected until the limit is raised to 5.
    '''
    record_lines = (
        'ref_42\t11\tid_1\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        'ref_42\t12\tid_2\tC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        'ref_42\t15\tid_2\tC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        'ref_42\t19\tid_2\tCCCCC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        'ref_42\t23\tid_2\tC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
    )
    records = [vcf_record.VcfRecord(text) for text in record_lines]

    cluster = vcf_record_cluster.VcfRecordCluster(
        max_distance_between_variants=3)
    self.assertEqual(0, len(cluster))

    # The first three records are accepted one after another.
    for size_after, rec in enumerate(records[:3], start=1):
        self.assertTrue(cluster.add_vcf_record(rec))
        self.assertEqual(size_after, len(cluster))

    # The fourth is refused with the limit at 3 ...
    self.assertFalse(cluster.add_vcf_record(records[3]))
    self.assertEqual(3, len(cluster))

    # ... and accepted, together with the fifth, once the limit is 5.
    cluster.max_distance_between_variants = 5
    for size_after, rec in enumerate(records[3:], start=4):
        self.assertTrue(cluster.add_vcf_record(rec))
        self.assertEqual(size_after, len(cluster))
def test_vcf_records_make_same_allele_combination():
    """Check vcf_records_make_same_allele_combination on two record pairs.

    The pair on the same reference (ref1) is expected to be truthy; the
    pair where the second record is on ref2 is expected to be falsy.
    """
    same_combination = variant_tracking.vcf_records_make_same_allele_combination
    ref_seqs = {"ref1": "GCTGT"}
    first = vcf_record.VcfRecord("ref1\t1\t.\tGCT\tGC,GCGT\t.\t.\t.")
    same_ref = vcf_record.VcfRecord("ref1\t5\t.\tT\tTGG,G\t.\t.\t.")
    other_ref = vcf_record.VcfRecord("ref2\t5\t.\tT\tTGG,G\t.\t.\t.")

    got_same_ref = same_combination(first, same_ref, ref_seqs)
    assert got_same_ref
    got_other_ref = same_combination(first, other_ref, ref_seqs)
    assert not got_other_ref
def test_total_coverage(self):
    """test total_coverage

    A record without a COV entry is expected to give None; with
    COV=1,2,39 the expected total is 42.
    """
    cases = (
        (
            "ref\t3\tid_foo\tC\tA\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:GT_CONF\t1/1:39.80\n",
            None,
        ),
        (
            "ref\t3\tid_foo\tC\tA\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:1,2,39:39.80\n",
            42,
        ),
    )
    for vcf_line, wanted_total in cases:
        parsed = vcf_record.VcfRecord(vcf_line)
        self.assertEqual(wanted_total, parsed.total_coverage())
def test_to_record_per_alt(self):
    """test to_record_per_alt

    A single-ALT record is expected to come back as a one-element list
    equal to itself; a two-ALT record as one record per ALT, in order.
    """
    make_record = vcf_record.VcfRecord

    single_alt = make_record("ref\t42\t.\tA\tC\t.\tPASS\tSVTYPE=SNP\n")
    self.assertEqual([single_alt], single_alt.to_record_per_alt())

    two_alts = make_record("ref\t42\t.\tA\tC,TC\t.\tPASS\tSVTYPE=SNP\n")
    only_c = make_record("ref\t42\t.\tA\tC\t.\tPASS\tSVTYPE=SNP\n")
    only_tc = make_record("ref\t42\t.\tA\tTC\t.\tPASS\tSVTYPE=SNP\n")
    wanted = [only_c, only_tc]
    self.assertEqual(wanted, two_alts.to_record_per_alt())
def test_to_record_per_alt(self):
    '''test to_record_per_alt

    A single-ALT record is expected to come back as a one-element list
    equal to itself; a two-ALT record as one record per ALT, in order.
    '''
    make_record = vcf_record.VcfRecord

    single_alt = make_record('ref\t42\t.\tA\tC\t.\tPASS\tSVTYPE=SNP\n')
    self.assertEqual([single_alt], single_alt.to_record_per_alt())

    two_alts = make_record('ref\t42\t.\tA\tC,TC\t.\tPASS\tSVTYPE=SNP\n')
    only_c = make_record('ref\t42\t.\tA\tC\t.\tPASS\tSVTYPE=SNP\n')
    only_tc = make_record('ref\t42\t.\tA\tTC\t.\tPASS\tSVTYPE=SNP\n')
    wanted = [only_c, only_tc]
    self.assertEqual(wanted, two_alts.to_record_per_alt())
def test_vcf_file_to_dict():
    """Check recall._vcf_file_to_dict against the records in the data file.

    The expected result maps each reference name to its list of records.
    """
    make_record = vcf_record.VcfRecord
    path = os.path.join(data_dir, "vcf_file_to_dict.vcf")
    wanted = {
        "ref1": [make_record("ref1\t42\t1\tT\tA\t.\tPASS\t.\tGT\t1/1")],
        "ref2": [
            make_record("ref2\t43\t3\tT\tA,C\t.\tPASS\t.\tGT\t2/2"),
            make_record("ref2\t44\t2\tT\tA\t.\tPASS\t.\tGT\t1/1"),
        ],
    }
    loaded = recall._vcf_file_to_dict(path)
    assert loaded == wanted
def test_load_vcf_files(self):
    """test _load_vcf_files

    Loads a single VCF file and checks the returned sample name, headers
    and records; then loads two files together and checks the combined
    result. None is passed as the second argument of _load_vcf_files in
    both calls.
    """
    make_record = vcf_record.VcfRecord
    load = vcf_clusterer.VcfClusterer._load_vcf_files

    # --- one input file ---
    first_vcf = os.path.join(data_dir, "load_vcf_files.1.vcf")
    want_headers = {first_vcf: ["#file1 header1", "#file1 header2"]}
    want_records = {
        "ref.1": [
            make_record("ref.1\t5\tid3\tG\tA\tPASS\tSVTYPE=SNP\tGT\t1/1"),
            make_record("ref.1\t10\tid1\tA\tT\tPASS\tSVTYPE=SNP\tGT\t1/1"),
        ],
        "ref.2": [
            make_record("ref.2\t42\tid2\tG\tC\tPASS\tSVTYPE=SNP\tGT\t1/1"),
        ],
    }
    want_sample = "sample"
    sample, headers, records = load([first_vcf], None)
    self.assertEqual(want_sample, sample)
    self.assertEqual(want_headers, headers)
    self.assertEqual(want_records, records)

    # --- two input files: extend the expectations in place ---
    second_vcf = os.path.join(data_dir, "load_vcf_files.2.vcf")
    want_headers[second_vcf] = [
        "#file2 header",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_from_vcf_2",
    ]
    want_records["ref.3"] = [
        make_record("ref.3\t8\tid5\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1"),
    ]
    # Position 8 belongs between the existing position 5 and 10 records.
    want_records["ref.1"].insert(
        1, make_record("ref.1\t8\tid4\tC\tG\tPASS\tSVTYPE=SNP\tGT\t1/1"))
    want_sample = "sample_from_vcf_2"
    sample, headers, records = load([first_vcf, second_vcf], None)
    self.assertEqual(want_sample, sample)
    self.assertEqual(want_headers, headers)
    self.assertEqual(want_records, records)
def test_record_with_zero_pos_valueerror_raised(self):
    '''Merging a cluster that holds a record with POS 0 must raise ValueError.'''
    reference = 'AGCTATCTGCGTATTCGATC'
    zero_pos_indel = vcf_record.VcfRecord(
        'ref\t0\t.\tC\tCG\t42.42\tPASS\tSVTPYPE=INDEL\tGT\t1/1')
    snp = vcf_record.VcfRecord(
        'ref\t1\t.\tT\tA\t42.42\tPASS\tSVTPYPE=SNP\tGT\t1/1')

    cluster = vcf_record_cluster.VcfRecordCluster(
        vcf_record=zero_pos_indel, max_distance_between_variants=1)
    cluster.add_vcf_record(snp)

    self.assertRaises(
        ValueError,
        cluster.make_one_merged_vcf_record_for_gramtools,
        reference,
    )
def test_str(self):
    """test __str__

    Each input line must be reproduced exactly by str() of the parsed
    record: a full line, a line ending at an INFO of ".", and a line whose
    INFO contains a flag without a value (FOO).
    """
    round_trip_lines = (
        "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
        "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\t.",
        "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tFOO;KMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
    )
    for vcf_line in round_trip_lines:
        parsed = vcf_record.VcfRecord(vcf_line)
        self.assertEqual(vcf_line, str(parsed))
def test_GT_always_printed_first_when_present(self):
    """test GT always printed first when present

    Whether GT is first or second in the input FORMAT column, str() must
    give the GT-first line. A record without GT prints unchanged until GT
    is set, after which it must also print GT first.
    """
    gt_first = "ref\t11\tid\tA\tG\t42.0\tPASS\t.\tGT:COV:GT_CONF\t1/1:0,52:39.80"
    gt_second = "ref\t11\tid\tA\tG\t42.0\tPASS\t.\tCOV:GT:GT_CONF\t0,52:1/1:39.80"
    for vcf_line in (gt_first, gt_second):
        parsed = vcf_record.VcfRecord(vcf_line)
        self.assertEqual(gt_first, str(parsed))

    without_gt = "ref\t11\tid\tA\tG\t42.0\tPASS\t.\tCOV:GT_CONF\t0,52:39.80"
    parsed = vcf_record.VcfRecord(without_gt)
    self.assertEqual(without_gt, str(parsed))
    parsed.set_format_key_value("GT", "1/1")
    self.assertEqual(gt_first, str(parsed))
def test_remove_asterisk_alts(self):
    '''test remove_asterisk_alts

    Checks ALT after the call for three inputs: no asterisk, an asterisk
    alongside a real allele, and an asterisk as the only ALT.
    '''
    cases = (
        ('ref.3\t8\tid5\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1', ['G']),
        ('ref.3\t8\tid5\tA\tG,*\tPASS\tSVTYPE=SNP\tGT\t1/1', ['G']),
        ('ref.3\t8\tid5\tA\t*\tPASS\tSVTYPE=SNP\tGT\t1/1', []),
    )
    for vcf_line, wanted_alts in cases:
        parsed = vcf_record.VcfRecord(vcf_line)
        parsed.remove_asterisk_alts()
        self.assertEqual(wanted_alts, parsed.ALT)
def test_merge(self):
    """test merge

    Merging with a record on a different reference must give None in
    either direction. Merging two records on "ref" must give the expected
    SVTYPE=MERGED record regardless of argument order.
    """
    make_record = vcf_record.VcfRecord
    ref_seq = pyfastaq.sequences.Fasta("ref", "AGCTAGGTCAG")
    on_wrong_ref = make_record("wrong_ref\t3\t.\tC\tA\t228\t.\t.\t.")
    at_pos1 = make_record(
        "ref\t1\t.\tAG\tGAA\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0"
    )
    at_pos2 = make_record(
        "ref\t2\t.\tG\tAA\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0"
    )
    at_pos3 = make_record(
        "ref\t3\t.\tC\tCAT\t21.4018\t.\tINDEL;IDV=2;IMF=0.0338983;DP=59;VDB=0.18;SGB=-0.453602;MQ0F=0;AC=2;AN=2;DP4=0,0,0,2;MQ=60\tGT:PL\t1/1:48,6,0"
    )
    at_pos7 = make_record("ref\t7\t.\tG\tC\t21.4018\t.\t.\t.\t.")

    # Records on different references never merge.
    unmergeable_pairs = (
        (on_wrong_ref, at_pos1),
        (at_pos1, on_wrong_ref),
        (on_wrong_ref, at_pos2),
        (at_pos2, on_wrong_ref),
    )
    for left, right in unmergeable_pairs:
        self.assertIsNone(left.merge(right, ref_seq))

    merged = at_pos2.merge(at_pos3, ref_seq)
    wanted = make_record(
        "ref\t2\t.\tGC\tAACAT\t.\t.\tSVTYPE=MERGED\tGT\t1/1")
    self.assertEqual(wanted, merged)

    # Same pair, opposite order: same result expected.
    merged = at_pos3.merge(at_pos2, ref_seq)
    wanted = make_record(
        "ref\t2\t.\tGC\tAACAT\t.\t.\tSVTYPE=MERGED\tGT\t1/1")
    self.assertEqual(wanted, merged)

    merged = at_pos3.merge(at_pos7, ref_seq)
    wanted = make_record(
        "ref\t3\t.\tCTAGG\tCATTAGC\t.\t.\tSVTYPE=MERGED\tGT\t1/1")
    self.assertEqual(wanted, merged)
def test_intersects(self):
    """test intersects

    Checks intersects() in both directions for: a record with itself,
    adjacent records on the same reference, the same position on different
    references, and a two-base REF overlapping the next position.
    """
    make_record = vcf_record.VcfRecord
    pos11 = make_record("ref_42\t11\t.\tA\tG\t42.42\tPASS\t.")
    pos12 = make_record("ref_42\t12\t.\tC\tT\t42.42\tPASS\t.")
    pos12_other_ref = make_record("ref_43\t12\t.\tC\tT\t42.42\tPASS\t.")
    pos11_two_bases = make_record("ref_42\t11\t.\tCT\tT\t42.42\tPASS\t.")

    # (record calling intersects, argument, whether they should intersect)
    cases = (
        (pos11, pos11, True),
        (pos12, pos12, True),
        (pos11, pos12, False),
        (pos12, pos11, False),
        (pos12_other_ref, pos12, False),
        (pos12, pos12_other_ref, False),
        (pos12, pos11_two_bases, True),
        (pos11_two_bases, pos12, True),
    )
    for caller, other, should_intersect in cases:
        result = caller.intersects(other)
        if should_intersect:
            self.assertTrue(result)
        else:
            self.assertFalse(result)
def test_remove_asterisk_alts(self):
    """test remove_asterisk_alts

    Checks ALT after the call for three inputs: no asterisk, an asterisk
    alongside a real allele, and an asterisk as the only ALT.
    """
    cases = (
        ("ref.3\t8\tid5\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1", ["G"]),
        ("ref.3\t8\tid5\tA\tG,*\tPASS\tSVTYPE=SNP\tGT\t1/1", ["G"]),
        ("ref.3\t8\tid5\tA\t*\tPASS\tSVTYPE=SNP\tGT\t1/1", []),
    )
    for vcf_line, wanted_alts in cases:
        parsed = vcf_record.VcfRecord(vcf_line)
        parsed.remove_asterisk_alts()
        self.assertEqual(wanted_alts, parsed.ALT)
def test_ref_string_matches_dict_of_ref_sequences(self):
    """test ref_string_matches_dict_of_ref_sequences

    The same REF "A" at position 3 is checked on three references: ref1
    (third base is A), ref2 (all T) and ref3 (absent from the dict).
    """
    ref_seqs = {"ref1": "GTACG", "ref2": "TTTTT"}
    cases = (
        ("ref1\t3\t.\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1", True),
        ("ref2\t3\t.\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1", False),
        ("ref3\t3\t.\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1", False),
    )
    for vcf_line, should_match in cases:
        parsed = vcf_record.VcfRecord(vcf_line)
        result = parsed.ref_string_matches_dict_of_ref_sequences(ref_seqs)
        if should_match:
            self.assertTrue(result)
        else:
            self.assertFalse(result)
def test_start_and_end(self):
    '''test start_and_end

    An empty cluster must report (None, None); after each accepted record
    the reported (start, end) pair is checked.
    '''
    cluster = vcf_record_cluster.VcfRecordCluster(
        max_distance_between_variants=3)
    self.assertEqual((None, None), cluster.start_and_end())

    # (VCF line to add, expected start_and_end() afterwards)
    steps = (
        (
            'ref_42\t11\tid_1\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            (10, 10),
        ),
        (
            'ref_42\t12\tid_2\tC\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
            (10, 11),
        ),
    )
    for vcf_line, wanted_span in steps:
        new_record = vcf_record.VcfRecord(vcf_line)
        self.assertTrue(cluster.add_vcf_record(new_record))
        self.assertEqual(wanted_span, cluster.start_and_end())
def test_get_total_length_of_expected_regions_called(self):
    '''test _get_total_length_of_expected_regions_called

    Three expected regions (total length 112) are compared with a single
    called record; the expected called length is 8.
    '''
    Interval = pyfastaq.intervals.Interval
    regions = {
        'ref.1': [
            # 100 long.
            # NOTE(review): the original comment here said "92 get called",
            # but this test expects only 8 called in total -- confirm.
            Interval(101, 200),
            Interval(251, 260),  # 10 long, none get called
        ],
        'ref.2': [
            Interval(42, 43),  # 2 long, none get called
        ],
    }
    called = {
        'ref.1': [
            vcf_record.VcfRecord(
                'ref.1\t100\t.\tACGTACTGTA\tA,G\t42.0\t.\tDP4=42\tGT\t2/2'
            ),
        ],
    }

    count_lengths = mapping_based_verifier.MappingBasedVerifier._get_total_length_of_expected_regions_called
    total_length, called_length = count_lengths(regions, called)

    self.assertEqual(112, total_length)
    self.assertEqual(8, called_length)
def test_ref_string_matches_ref_sequence(self):
    """test ref_string_matches_ref_sequence

    Each record is checked against one or more reference strings,
    including references too short to contain the whole REF allele.
    """
    # (VCF line, ((reference sequence, whether REF should match), ...))
    cases = (
        (
            "ref_name\t1\t.\tAGT\tG\tPASS\tSVTYPE=SNP\tGT\t1/1",
            (("AG", False),),
        ),
        (
            "ref_name\t3\t.\tA\tG\tPASS\tSVTYPE=SNP\tGT\t1/1",
            (("GCATG", True), ("GCxTG", False)),
        ),
        (
            "ref_name\t3\t.\tAGT\tG\tPASS\tSVTYPE=SNP\tGT\t1/1",
            (
                ("GCAGT", True),
                ("GCAGC", False),
                ("GCAG", False),
                ("GCA", False),
                ("GA", False),
            ),
        ),
    )
    for vcf_line, checks in cases:
        parsed = vcf_record.VcfRecord(vcf_line)
        for reference, should_match in checks:
            result = parsed.ref_string_matches_ref_sequence(reference)
            if should_match:
                self.assertTrue(result)
            else:
                self.assertFalse(result)
def make_separate_indels_and_one_alt_with_all_snps_no_combinations(self, ref_seq):
    '''Build one VCF record summarising this cluster.

    Every record in self.vcf_records for which is_snp() is false gets its
    own ALT. All SNP records are then merged, one after another, into a
    single extra ALT; a SNP whose merge returns None is skipped, so when
    several SNPs cannot be combined (e.g. same position) only some of them
    end up in that ALT.

    ref_seq is passed through to add_flanking_seqs() and merge().

    Returns a new VcfRecord with FILTER "PASS" whose REF is taken from the
    first flanked record and whose ALT column joins the first ALT of each
    flanked record.
    '''
    cluster_start = min(rec.POS for rec in self.vcf_records)
    cluster_end = max(rec.ref_end_pos() for rec in self.vcf_records)

    snp_copies = []
    flanked_records = []
    for rec in self.vcf_records:
        if rec.is_snp():
            # SNPs are set aside and combined after this loop.
            snp_copies.append(copy.copy(rec))
            continue
        # Work on a copy so the cluster's own records are not modified.
        non_snp = copy.copy(rec)
        non_snp.add_flanking_seqs(ref_seq, cluster_start, cluster_end)
        flanked_records.append(non_snp)

    if snp_copies:
        combined = copy.copy(snp_copies[0])
        for snp in snp_copies[1:]:
            attempt = combined.merge(snp, ref_seq)
            # merge() returning None means the SNP could not be applied;
            # keep what has been combined so far.
            if attempt is not None:
                combined = attempt
        combined.add_flanking_seqs(ref_seq, cluster_start, cluster_end)
        flanked_records.append(combined)

    alt_column = ','.join(rec.ALT[0] for rec in flanked_records)
    columns = [
        self.vcf_records[0].CHROM,
        # POS is written as stored value + 1 (presumably the records hold
        # 0-based positions -- confirm against VcfRecord).
        str(cluster_start + 1),
        '.',
        flanked_records[0].REF,
        alt_column,
        '.',
        'PASS',
        '.',
    ]
    return vcf_record.VcfRecord('\t'.join(columns))
def _variant_cluster_to_vcf_line(self, variants, variant_ids, max_alleles=None): if len(variants) == 0: return None ref_seq = self.ref_seqs[self.ref_seq_names[variants[0].seq_id]] if logging.getLogger().level <= logging.DEBUG: logging.debug(f"Clustering variants:") for v in variants: logging.debug(f" {ref_seq.id}\t{v.pos}\t{v.ref}\t{v.alt}") start, end, alts = allele_combinations.var_cluster_to_coords_and_alts( variants, ref_seq, max_alleles=max_alleles ) info_field = "." if alts is None: alts = set() var_id_to_var = dict(zip(variant_ids, variants)) var_patterns = var_patterns_from_block_slices( self.var_block_tabixes, var_id_to_var, variants[0].seq_id, start, end ) for var_pattern in var_patterns: alt_alleles = var_pattern_to_alleles( var_id_to_var, var_pattern, ref_seq, start, end, max_alleles=max_alleles, ) if alt_alleles is None: logging.warning("Conflicting allele combination:") for var_id in sorted(list(var_pattern)): var = var_id_to_var[var_id] logging.warning( f" {ref_seq.id} {var.pos+1} {var.ref} {var.alt}" ) else: alts.update(alt_alleles) info_field = "High_variability" if len(alts) == 0: logging.warning("Could not make VCF record from these variants:") for variant in variants: logging.warning(" " + str(variant)) return None else: return vcf_record.VcfRecord( "\t".join( [ ref_seq.id, str(start + 1), ".", ref_seq[start : end + 1], ",".join(sorted(list(alts))), ".", "PASS", info_field, ] ) )
def test_ref_end_pos(self):
    """test ref_end_pos

    All lines have POS 11; the expected end position is 10, 11 and 12 for
    REF lengths 1, 2 and 3, and 10 when REF is ".".
    """
    cases = (
        (
            "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n",
            10,
        ),
        (
            "ref_42\t11\tid_foo\tAA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n",
            11,
        ),
        (
            "ref_42\t11\tid_foo\tAAG\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n",
            12,
        ),
        (
            "ref_42\t11\tid_foo\t.\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80\n",
            10,
        ),
    )
    for vcf_line, wanted_end in cases:
        parsed = vcf_record.VcfRecord(vcf_line)
        self.assertEqual(wanted_end, parsed.ref_end_pos())