def test_should_obtain_correct_vcf_header_on_reduce(self):
    """Check the header of the reduced VCF built from two per-contig VCFs.

    Two intermediate VCFs are produced (contig "2" under stem "ab", contig
    "1" under stem "aa"). The expected header is the header of the "aa" file
    with contig '2' set to the length recorded in the "ab" file's header.
    """
    reference_sequence = "AAAAAAAAAAACGCACCCCCCATAAAAAAAATTTTTTTTTTT"

    # The contig "2" file is deliberately generated first.
    temp_vcf_filename_b = self.__run_wecall_variant_caller(
        "2",
        reference_sequence,
        ["............T........................C...."],
        vcf_stem="ab",
    )
    temp_vcf_filename_a = self.__run_wecall_variant_caller(
        "1",
        reference_sequence,
        ["...................T......................"],
        vcf_stem="aa",
    )

    # Capture both intermediate headers before the reduce step runs.
    with VCFReaderContextManager(temp_vcf_filename_a) as reader_a, \
            VCFReaderContextManager(temp_vcf_filename_b) as reader_b:
        reader_a.read_header()
        header_a = reader_a.header
        reader_b.read_header()
        header_b = reader_b.header

    self.__run_wecall_reduce()

    with VCFReaderContextManager(self.final_vcf_location) as reduced_vcf:
        reduced_vcf.read_header()

        # header_a is updated in place and reused as the expectation.
        contig_2_length = header_b.get_contig('2').length
        expected_header = header_a
        expected_header.set_contig('2', contig_2_length)

        self.assertEqual(reduced_vcf.header, expected_header)
def test_should_derive_use_lexigraphical_order_of_vcfs_for_reduce(self):
    """Check that reduce emits records in lexicographical order of VCF names.

    The "ab" file is generated before the "aa" file, yet the reduced output
    is expected to contain the "aa" records followed by the "ab" records.
    """
    reference_sequence = "AAAAAAAAAAACGCACCCCCCATAAAAAAAATTTTTTTTTTT"

    temp_vcf_filename_b = self.__run_wecall_variant_caller(
        "2",
        reference_sequence,
        ["............T........................C...."],
        vcf_stem="ab",
    )
    temp_vcf_filename_a = self.__run_wecall_variant_caller(
        "1",
        reference_sequence,
        ["...................T......................"],
        vcf_stem="aa",
    )

    with VCFReaderContextManager(temp_vcf_filename_a) as reader_a, \
            VCFReaderContextManager(temp_vcf_filename_b) as reader_b:
        records_a = list(reader_a.read_records())
        records_b = list(reader_b.read_records())

    # "aa" is lexicographically less than "ab", so its records come first.
    reference_records = records_a + records_b

    self.__run_wecall_reduce()

    with VCFReaderContextManager(self.final_vcf_location) as reduced_vcf:
        final_records = list(reduced_vcf.read_records())

    self.assertEqual(len(final_records), 3)
    self.assertEqual(final_records, reference_records)
def __run_small_variant_caller(self, refcalls, format):
    """Run the variant caller on a minimal sample and return both schemas.

    Args:
        refcalls: Value passed to the 'outputRefCalls' command and to
            ``wecall_schema`` as ``add_ref_calls``.
        format: Suffix appended to "VCF" for the 'outputFormat' command and
            passed to ``wecall_schema`` as ``format``.

    Returns:
        A ``(expected_schema, actual_schema)`` tuple: the schema built by
        ``wecall_schema`` and the header read from the caller's output VCF.
    """
    sample_bank = SampleBank("T")
    sample_bank.add_sample_name("TEST").add_sequence(".")

    variant_caller_builder = VariantCallerBuilderFromSampleBank(
        sample_bank, self.work_dir)
    variant_caller_builder.configuration = {}  # clear config.

    variant_caller = variant_caller_builder.build()
    variant_caller.add_additional_command('outputRefCalls', refcalls)
    variant_caller.add_additional_command(
        'outputFormat', "VCF{}".format(format))
    variant_caller.run()

    with VCFReaderContextManager(variant_caller.output_vcf) as vcf_file:
        actual_schema = vcf_file.read_header()

    # The reference name is the reference file's base name without extension.
    reference_filename = (
        variant_caller_builder.wecall_input_data.reference_filename)
    reference = os.path.splitext(os.path.basename(reference_filename))[0]

    # '%Y-%m-%d' is spelled out instead of the '%F' shorthand: '%F' is
    # handled by the platform C library and is not available everywhere,
    # whereas these three directives are portable and give the same text.
    file_date = datetime.datetime.today().strftime('%Y-%m-%d')

    expected_schema = wecall_schema(
        file_date=file_date,
        reference=reference,
        contigs={
            sample_bank.reference.chrom: {
                "length": sample_bank.reference.length_minus_deletions()
            }
        },
        add_ref_calls=refcalls,
        format=format)

    return expected_schema, actual_schema
def test_read_variant_from_vcf(self):
    """Check the variant carried by the first record of vcf_example.vcf."""
    example_path = os.path.join(self.data_dir, "vcf_example.vcf")

    with VCFReaderContextManager(example_path) as vcf_handler:
        # Only the first record is consumed; the rest of the file is unread.
        first_record = next(vcf_handler.read_records())
        first_variant = first_record.variant

    self.assertEqual(first_variant.chrom, "20")
    self.assertEqual(first_variant.pos_from, 9)
    self.assertEqual(first_variant.ref, "CT")
    self.assertEqual(first_variant.alt, "C")
def test_should_reduce_a_wecall_produced_vcf_to_a_valid_vcf(self):
    """Check that reducing a single VCF preserves its header and records.

    One intermediate VCF is produced for contig "1"; after the reduce step
    the final VCF must have an equal header and the same single record.
    """
    temp_vcf_filename = self.__run_wecall_variant_caller(
        "1",
        "AAAAAAAAAAACGCACCCCCCATAAAAAAAATTTTTTTTTTT",
        ["...................T......................"],
    )

    with VCFReaderContextManager(temp_vcf_filename) as temp_vcf:
        # Read the header explicitly before using `.header`, as the other
        # header tests in this suite do, so the comparison below does not
        # depend on the reader populating `.header` implicitly.
        temp_vcf.read_header()
        reference_header = temp_vcf.header
        reference_records = list(temp_vcf.read_records())

    self.__run_wecall_reduce()

    with VCFReaderContextManager(self.final_vcf_location) as final_vcf:
        final_vcf.read_header()
        self.assertEqual(final_vcf.header, reference_header)
        final_records = list(final_vcf.read_records())

    self.assertEqual(len(final_records), 1)
    self.assertEqual(final_records, reference_records)
def __init__(self, test_case, path):
    """Load the VCF at `path` and check its genotype likelihoods.

    Args:
        test_case: Object whose ``assertTrue`` is used for the checks
            (presumably the running unittest.TestCase — confirm with callers).
        path: Location of the VCF file; asserted to exist before reading.

    The header and all records are read and stored on the instance. Every
    'GL' value of every sample in every record is asserted to be <= 0.0.
    """
    self.__test_case = test_case
    self.__path = path

    path_exists = os.path.exists(self.__path)
    self.__test_case.assertTrue(path_exists)

    with VCFReaderContextManager(self.__path) as reader:
        self.__schema = reader.read_header()
        self.__records = list(reader.read_records())

    # ensure genotype likelihoods are within range throughout
    for vcf_record in self.__records:
        sample_names = vcf_record.sample_info.get_sample_names()
        for name in sample_names:
            try:
                likelihoods = vcf_record.sample_info.get_field(name, 'GL')
                for likelihood in likelihoods:
                    self.__test_case.assertTrue(likelihood <= 0.0)
            except KeyError:
                # A KeyError here is ignored; presumably it means the record
                # carries no 'GL' field — TODO confirm against get_field.
                pass
def test_reads_simple_file(self):
    """Write three variants to a VCF and check they are read back in order."""
    filename = os.path.join(self.work_dir, "test.vcf")

    # Contigs are written in the order "1", "2", "10"; the assertion below
    # expects exactly that order back from the reader.
    variant_fields = [
        ("1", 1, "A", "T"),
        ("2", 1, "A", "T"),
        ("10", 1, "A", "T"),
    ]

    with VCFWriterContextManager(filename) as vcf_writer:
        for fields in variant_fields:
            vcf_writer.write_variant(Variant(*fields))

    # Fresh Variant objects, independent of the ones handed to the writer.
    expected_variants = [Variant(*fields) for fields in variant_fields]

    with VCFReaderContextManager(filename) as vcf_reader:
        actual_variants = []
        for record in vcf_reader.read_records():
            actual_variants.append(record.variant)

    self.assertEqual(expected_variants, actual_variants)
def test_should_write_missing_values_in_sample_data(self):
    """Check that a sample with no data is written with missing values.

    The first record of vcf_example.vcf is given sample data for three
    samples, of which only sample1 and sample2 receive values. The expected
    row therefore ends with "./.:.:." for sample3.
    """
    example_path = os.path.join(self.data_dir, "vcf_example.vcf")
    with VCFReaderContextManager(example_path) as vcf_handler:
        first_record = next(vcf_handler.read_records())

    sample_data = SampleData(['GT', 'PL', 'GQ'],
                             ['sample1', 'sample2', 'sample3'])

    sample_data.add_sample_data("sample1", "GT", GenotypeCall("1|0"))
    sample_data.add_sample_data("sample1", "PL", [3000, 0, 3000])
    sample_data.add_sample_data("sample1", "GQ", [1000])

    sample_data.add_sample_data("sample2", "GT", GenotypeCall("1|1"))
    sample_data.add_sample_data("sample2", "PL", [2000, 0, 1000])
    sample_data.add_sample_data("sample2", "GQ", [3])

    # sample3 is intentionally left without any data.
    first_record.sample_info = sample_data

    vcf_string = vcf_row_from_record(first_record)

    # NOTE(review): VCF columns are normally tab-separated — confirm the
    # separators inside this literal match what vcf_row_from_record emits.
    expected_vcf_string = "20 10 . CT C 3000 PASS PP=3000;DP=250;DPR=140;DPF=110;VC=100;VCR=49;VCF=51;ABPV=0.2;SBPV=0.3;MQ=70.0;BR=31.0;QD=None GT:PL:GQ 1|0:3000,0,3000:1000 1|1:2000,0,1000:3 ./.:.:."  # noqa
    self.assertEqual(expected_vcf_string, vcf_string)
def test_read_record_line(self):
    """Check every parsed field of the first record in vcf_example.vcf."""
    example_path = os.path.join(self.data_dir, "vcf_example.vcf")

    with VCFReaderContextManager(example_path) as vcf_handler:
        record = next(vcf_handler.read_records())

        # Fixed columns and derived properties.
        self.assertEqual(record.chrom, "20")
        self.assertEqual(record.pos_from, 9)
        self.assertEqual(record.ids, set())
        self.assertEqual(record.ref, "CT")
        self.assertEqual(record.alt, "C")
        self.assertEqual(record.quality, 3000)
        self.assertEqual(record.filters, set())
        self.assertEqual(record.passes_filter, True)
        self.assertEqual(record.from_multi_alt, False)
        self.assertEqual(record.type, variant.TYPE_DEL)

        # INFO values are exposed as lists, checked in this order.
        expected_info = (
            ('PP', [3000]),
            ('DP', [250]),
            ('VC', [100]),
            ('ABPV', [0.2]),
            ('SBPV', [0.3]),
            ('MQ', [70]),
            ('QD', [None]),
        )
        for info_key, expected_value in expected_info:
            self.assertEqual(record.info[info_key], expected_value)

        # Per-sample data.
        sample_info = record.sample_info
        self.assertTrue(sample_info.has_sample("sample1"))

        expected_genotypes = {
            "sample1": GenotypeCall("1|0"),
            "sample2": GenotypeCall("1|1")
        }
        self.assertEqual(record.genotypes, expected_genotypes)

        self.assertEqual(
            record.sample_info.get_field("sample1", 'GT'),
            GenotypeCall("1|0"))
        self.assertEqual(
            record.sample_info.get_field("sample1", 'PL'),
            [3000, 0, 3000])
        self.assertEqual(
            record.sample_info.get_field("sample1", "GQ"),
            [1000])
def __get_example_schema(self, filename):
    """Return the header of the VCF named `filename` under self.data_dir."""
    vcf_path = os.path.join(self.data_dir, filename)

    with VCFReaderContextManager(vcf_path) as reader:
        reader.read_header()
        schema = reader.header

    return schema