def test_getitem_method_returns_expected_value(self):
    # Indexing the per-sample genotype data by key must return the stored value.
    expectations = (
        ("sample_name1", "./.", [1, 2]),
        ("sample_name2", "0/1", []),
    )
    for sample_name, genotype_string, expected_key_value in expectations:
        per_sample = self.sample_data.get_genotype_data(sample_name)
        self.assertEqual(per_sample["GT"], GenotypeCall(genotype_string))
        self.assertEqual(per_sample["key"], expected_key_value)
def test_should_get_homozygous_alt_if_one_is_homozyzgous_ref_and_other_is_homozygous_alt(
        self):
    # Merging a "0/0" call with a "1/1" call is expected to equal the "1/1" call.
    hom_ref = GenotypeCall("0/0")
    hom_alt = GenotypeCall("1/1")
    merged = merge_genotype_calls(hom_ref, hom_alt)
    self.assertEqual(merged, hom_alt)
def test_should_combine_two_unphased_heterozygous_genotypes_to_homozygous_alt(
        self):
    # Merging two unphased "0/1" calls is expected to give "1/1".
    first_het = GenotypeCall("0/1")
    second_het = GenotypeCall("0/1")
    expected = GenotypeCall("1/1")
    merged = merge_genotype_calls(first_het, second_het)
    self.assertEqual(expected, merged)
def test_read_sample_data(self):
    # Populate a single-sample SampleData using the keys of the example schema,
    # then read the values back through every accessor used below.
    schema = self.__get_example_schema("vcf_example.vcf")
    sample_schema = []
    for key, _ in schema.iter_sample_data():
        sample_schema.append(key)

    sample_data = SampleData(sample_schema, ['sample1'])
    fields_to_add = (
        ("GT", GenotypeCall("1|0")),
        ("PL", [3000, 0, 3000]),
        ("GQ", [1000]),
        ("PQ", [2000]),
        ("PS", [60000]),
        ("AD", [140, 110]),
        ("DP", [250]),
        ("VAF", [0.4]),
    )
    for field_key, field_value in fields_to_add:
        sample_data.add_sample_data("sample1", field_key, field_value)

    self.assertTrue(sample_data.has_sample("sample1"))
    self.assertEqual(sample_data.genotypes(), {"sample1": GenotypeCall("1|0")})
    self.assertEqual(sample_data.get_field("sample1", 'GT'), GenotypeCall("1|0"))
    self.assertEqual(sample_data.get_field("sample1", 'PL'), [3000, 0, 3000])

    per_sample = sample_data.get_genotype_data("sample1")
    self.assertEqual(per_sample.genotype(), GenotypeCall("1|0"))
    self.assertEqual(per_sample['GT'], GenotypeCall("1|0"))
    self.assertEqual(per_sample['PL'], [3000, 0, 3000])
def test_should_get_homozygous_ref_if_combining_two_homozygous_ref_genotypes(
        self):
    # Merging two "0/0" calls is expected to give "0/0".
    first_hom_ref = GenotypeCall("0/0")
    second_hom_ref = GenotypeCall("0/0")
    merged = merge_genotype_calls(first_hom_ref, second_hom_ref)
    self.assertEqual(merged, GenotypeCall("0/0"))
def test_should_give_correct_output_for_different_sample_names(self):
    # Runs the parallel variant-caller wrapper (numberOfJobs=2) with two
    # differently named samples and checks the genotype reported for each
    # sample at the two expected variants.
    self.sample_name1 = "SAMPLE_A"
    self.sample_name2 = "SAMPLE_B"
    n_copies1 = 1
    n_copies2 = 5
    self.setParallelAndSerialVariantCallers(n_copies1, n_copies2)
    self.vc_wrapper_parallel.add_additional_command("numberOfJobs", "2")
    self.vc_wrapper_parallel.add_additional_command("workDir", self.vc_work_dir)
    self.vc_wrapper_parallel.run()

    expected_var_A_1 = Variant(self.chrom1, 3, "CTT", "C")
    expected_var_B_1 = Variant(self.chrom2, 7, "AT", "A")

    parallel_variants_with_genotypes = self.vc_wrapper_parallel \
        .get_variant_callset(self) \
        .get_variants_with_genotypes()

    # assertIn reports the missing variant and the available keys on failure,
    # unlike assertTrue(x in list(...)) which only says "False is not true".
    self.assertIn(expected_var_A_1, parallel_variants_with_genotypes.keys())
    self.assertIn(expected_var_B_1, parallel_variants_with_genotypes.keys())

    genotypes_for_var_a = parallel_variants_with_genotypes[expected_var_A_1]
    self.assertEqual(GenotypeCall("1/1"), genotypes_for_var_a[self.sample_name1])
    self.assertEqual(GenotypeCall("./."), genotypes_for_var_a[self.sample_name2])

    genotypes_for_var_b = parallel_variants_with_genotypes[expected_var_B_1]
    self.assertEqual(GenotypeCall("./."), genotypes_for_var_b[self.sample_name1])
    self.assertEqual(GenotypeCall("1/1"), genotypes_for_var_b[self.sample_name2])
def test_values_method_returns_expected_data(self):
    # values() on the per-sample genotype data must list the stored values in order.
    expectations = (
        ("sample_name1", "./.", [1, 2]),
        ("sample_name2", "0/1", []),
    )
    for sample_name, genotype_string, expected_key_value in expectations:
        per_sample = self.sample_data.get_genotype_data(sample_name)
        self.assertEqual(
            list(per_sample.values()),
            [GenotypeCall(genotype_string), expected_key_value])
def test_should_merge_genotype_call_object_in_sample_data(self):
    # Merging the genotypes of a second SampleData ("0/1") into a first one
    # ("0/1") is expected to leave "1/1" stored for the shared sample.
    target = SampleData(['GT'], ['sample_name'])
    target.add_sample_data('sample_name', 'GT', GenotypeCall('0/1'))

    other = SampleData(['GT'], ['sample_name'])
    other.add_sample_data('sample_name', 'GT', GenotypeCall('0/1'))

    target.merge_genotype_calls(other.genotypes())

    merged_genotype = target.get_field("sample_name", "GT")
    self.assertEqual(merged_genotype, GenotypeCall("1/1"))
def test_should_fail_if_sample_data_objects_have_different_sample(self):
    # Merging genotypes keyed by a sample name the target does not hold must raise.
    target = SampleData(['GT'], ['sample_name_1'])
    target.add_sample_data('sample_name_1', 'GT', GenotypeCall('0/0'))

    other = SampleData(['GT'], ['sample_name_2'])
    other.add_sample_data('sample_name_2', 'GT', GenotypeCall('0/0'))

    # Evaluate the argument outside the guarded call so that only
    # merge_genotype_calls itself is expected to raise.
    other_genotypes = other.genotypes()
    with self.assertRaises(Exception):
        target.merge_genotype_calls(other_genotypes)
def test_default_values_are_assigned_when_sample_data_is_constructed(self):
    # A freshly built SampleData must hold "./." for GT and [] for every other key.
    sample_data = SampleData(['GT', 'key1', 'key2'], ['sample_name1', 'sample_name2'])

    for sample_name in ('sample_name1', 'sample_name2'):
        self.assertEqual(sample_data.get_field(sample_name, 'GT'), GenotypeCall("./."))

    for key_name in ('key1', 'key2'):
        for sample_name in ('sample_name1', 'sample_name2'):
            self.assertEqual(sample_data.get_field(sample_name, key_name), [])
def test_homozygous_unphased_genotypes(self):
    # Expected normalized_allele_count for homozygous unphased calls of ploidy 1-3.
    cases = (
        ((1, ), ('0', '0/0', '0/0/0')),
        ((0, 1), ('1', '1/1', '1/1/1')),
        ((0, 1), ('2', '2/2', '2/2/2')),
    )
    for expected_count, genotype_strings in cases:
        for genotype_string in genotype_strings:
            self.assertEqual(
                expected_count,
                GenotypeCall(genotype_string).normalized_allele_count)
def test_should_not_mark_following_as_called(self):
    # None of these genotype strings should be reported as called.
    not_called = (
        "./.", "./0", "0/.", "0|.", ".|0", ".", "0", "././.", ".|0|.",
    )
    for genotype_string in not_called:
        self.assertFalse(GenotypeCall(genotype_string).is_called())
def test_homozygous_phased_genotypes(self):
    # Expected normalized_allele_count for homozygous phased calls of ploidy 1-3.
    cases = (
        ((1, ), ('0', '0|0', '0|0|0')),
        ((0, 1), ('1', '1|1', '1|1|1')),
        ((0, 1), ('2', '2|2', '2|2|2')),
    )
    for expected_count, genotype_strings in cases:
        for genotype_string in genotype_strings:
            self.assertEqual(
                expected_count,
                GenotypeCall(genotype_string).normalized_allele_count)
def test_should_mark_following_as_heterozygous(self):
    # Every one of these genotype strings should be reported as heterozygous.
    heterozygous = (
        "0/1", "1/0", "1/.", "./1", "0|1", "1|0", "1|.", ".|1", "1|2",
    )
    for genotype_string in heterozygous:
        self.assertTrue(GenotypeCall(genotype_string).is_heterozygous())
def calls_variants_with_genotype(
        self, ref, sequence_list, expected_haplotypes=None,
        expected_variants_with_genotypes=None, config_dict=None):
    """Run weCall on the given reads and assert the called variants and genotypes.

    The expectation comes either from ``expected_variants_with_genotypes``
    (stubs whose element at index 3 is a genotype string) or, when that is
    None, from ``expected_haplotypes`` for the first sample of the sample bank.
    Both arguments are first passed to the private validation helper.
    """
    self.__validate_expected_calls(
        expected_haplotypes, expected_variants_with_genotypes)

    sample_bank = self.__build_default_sample_bank(ref, sequence_list)
    callset = self.__run_wecall(sample_bank, config_dict)
    actual_calls = callset.get_variants_with_genotypes()

    # Both expectation styles are attributed to the first sample of the bank.
    default_sample = sample_bank.sample_names[0]

    if expected_variants_with_genotypes is not None:
        expected_calls = {}
        for stub in expected_variants_with_genotypes:
            stub_variant = self._variant_from_stub(
                sample_bank.reference.chrom, stub)
            expected_calls[stub_variant] = {default_sample: GenotypeCall(stub[3])}
    else:
        haplotypes_by_sample = {default_sample: expected_haplotypes}
        expected_calls = self.__get_expected_calls_from_sample_ascii_haplotypes(
            haplotypes_by_sample, sample_bank.reference)

    self.assertDictEqual(expected_calls, actual_calls)
def has_genotype(self, genotype_string):
    """Assert that this object's genotype equals ``GenotypeCall(genotype_string)``.

    Returns ``self`` so that further checks can be chained.
    """
    observed = self.get_genotype()
    wanted = GenotypeCall(genotype_string)
    self.__test_case.assertEqual(wanted, observed)
    return self
def test_should_mark_following_as_called(self):
    # Every one of these genotype strings should be reported as called.
    called = (
        "0/1", "0|1", "./1", ".|1", "0/2", "1/2", "././1", "0/0/1",
    )
    for genotype_string in called:
        self.assertTrue(GenotypeCall(genotype_string).is_called())
def test_read_VCF_line(self):
    # Reads vcf_example.vcf through VCFReader, then checks the header sizes,
    # every field of the first record and the variant of the last record.
    vcf_path = os.path.join(self.data_dir, "vcf_example.vcf")
    with open(vcf_path, "r") as vcf_file:
        vcf_handler = VCFReader(vcf_file)
        vcf_handler.read_header()
        self.assertEqual(len(vcf_handler.header.file_metadata), 7)
        self.assertEqual(len(vcf_handler.header.samples), 2)

        records = list(vcf_handler.read_records())
        self.assertEqual(len(records), 2)

        # test first record fully
        first_record = records[0]
        # zero-based representation
        self.variant_is_equal(first_record, ("20", 9, set(), "CT", "C"))
        self.assertEqual(first_record.filters, set())
        self.assertEqual(first_record.passes_filter, True)

        # INFO entries, listed in the order they are expected to appear.
        expected_info = (
            ("PP", [3000]),
            ("DP", [250]),
            ("DPR", [140]),
            ("DPF", [110]),
            ("VC", [100]),
            ("VCR", [49]),
            ("VCF", [51]),
            ("ABPV", [0.2]),
            ("SBPV", [0.3]),
            ("MQ", [70]),
            ("BR", [31]),
            ("QD", [None]),
        )
        self.assertEqual(len(first_record.info), 12)
        for info_key, expected_value in expected_info:
            self.assertEqual(first_record.info[info_key], expected_value)

        self.assertEqual(first_record.samples, ['sample1', 'sample2'])
        self.assertEqual(
            first_record.sample_info.get_field('sample1', "GT"), GenotypeCall("0/1"))
        self.assertEqual(
            first_record.sample_info.get_field('sample2', "GT"), GenotypeCall("1/1"))

        expected_sample_fields = (
            ('sample1', 'PL', [3000, 0, 3000]),
            ('sample2', 'PL', [114, 0, 0]),
            ('sample1', 'GQ', [1000]),
            ('sample2', 'GQ', [None]),
        )
        for sample_name, field_key, expected_value in expected_sample_fields:
            self.assertEqual(
                first_record.sample_info.get_field(sample_name, field_key),
                expected_value)

        # check that ordering in the dictionaries is preserved
        expected_keys = [info_key for info_key, _ in expected_info]
        self.assertEqual(list(first_record.info.keys()), expected_keys)

        # ensure last record is still being read correctly
        self.variant_is_equal(records[-1], ("20", 10, set(), "T", "G"))
def test_split_genotype_likelihood_with_missing_genotype_likelihood_diploid(self, log):
    # Three values for a "G" field with 2 alts and a diploid call: the split
    # must return all-None triples and log a cardinality warning.
    def identity(value):
        return value

    split_func = make_split_sample_alt_func("G", identity)

    expected = [[None, None, None], [None, None, None]]
    actual = split_func([1.0, 2.0, 3.0], 2, GenotypeCall("0/1"))
    self.assertEqual(expected, actual)

    expected_log_entry = (
        'wecall.vcfutils.fieldmetadata',
        'WARNING',
        "Incorrect number of values 'G' cardinality, expected 6, got 3",
    )
    log.check(expected_log_entry)
def test_split_genotype_likelihood_warns_for_non_haploid_diploid(self, log):
    # A triploid call: the values are returned unsplit for each alt and a
    # ploidy warning is logged.
    def identity(value):
        return value

    split_func = make_split_sample_alt_func("G", identity)

    expected = [[1.0, 2.0], [1.0, 2.0]]
    actual = split_func([1.0, 2.0], 2, GenotypeCall("0/1/2"))
    self.assertEqual(expected, actual)

    expected_log_entry = (
        'wecall.vcfutils.fieldmetadata',
        'WARNING',
        "Unable to handle ploidy other than haploid or diploid.",
    )
    log.check(expected_log_entry)
def test_should_drop_genotype_likelihood_with_mismatch_ploidy(self):
    # A GL column carrying four values for a diploid call with two alts:
    # each generated record must hold [None, None, None] for GL.
    schema = Schema()
    schema.set_sample_data('GT', '1', 'String', '')
    schema.set_sample_data('GL', 'G', 'Float', '')
    schema.samples = ['foo']

    vcf_columns = [
        'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', '.', 'GT:GL', '0/1:1,2,3,4'
    ]
    records = list(generate_records(schema, vcf_columns))

    first_sample_info = records[0].sample_info
    self.assertEqual(GenotypeCall("0/1"), first_sample_info.get_field('foo', 'GT'))
    self.assertEqual([None, None, None], first_sample_info.get_field('foo', 'GL'))

    second_sample_info = records[1].sample_info
    self.assertEqual(GenotypeCall("0/0"), second_sample_info.get_field('foo', 'GT'))
    self.assertEqual([None, None, None], second_sample_info.get_field('foo', 'GL'))
def test_should_split_genotype_likelihood_properly(self):
    # A GL column carrying six values for a diploid call with two alts:
    # each generated record must receive its own three likelihoods.
    schema = Schema()
    schema.set_sample_data('GT', '1', 'String', '')
    schema.set_sample_data('GL', 'G', 'Float', '')
    schema.samples = ['foo']

    vcf_columns = [
        'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', '.', 'GT:GL', '0/1:1,2,3,4,5,6'
    ]
    records = list(generate_records(schema, vcf_columns))

    first_sample_info = records[0].sample_info
    self.assertEqual(GenotypeCall("0/1"), first_sample_info.get_field('foo', 'GT'))
    self.assertEqual([1.0, 2.0, 3.0], first_sample_info.get_field('foo', 'GL'))

    second_sample_info = records[1].sample_info
    self.assertEqual(GenotypeCall("0/0"), second_sample_info.get_field('foo', 'GT'))
    self.assertEqual([1.0, 4.0, 6.0], second_sample_info.get_field('foo', 'GL'))
def __get_expected_calls_from_haplotypes(ascii_strings, reference):
    """Build the expected variant -> GenotypeCall mapping from two ascii haplotypes.

    Variants found on both haplotypes map to "1/1"; variants found on exactly
    one haplotype map to "0/1".

    Raises:
        weCallException: if ``ascii_strings`` does not hold exactly two
            haplotypes, or if any haplotype's length differs from
            ``reference.length_with_deletions()``.
    """
    if len(ascii_strings) != 2:
        raise weCallException(
            "Expected calls have to be defined as a diploid.")

    has_length_mismatch = any(
        len(haplotype) != reference.length_with_deletions()
        for haplotype in ascii_strings
    )
    if has_length_mismatch:
        raise weCallException(
            "Ascii haplotypes have to be of the same length as the reference")

    first_hap_variants = Sequence(reference, ascii_strings[0]).variants
    second_hap_variants = Sequence(reference, ascii_strings[1]).variants

    calls = {
        shared: GenotypeCall("1/1")
        for shared in first_hap_variants.intersection(second_hap_variants)
    }
    for unshared in first_hap_variants.symmetric_difference(second_hap_variants):
        calls[unshared] = GenotypeCall("0/1")
    return calls
def test_should_mark_following_as_not_heterozygous(self):
    # None of these genotype strings should be reported as heterozygous.
    not_heterozygous = ("./.", ".|.", "1/1", "1|1", "2/2", "2|2")
    for genotype_string in not_heterozygous:
        self.assertFalse(GenotypeCall(genotype_string).is_heterozygous())
def test_should_write_missing_values_in_sample_data(self):
    # Replaces the sample info of the first example record with a SampleData in
    # which "sample3" is left at its defaults, and checks the serialized row.
    # The leftover debug print of to_vcf_columns() was removed so the test no
    # longer writes to stdout.
    with VCFReaderContextManager(
            os.path.join(self.data_dir, "vcf_example.vcf")) as vcf_handler:
        first_record = next(vcf_handler.read_records())

        sample_data = SampleData(['GT', 'PL', 'GQ'], ['sample1', 'sample2', 'sample3'])

        sample_data.add_sample_data("sample1", "GT", GenotypeCall("1|0"))
        sample_data.add_sample_data("sample1", "PL", [3000, 0, 3000])
        sample_data.add_sample_data("sample1", "GQ", [1000])

        sample_data.add_sample_data("sample2", "GT", GenotypeCall("1|1"))
        sample_data.add_sample_data("sample2", "PL", [2000, 0, 1000])
        sample_data.add_sample_data("sample2", "GQ", [3])

        first_record.sample_info = sample_data

        vcf_string = vcf_row_from_record(first_record)
        expected_vcf_string = "20 10 . CT C 3000 PASS PP=3000;DP=250;DPR=140;DPF=110;VC=100;VCR=49;VCF=51;ABPV=0.2;SBPV=0.3;MQ=70.0;BR=31.0;QD=None GT:PL:GQ 1|0:3000,0,3000:1000 1|1:2000,0,1000:3 ./.:.:."  # noqa
        self.assertEqual(expected_vcf_string, vcf_string)
def test_read_record_line(self):
    # Reads the first record of vcf_example.vcf and checks its core attributes,
    # a selection of INFO entries and the fields of "sample1".
    vcf_path = os.path.join(self.data_dir, "vcf_example.vcf")
    with VCFReaderContextManager(vcf_path) as vcf_handler:
        record_gen = vcf_handler.read_records()
        record = next(record_gen)

        self.assertEqual(record.chrom, "20")
        self.assertEqual(record.pos_from, 9)
        self.assertEqual(record.ids, set())
        self.assertEqual(record.ref, "CT")
        self.assertEqual(record.alt, "C")
        self.assertEqual(record.quality, 3000)
        self.assertEqual(record.filters, set())
        self.assertEqual(record.passes_filter, True)
        self.assertEqual(record.from_multi_alt, False)
        self.assertEqual(record.type, variant.TYPE_DEL)

        expected_info = (
            ('PP', [3000]),
            ('DP', [250]),
            ('VC', [100]),
            ('ABPV', [0.2]),
            ('SBPV', [0.3]),
            ('MQ', [70]),
            ('QD', [None]),
        )
        for info_key, expected_value in expected_info:
            self.assertEqual(record.info[info_key], expected_value)

        self.assertTrue(record.sample_info.has_sample("sample1"))

        expected_genotypes = {
            "sample1": GenotypeCall("1|0"),
            "sample2": GenotypeCall("1|1"),
        }
        self.assertEqual(record.genotypes, expected_genotypes)

        self.assertEqual(
            record.sample_info.get_field("sample1", 'GT'), GenotypeCall("1|0"))
        self.assertEqual(
            record.sample_info.get_field("sample1", 'PL'), [3000, 0, 3000])
        self.assertEqual(
            record.sample_info.get_field("sample1", "GQ"), [1000])
def __init__(self, key_names, sample_names):
    """Create per-sample storage holding a default value for every key.

    Args:
        key_names: iterable of field keys; values are stored in this order.
        sample_names: sequence of sample names; each key gets one slot
            per sample.

    The genotype key (``GENOTYPE_KEY``) defaults to ``GenotypeCall('./.')``
    and every other key defaults to an empty list.
    """
    self.__sample_names = sample_names
    self.__key_to_sample_values = OrderedDict()
    self.__merged_genotypes = False

    for key_name in key_names:
        is_genotype_key = key_name == GENOTYPE_KEY
        # Build a fresh default for every sample: sharing one list object
        # across samples would let an in-place change to one sample's value
        # show up in all the others.
        self.__key_to_sample_values[key_name] = [
            GenotypeCall('./.') if is_genotype_key else []
            for _ in self.__sample_names
        ]
def test_binary_heterozygous_phased_genotypes(self):
    # Expected normalized_allele_count for phased calls carrying two distinct alleles.
    cases = (
        ((1, 1), ('0|1', '1|0', '0|2', '2|0')),
        ((0, 1, 1), ('1|2', '2|1')),
    )
    for expected_count, genotype_strings in cases:
        for genotype_string in genotype_strings:
            self.assertEqual(
                expected_count,
                GenotypeCall(genotype_string).normalized_allele_count)
def test_should_return_phased_heterozygous_genotype_when_merging_two_phased_identical_heterozygous_genotypes(
        self):
    # Merging two identical phased heterozygous calls must give that same call.
    for phased_het in ("1|0", "0|1"):
        first_call = GenotypeCall(phased_het)
        second_call = GenotypeCall(phased_het)
        merged = merge_genotype_calls(first_call, second_call)
        self.assertEqual(merged, GenotypeCall(phased_het))
def test_should_return_default_diploid_genotype(self):
    # A SampleData with no values added must expose "./." for GT and [] for GL
    # through every accessor used below.
    sample_name = "NA12878"
    sample_data = SampleData(['GT', 'GL'], [sample_name])

    # Two separately built "./." calls must compare equal.
    self.assertEqual(GenotypeCall("./."), GenotypeCall("./."))

    self.assertTrue(sample_data.has_sample(sample_name))
    self.assertEqual(sample_data.genotypes(), {sample_name: GenotypeCall("./.")})
    self.assertEqual(sample_data.get_field(sample_name, 'GT'), GenotypeCall("./."))
    self.assertEqual(sample_data.get_field(sample_name, 'GL'), [])

    per_sample = sample_data.get_genotype_data(sample_name)
    self.assertEqual(per_sample.genotype(), GenotypeCall("./."))
    self.assertEqual(per_sample['GT'], GenotypeCall("./."))
    self.assertEqual(per_sample['GL'], [])