def test_sample_data_copes_with_mixed_missing_values_in_PL(self):
    """Likelihoods set with '.' and None entries are read back from 'PL' with None in both slots."""
    name = 'sample_name'
    data = SampleData(['PL'], [name])
    likelihoods = [-0.1, '.', -0.2, None, -0.3]
    data.set_genotype_likelihoods(name, likelihoods)
    stored = data.get_field(name, 'PL')
    self.assertEqual(stored, [1.0, None, 2.0, None, 3.0])
def test_sample_data_copes_with_mixed_missing_values_in_PL(self):
    """Raw 'PL' values containing '.' and None are returned by the likelihood getter with None in both slots."""
    name = 'sample_name'
    data = SampleData(['PL'], [name])
    raw_pl = [-0.1, '.', -0.2, None, -0.3]
    data.add_sample_data(name, 'PL', raw_pl)
    likelihoods = data.get_genotype_likelihoods(name)
    self.assertEqual(likelihoods, [0.01, None, 0.02, None, 0.03])
def test_eq(self):
    """Check that ``Record.__eq__`` is sensitive to every constructor argument.

    A reference record is compared with an identically built record (must be
    equal) and then with records that each differ in exactly one argument
    (must all be unequal).

    The id and filter sets are built from one-element lists. The previous
    ``set("rs0")`` / ``set("CV")`` spelling iterated over the string and
    produced sets of single characters, not the intended one-element sets.
    """
    def build(chrom="1", ids=(), quality=0.0, filters=(), info=None,
              samples=(), from_multi_alt=False):
        # Every call creates fresh containers so no two records share state.
        return Record(
            None,
            Variant(chrom, 20, "A", "G"),
            set(ids),
            quality,
            set(filters),
            InfoData(None, {} if info is None else info),
            SampleData([], list(samples)),
            from_multi_alt,
        )

    reference = build()

    # ``==`` is used explicitly (rather than assertEqual / assertNotEqual)
    # so that it is __eq__ itself which is exercised.
    self.assertTrue(reference == build())
    self.assertFalse(reference == build(chrom="2"))
    self.assertFalse(reference == build(ids=["rs0"]))
    self.assertFalse(reference == build(quality=5.0))
    self.assertFalse(reference == build(filters=["CV"]))
    self.assertFalse(reference == build(info={'AF': []}))
    self.assertFalse(reference == build(samples=['NA12787']))
    self.assertFalse(reference == build(from_multi_alt=True))
def test_should_fail_if_sample_data_objects_have_different_sample(self):
    """Merging genotype calls that belong to a different sample name must raise."""
    first = SampleData(['GT'], ['sample_name_1'])
    first.add_sample_data('sample_name_1', 'GT', GenotypeCall('0/0'))

    second = SampleData(['GT'], ['sample_name_2'])
    second.add_sample_data('sample_name_2', 'GT', GenotypeCall('0/0'))

    # Resolve the callable and its argument up front so that only the merge
    # call itself runs under the assertion.
    merge = first.merge_genotype_calls
    other_genotypes = second.genotypes()
    with self.assertRaises(Exception):
        merge(other_genotypes)
def test_should_allow_multiple_samples_for_add_sample_data(self):
    """Values added under one key for two samples are read back per sample."""
    key = 'genotype_key1'
    data = SampleData([key], ['sample_name1', 'sample_name2'])

    data.add_sample_data('sample_name1', key, [1])
    data.add_sample_data('sample_name2', key, [3, 4])

    first_value = data.get_field('sample_name1', key)
    self.assertEqual(first_value, [1])
    second_value = data.get_field('sample_name2', key)
    self.assertEqual(second_value, [3, 4])
def test_should_raise_when_adding_wrong_genotype_data(self):
    """Adding a plain list under the 'GT' key raises weCallException with the expected message."""
    data = SampleData(['GT'], ['sample_name'])
    # Bind the method first so only the call itself runs under the assertion.
    add = data.add_sample_data
    expected_message = "Genotype field must be a GenotypeCall."
    with self.assertRaisesRegex(weCallException, expected_message):
        add('sample_name', 'GT', [1])
def test_should_raise_when_adding_sample_data_to_missing_sample(self):
    """Adding data for a sample name that was not declared raises weCallException."""
    data = SampleData(['key'], ['sample_name'])
    # Bind the method first so only the call itself runs under the assertion.
    add = data.add_sample_data
    expected_message = "Missing sample name missing_sample_name supplied when adding sample data."
    with self.assertRaisesRegex(weCallException, expected_message):
        add('missing_sample_name', 'key', [1])
def test_default_values_are_assigned_when_sample_data_is_constructed(self):
    """A fresh SampleData reports './.' for 'GT' and an empty list for every other key, for each sample."""
    samples = ['sample_name1', 'sample_name2']
    data = SampleData(['GT', 'key1', 'key2'], samples)

    for sample in samples:
        self.assertEqual(data.get_field(sample, 'GT'), GenotypeCall("./."))

    for key in ('key1', 'key2'):
        for sample in samples:
            self.assertEqual(data.get_field(sample, key), [])
def test_should_return_default_diploid_genotype(self):
    """A fresh SampleData exposes the './.' genotype through every accessor checked here."""
    sample = "NA12878"
    data = SampleData(['GT', 'GL'], [sample])

    # Sanity check that two equal genotype calls compare equal.
    self.assertEqual(GenotypeCall("./."), GenotypeCall("./."))

    self.assertTrue(data.has_sample(sample))
    self.assertEqual(data.genotypes(), {sample: GenotypeCall("./.")})
    self.assertEqual(data.get_field(sample, 'GT'), GenotypeCall("./."))
    self.assertEqual(data.get_field(sample, 'GL'), [])

    view = data.get_genotype_data(sample)
    self.assertEqual(view.genotype(), GenotypeCall("./."))
    self.assertEqual(view['GT'], GenotypeCall("./."))
    self.assertEqual(view['GL'], [])
def generate_record_from_variant(self, variant, **kwargs):
    """Build a ``Record`` for *variant* using ``self.schema`` and default annotations.

    Any keyword argument replaces the default annotation of the same name
    (``variant_id``, ``quality``, ``filters``, ``info``, ``sample_info``,
    ``from_multi_alt``); other keywords are passed straight on to ``Record``.
    """
    default_info = InfoData(self.schema, {})
    sample_keys = [key for key, _ in self.schema.iter_sample_data()]
    default_sample_info = SampleData(sample_keys, self.schema.samples)

    record_kwargs = {
        'variant_id': set(),
        'quality': None,
        'filters': set(),
        'info': default_info,
        'sample_info': default_sample_info,
        'from_multi_alt': False,
    }
    # Caller-supplied values win over the defaults above.
    record_kwargs.update(kwargs)
    return Record(schema=self.schema, variant=variant, **record_kwargs)
def test_has_genotype_key_should_report_expected_value(self):
    """has_genotype_key is true for a declared key and false for an undeclared one."""
    data = SampleData(['genotype_key'], ['sample_name'])
    declared = data.has_genotype_key('genotype_key')
    self.assertTrue(declared)
    undeclared = data.has_genotype_key('missing_genotype_key')
    self.assertFalse(undeclared)
def setUp(self):
    """Create a two-sample fixture: sample_name1 has 'key' data, sample_name2 has a 'GT' call."""
    fixture = SampleData(['GT', 'key'], ['sample_name1', 'sample_name2'])
    self.sample_data = fixture
    fixture.add_sample_data("sample_name1", "key", [1, 2])
    fixture.add_sample_data("sample_name2", "GT", GenotypeCall("0/1"))
def test_has_sample_reports_expected_value(self):
    """has_sample is true for a declared sample and false for an undeclared one."""
    data = SampleData(['key1'], ['sample_name'])
    declared = data.has_sample('sample_name')
    self.assertTrue(declared)
    undeclared = data.has_sample('missing_sample_name')
    self.assertFalse(undeclared)
def test_gets_value_for_GQ_key(self):
    """A value stored under 'GQ' is returned unchanged by get_genotype_quality."""
    name = 'sample_name'
    data = SampleData(['GQ'], [name])
    data.add_sample_data(name, 'GQ', [2.3])
    quality = data.get_genotype_quality(name)
    self.assertEqual(quality, [2.3])
class TestGenotypeDataView(unittest.TestCase):
    """Tests for the per-sample object returned by ``SampleData.get_genotype_data``.

    The fixture holds two samples with the keys 'GT' and 'key':
    'sample_name1' is given 'key' data only and 'sample_name2' is given a
    'GT' call only, so each sample has one explicitly set field and one
    field left at whatever SampleData assigns by default.
    """

    def setUp(self):
        fixture = SampleData(['GT', 'key'], ['sample_name1', 'sample_name2'])
        self.sample_data = fixture
        fixture.add_sample_data("sample_name1", "key", [1, 2])
        fixture.add_sample_data("sample_name2", "GT", GenotypeCall("0/1"))

    def test_contains_method_returns_expected_value_sample1(self):
        view = self.sample_data.get_genotype_data("sample_name1")
        # Neither an unknown string nor the sample's own name is a key.
        for absent in ("cheesecake", "sample_name1"):
            self.assertNotIn(absent, view)
        for present in ("GT", "key"):
            self.assertIn(present, view)

    def test_contains_method_returns_expected_value_sample2(self):
        view = self.sample_data.get_genotype_data("sample_name2")
        for present in ("GT", "key"):
            self.assertIn(present, view)

    def test_getitem_method_returns_expected_value(self):
        first_view = self.sample_data.get_genotype_data("sample_name1")
        self.assertEqual(first_view["GT"], GenotypeCall("./."))
        self.assertEqual(first_view["key"], [1, 2])

        second_view = self.sample_data.get_genotype_data("sample_name2")
        self.assertEqual(second_view["GT"], GenotypeCall("0/1"))
        self.assertEqual(second_view["key"], [])

    def test_keys_method_returns_expected_data(self):
        first_view = self.sample_data.get_genotype_data("sample_name1")
        first_keys = list(first_view.keys())
        self.assertEqual(first_keys, ["GT", "key"])

        second_view = self.sample_data.get_genotype_data("sample_name2")
        second_keys = list(second_view.keys())
        self.assertEqual(second_keys, ["GT", "key"])

    def test_values_method_returns_expected_data(self):
        first_view = self.sample_data.get_genotype_data("sample_name1")
        first_values = list(first_view.values())
        self.assertEqual(first_values, [GenotypeCall("./."), [1, 2]])

        second_view = self.sample_data.get_genotype_data("sample_name2")
        second_values = list(second_view.values())
        self.assertEqual(second_values, [GenotypeCall("0/1"), []])
def test_should_merge_genotype_call_object_in_sample_data(self):
    """Merging a '0/1' call into another '0/1' call for the same sample leaves '1/1' in 'GT'."""
    target = SampleData(['GT'], ['sample_name'])
    target.add_sample_data('sample_name', 'GT', GenotypeCall('0/1'))

    source = SampleData(['GT'], ['sample_name'])
    source.add_sample_data('sample_name', 'GT', GenotypeCall('0/1'))

    target.merge_genotype_calls(source.genotypes())

    merged = target.get_field("sample_name", "GT")
    self.assertEqual(merged, GenotypeCall("1/1"))
def test_default_field_value_is_assigned_when_sample_data_is_constructed(self):
    """A non-genotype key reads back as an empty list before any data is added."""
    data = SampleData(['key1'], ['sample_name'])
    initial_value = data.get_field('sample_name', 'key1')
    self.assertEqual(initial_value, [])
def test_should_write_missing_values_in_sample_data(self):
    """Check that a sample with no data is serialised with missing-value markers.

    The first record of ``vcf_example.vcf`` has its sample data replaced by
    a three-sample SampleData in which only 'sample1' and 'sample2' are
    populated. The row produced by ``vcf_row_from_record`` is expected to
    end with ``./.:.:.`` for the untouched 'sample3'.

    A leftover debugging ``print`` of ``sample_data.to_vcf_columns()`` has
    been removed; it only added noise to the test output.
    """
    vcf_path = os.path.join(self.data_dir, "vcf_example.vcf")
    with VCFReaderContextManager(vcf_path) as vcf_handler:
        first_record = next(vcf_handler.read_records())

        sample_data = SampleData(['GT', 'PL', 'GQ'], ['sample1', 'sample2', 'sample3'])
        sample_data.add_sample_data("sample1", "GT", GenotypeCall("1|0"))
        sample_data.add_sample_data("sample1", "PL", [3000, 0, 3000])
        sample_data.add_sample_data("sample1", "GQ", [1000])
        sample_data.add_sample_data("sample2", "GT", GenotypeCall("1|1"))
        sample_data.add_sample_data("sample2", "PL", [2000, 0, 1000])
        sample_data.add_sample_data("sample2", "GQ", [3])
        # 'sample3' is deliberately left without any data.

        first_record.sample_info = sample_data

        vcf_string = vcf_row_from_record(first_record)
        # NOTE(review): the separators in this literal are reproduced exactly
        # as they appear in this view of the file; confirm they match the
        # delimiter that vcf_row_from_record emits.
        expected_vcf_string = "20 10 . CT C 3000 PASS PP=3000;DP=250;DPR=140;DPF=110;VC=100;VCR=49;VCF=51;ABPV=0.2;SBPV=0.3;MQ=70.0;BR=31.0;QD=None GT:PL:GQ 1|0:3000,0,3000:1000 1|1:2000,0,1000:3 ./.:.:."  # noqa
        self.assertEqual(expected_vcf_string, vcf_string)
def test_gets_exact_values_if_key_is_NR(self):
    """A value stored under 'NR' is returned unchanged by get_read_depth."""
    name = 'sample_name'
    data = SampleData(['NR'], [name])
    data.add_sample_data(name, 'NR', [100])
    depth = data.get_read_depth(name)
    self.assertEqual(depth, [100])
def generate_records(schema, cols):
    """Yield one ``Record`` per ALT allele of a single, already split, VCF row.

    Args:
        schema: Schema object providing ``samples``, ``get_info_data(key)``
            and ``get_sample_data(key)``.
        cols: Sequence of column strings for one row, indexed by the
            module-level ``*_COL`` constants.

    Yields:
        ``Record`` objects, one for each comma-separated entry of the ALT
        column. The last constructor argument is true when the row carried
        more than one ALT. When the row has no FORMAT column every record
        receives ``None`` as its sample data.

    Raises:
        An exception of the same type as any error hit while parsing a
        sample field, with the field key and sample name added to the
        message and the original error attached as ``__cause__``.
    """
    alts = cols[ALT_COL].split(',')
    n_alts = len(alts)
    # Each variant's position is the POS column value minus one.
    variants = [
        Variant(cols[CHROM_COL], int(cols[POS_COL]) - 1, cols[REF_COL], alt)
        for alt in alts
    ]

    info_data_list = []
    if n_alts == 1:
        # deferred parsing is simple with a single alt
        info_data_list.append(
            DeferredInfoData(
                schema,
                lambda: defer_parse_info_field(schema, cols[INFO_COL]),
            )
        )
    else:
        # extract and split info data into lists of length n_alts
        split_info_data = OrderedDict()
        for key, value in parse_info_field(cols[INFO_COL]):
            try:
                info_metadata = schema.get_info_data(key)
            except KeyError:
                # Key unknown to the schema: give every alt a deferred value.
                split_info_data[key] = [
                    DeferredInfoValue(schema, key, value)
                    for _ in range(n_alts)
                ]
            else:
                split_info_data[key] = info_metadata.split_alts(
                    value if isinstance(value, list) else value.split(','),
                    n_alts=n_alts,
                )

        # construct InfoData objects from prepared info data
        for index in range(n_alts):
            info_dict = OrderedDict(
                (key, values[index])
                for key, values in split_info_data.items()
            )
            info_data_list.append(InfoData(schema, info_dict))

    try:
        sample_format = cols[FORMAT_COL].split(':')
    except IndexError:
        # No FORMAT column: every record gets None as sample data.
        sample_data_list = repeat(None)
    else:
        # extract sample format
        split_sample_data = {
            sample_name: sample_field.split(':')
            for sample_name, sample_field in zip(schema.samples, cols[SAMPLE_COL:])
        }
        # Each SampleData is handed its own freshly split key list.
        sample_data_list = [
            SampleData(cols[FORMAT_COL].split(':'), schema.samples)
            for _ in alts
        ]

        for sample_name, sample_items in split_sample_data.items():
            split_sample_items = {}

            # extract data from sample fields
            # ``gt`` is still None for likelihood keys that precede the
            # genotype key in the FORMAT column.
            gt = None
            for key, item in zip(sample_format, sample_items):
                try:
                    if key == GENOTYPE_KEY:
                        gt = GenotypeCall(item)
                        values = [
                            GenotypeCall(gt.deliminator().join(
                                # Note: default value should be '.', but
                                # downstream tools aren't good enough to use it
                                {None: '.', 0: '0', 1 + index: '1'}.get(gt_index, '0')
                                for gt_index in gt
                            ))
                            for index in range(n_alts)
                        ]
                    elif key == GENOTYPE_LIKELIHOODS_KEY or key == GENOTYPE_PHRED_LIKELIHOODS_KEY:
                        values = schema.get_sample_data(key).split_alts(
                            item.split(','), n_alts, gt)
                    else:
                        values = schema.get_sample_data(key).split_alts(
                            item.split(','), n_alts, None)
                    split_sample_items[key] = values
                except Exception as e:
                    # Re-raise as the same type so existing handlers still
                    # match, but keep the original error as the cause.
                    raise type(e)(
                        "Error parsing field {} for sample {}: {}".format(
                            key, sample_name, e)) from e

            # distribute data to each split sample meta-data container
            for index, sample_data in enumerate(sample_data_list):
                for key, value in split_sample_items.items():
                    sample_data.add_sample_data(sample_name, key, value[index])

    # generate & return record objects
    for variant, info_data, sample_data in zip(
            variants, info_data_list, sample_data_list):
        # Parsed afresh for every record so records never share the
        # returned id / filter objects.
        qual = variant_quality_from_vcf(cols[QUALITY_COL])
        ids = variant_ids_from_vcf(cols[ID_COL])
        filts = filters_from_vcf(cols[FILTER_COL])
        yield Record(schema, variant, ids, qual, filts, info_data, sample_data, n_alts > 1)
def test_gets_dot_if_key_is_PL(self):
    """Setting likelihoods to '.' leaves '.' in the 'PL' field."""
    name = 'sample_name'
    data = SampleData(['PL'], [name])
    data.set_genotype_likelihoods(name, '.')
    stored = data.get_field(name, 'PL')
    self.assertEqual(stored, '.')
def test_genotype_field_default_value_is_assigned_when_sample_data_is_constructed(self):
    """The 'GT' key reads back as GenotypeCall('./.') before any data is added."""
    data = SampleData(['GT'], ['sample_name'])
    initial_genotype = data.get_field('sample_name', 'GT')
    self.assertEqual(initial_genotype, GenotypeCall("./."))
def test_gets_exact_values_if_key_is_GL(self):
    """Values stored under 'GL' are returned unchanged by get_genotype_likelihoods."""
    name = 'sample_name'
    data = SampleData(['GL'], [name])
    data.add_sample_data(name, 'GL', [-0.1, -0.2, -0.3])
    likelihoods = data.get_genotype_likelihoods(name)
    self.assertEqual(likelihoods, [-0.1, -0.2, -0.3])
def test_gets_list_of_none_if_key_is_GL(self):
    """A list of None stored under 'GL' is returned unchanged by get_genotype_likelihoods."""
    name = 'sample_name'
    data = SampleData(['GL'], [name])
    data.add_sample_data(name, 'GL', [None, None, None])
    likelihoods = data.get_genotype_likelihoods(name)
    self.assertEqual(likelihoods, [None, None, None])
def test_has_genotype_keys_should_support_multiple_keys(self):
    """has_genotype_key is true for each of several declared keys."""
    data = SampleData(['genotype_key1', 'genotype_key2'], ['sample_name'])
    for declared_key in ('genotype_key1', 'genotype_key2'):
        self.assertTrue(data.has_genotype_key(declared_key))
def test_read_sample_data(self):
    """Populate every sample key used here for one sample and read 'GT' and 'PL' back through each accessor."""
    schema = self.__get_example_schema("vcf_example.vcf")
    sample_keys = [key for key, _ in schema.iter_sample_data()]
    data = SampleData(sample_keys, ['sample1'])

    field_values = (
        ("GT", GenotypeCall("1|0")),
        ("PL", [3000, 0, 3000]),
        ("GQ", [1000]),
        ("PQ", [2000]),
        ("PS", [60000]),
        ("AD", [140, 110]),
        ("DP", [250]),
        ("VAF", [0.4]),
    )
    for key, value in field_values:
        data.add_sample_data("sample1", key, value)

    self.assertTrue(data.has_sample("sample1"))
    self.assertEqual(data.genotypes(), {"sample1": GenotypeCall("1|0")})
    self.assertEqual(data.get_field("sample1", 'GT'), GenotypeCall("1|0"))
    self.assertEqual(data.get_field("sample1", 'PL'), [3000, 0, 3000])

    view = data.get_genotype_data("sample1")
    self.assertEqual(view.genotype(), GenotypeCall("1|0"))
    self.assertEqual(view['GT'], GenotypeCall("1|0"))
    self.assertEqual(view['PL'], [3000, 0, 3000])
def test_should_add_sample_data(self):
    """A value added under a declared key can be read back with get_field."""
    key = 'genotype_key1'
    data = SampleData([key], ['sample_name'])
    data.add_sample_data('sample_name', key, [1])
    stored = data.get_field('sample_name', key)
    self.assertEqual(stored, [1])
def test_gets_exact_values_if_key_is_NV(self):
    """A value stored under 'NV' is returned unchanged by get_variant_support."""
    name = 'sample_name'
    data = SampleData(['NV'], [name])
    data.add_sample_data(name, 'NV', [100])
    support = data.get_variant_support(name)
    self.assertEqual(support, [100])
def test_gets_exact_values_if_key_is_PL(self):
    """Likelihoods [-0.1, -0.2, -0.3] passed to the setter are read back from 'PL' as [1, 2, 3]."""
    name = 'sample_name'
    data = SampleData(['PL'], [name])
    data.set_genotype_likelihoods(name, [-0.1, -0.2, -0.3])
    stored = data.get_field(name, 'PL')
    self.assertEqual(stored, [1, 2, 3])
def test_gets_dot_if_key_is_GL(self):
    """A '.' stored under 'GL' is returned unchanged by get_genotype_likelihoods."""
    name = 'sample_name'
    data = SampleData(['GL'], [name])
    data.add_sample_data(name, 'GL', '.')
    likelihoods = data.get_genotype_likelihoods(name)
    self.assertEqual(likelihoods, '.')