def test_create_processed_variant_no_change(self):
    """Verify the factory output when alternate INFO splitting is turned off.

    With `split_alternate_allele_info_fields=False`, the processed variant is
    expected to differ from the original variant only in that the INFO fields
    are copied into `_non_alt_info` and `_alternate_datas` carries nothing but
    the alternate bases.
    """
    sample_variant = self._get_sample_variant()
    empty_header = vcf_header_parser.HeaderFields({}, {})
    spy_factory = _CounterSpyFactory()
    variant_factory = processed_variant.ProcessedVariantFactory(
        empty_header,
        split_alternate_allele_info_fields=False,
        counter_factory=spy_factory)
    actual = variant_factory.create_processed_variant(sample_variant)

    # Hand-assemble the processed variant the factory is expected to produce.
    expected = processed_variant.ProcessedVariant(sample_variant)
    expected._non_alt_info = {'A1': 'some data', 'A2': ['data1', 'data2']}
    expected._alternate_datas = [
        processed_variant.AlternateBaseData(bases) for bases in ['A', 'TT']
    ]
    self.assertEqual([expected], [actual])

    # Exactly one variant is counted; the annotation match/mismatch counters
    # stay at zero.
    expected_counts = [
        (CEnum.VARIANT, 1),
        (CEnum.ANNOTATION_ALT_MATCH, 0),
        (CEnum.ANNOTATION_ALT_MISMATCH, 0),
    ]
    for counter_name, expected_value in expected_counts:
        self.assertEqual(
            spy_factory.counter_map[counter_name.value].get_value(),
            expected_value)
def test_no_header_fields(self):
    """Schema built from empty INFO/FORMAT headers matches the default fields."""
    empty_header = vcf_header_parser.HeaderFields({}, {})
    expected_fields = self._generate_expected_fields()
    variant_factory = processed_variant.ProcessedVariantFactory(empty_header)
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        empty_header, variant_factory)
    self._assert_fields_equal(expected_fields, actual_schema)
def test_bigquery_field_name_sanitize(self):
    """Check that header IDs are sanitized into the expected schema field names.

    The INFO and FORMAT IDs below include leading underscores, a leading
    digit and punctuation characters; the expected field names passed to
    `_generate_expected_fields` are the sanitized forms the generated schema
    must contain.
    """
    infos = OrderedDict([
        ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
        ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
        ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
        ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
        ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src', 'v')),
        # This is an INFO entry, so it is built with `Info` (not `Format`)
        # like every other value in this mapping.
        ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v')),
    ])
    formats = OrderedDict([
        ('a^b', Format('a^b', 1, 'String', 'desc')),
        ('OK_format_09', Format('OK_format_09', 1, 'String', 'desc')),
    ])
    header_fields = vcf_header_parser.HeaderFields(infos, formats)
    self._assert_fields_equal(
        self._generate_expected_fields(
            alt_fields=['I_A'],
            call_fields=['a_b', 'OK_format_09'],
            info_fields=[
                'field__', 'field__A', 'field_0a', 'A_B_C', 'OK_info_09'
            ]),
        bigquery_vcf_schema.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(header_fields)))
def test_info_header_fields(self):
    """Check schema generation from INFO headers, with and without alt splitting.

    With the default factory, the `field_counts['A']` INFO fields (IA, IA2)
    are expected under the alternate-bases record; with
    `split_alternate_allele_info_fields=False` they are expected among the
    regular INFO fields. The field types and modes of the second schema are
    verified as well.
    """
    infos = OrderedDict()
    infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
    infos['I2'] = Info('I2', 2, 'Integer', 'desc', 'src', 'v')
    infos['IA'] = Info('IA', field_counts['A'], 'Float', 'desc', 'src', 'v')
    infos['IU'] = Info(
        'IU', field_counts['.'], 'Character', 'desc', 'src', 'v')
    infos['IG'] = Info('IG', field_counts['G'], 'String', 'desc', 'src', 'v')
    infos['I0'] = Info('I0', 0, 'Flag', 'desc', 'src', 'v')
    infos['IA2'] = Info('IA2', field_counts['A'], 'Float', 'desc', 'src', 'v')
    # END should not be included in the generated schema.
    infos['END'] = Info('END', 1, 'Integer', 'Special END key', 'src', 'v')
    header_fields = vcf_header_parser.HeaderFields(infos, {})

    # Default factory: per-alternate INFO fields go to the alternate record.
    expected_split = self._generate_expected_fields(
        alt_fields=['IA', 'IA2'],
        info_fields=['I1', 'I2', 'IU', 'IG', 'I0'])
    split_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._assert_fields_equal(expected_split, split_schema)

    # Test with split_alternate_allele_info_fields=False.
    no_split_factory = processed_variant.ProcessedVariantFactory(
        header_fields, split_alternate_allele_info_fields=False)
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields, no_split_factory)
    expected_no_split = self._generate_expected_fields(
        info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2'])
    self._assert_fields_equal(expected_no_split, actual_schema)

    # Verify types and modes.
    expected_type_modes = {
        'I1': (TableFieldConstants.TYPE_STRING,
               TableFieldConstants.MODE_NULLABLE),
        'I2': (TableFieldConstants.TYPE_INTEGER,
               TableFieldConstants.MODE_REPEATED),
        'IA': (TableFieldConstants.TYPE_FLOAT,
               TableFieldConstants.MODE_REPEATED),
        'IU': (TableFieldConstants.TYPE_STRING,
               TableFieldConstants.MODE_REPEATED),
        'IG': (TableFieldConstants.TYPE_STRING,
               TableFieldConstants.MODE_REPEATED),
        'I0': (TableFieldConstants.TYPE_BOOLEAN,
               TableFieldConstants.MODE_NULLABLE),
        'IA2': (TableFieldConstants.TYPE_FLOAT,
                TableFieldConstants.MODE_REPEATED),
    }
    for schema_field in actual_schema.fields:
        type_and_mode = expected_type_modes.get(schema_field.name)
        if type_and_mode is None:
            # Fields without an entry in the table are not checked here.
            continue
        expected_type, expected_mode = type_and_mode
        self.assertEqual(expected_type, schema_field.type)
        self.assertEqual(expected_mode, schema_field.mode)
def _get_row_list_from_variant(self, variant, **kwargs):
    """Convert `variant` into the list of BigQuery rows it produces.

    Args:
      variant: The variant to wrap in a processed variant (built with empty
        header fields) before row generation.
      **kwargs: Forwarded unchanged to
        `bigquery_vcf_schema.get_rows_from_variant`.

    Returns:
      A list holding every row yielded by `get_rows_from_variant`.
    """
    # TODO(bashir2): To make this more of a "unit" test, we should create
    # ProcessedVariant instances directly (instead of Variant) and avoid calling
    # create_processed_variant here. Then we should also add cases that
    # have annotation fields.
    empty_header = vcf_header_parser.HeaderFields({}, {})
    variant_factory = processed_variant.ProcessedVariantFactory(empty_header)
    processed = variant_factory.create_processed_variant(variant)
    rows = bigquery_vcf_schema.get_rows_from_variant(processed, **kwargs)
    return list(rows)
def test_create_processed_variant_move_alt_info(self):
    """Check that per-alternate INFO data is moved onto each alternate.

    With `split_alternate_allele_info_fields=True`, each alternate of the
    sample variant is expected to carry its own 'A2' value, and 'A2' must no
    longer be present in the non-alternate INFO map.
    """
    variant = self._get_sample_variant()
    header_fields = vcf_header_parser.HeaderFields({}, {})
    factory = processed_variant.ProcessedVariantFactory(
        header_fields, split_alternate_allele_info_fields=True)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('A')
    alt1._info = {'A2': 'data1'}
    alt2 = processed_variant.AlternateBaseData('TT')
    alt2._info = {'A2': 'data2'}
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `dict.has_key()` no longer exists in Python 3; a membership assertion
    # works on both major versions and reports the offending key on failure.
    self.assertNotIn('A2', proc_var.non_alt_info)
def _get_sample_variant_and_header_with_csq(self):
    """Build the sample variant with a 'CSQ' INFO field and a matching header.

    Returns:
      A `(variant, header_fields)` tuple. `variant` is the sample variant with
      three pipe-separated 'CSQ' entries added to its INFO map, and
      `header_fields` holds a single 'CSQ' INFO definition whose description
      lists the names Allele|Consequence|IMPACT|SYMBOL|Gene.
    """
    variant = self._get_sample_variant()
    csq_entries = ['A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3']
    variant.info['CSQ'] = vcfio.VariantInfo(data=csq_entries, field_count='.')

    csq_definition = parser._Info(
        id=None,
        num='.',
        type=None,
        desc='some desc Allele|Consequence|IMPACT|SYMBOL|Gene',
        source=None,
        version=None)
    header_fields = vcf_header_parser.HeaderFields(
        infos={'CSQ': csq_definition}, formats={})
    return variant, header_fields
def test_variant_merger_modify_schema(self):
    """Check that a variant merger's schema changes show up in the output.

    The schema is generated with `_DummyVariantMergeStrategy` as the variant
    merger, and the expected INFO fields include 'ADDED_BY_MERGER' in addition
    to the header-defined 'I1'.
    """
    infos = OrderedDict()
    infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
    infos['IA'] = Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')
    formats = OrderedDict()
    formats['F1'] = Format('F1', 1, 'String', 'desc')
    header_fields = vcf_header_parser.HeaderFields(infos, formats)

    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1'],
        info_fields=['I1', 'ADDED_BY_MERGER'])
    variant_factory = processed_variant.ProcessedVariantFactory(header_fields)
    merged_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        variant_factory,
        variant_merger=_DummyVariantMergeStrategy())
    self._assert_fields_equal(expected_fields, merged_schema)
def test_convert_variant_to_bigquery_row(self):
    """Run `ConvertToBigQueryTableRow` in a test pipeline over three variants.

    Each sample helper supplies a variant together with the row it is
    expected to become; the pipeline output must equal those rows.
    """
    samples = [
        self._get_sample_variant_1(),
        self._get_sample_variant_2(),
        self._get_sample_variant_3(),
    ]
    variants = [variant for variant, _ in samples]
    expected_rows = [row for _, row in samples]

    header_fields = vcf_header_parser.HeaderFields({}, {})
    # A separate factory is created for every variant.
    processed_variants = [
        processed_variant.ProcessedVariantFactory(
            header_fields).create_processed_variant(variant)
        for variant in variants
    ]

    pipeline = TestPipeline()
    converted = pipeline | Create(processed_variants)
    bigquery_rows = (
        converted | 'ConvertToRow' >> ParDo(ConvertToBigQueryTableRow()))
    assert_that(bigquery_rows, equal_to(expected_rows))
    pipeline.run()
def _get_row_list_from_variant(self, variant, schema_descriptor=None,
                               allow_incompatible_records=False, **kwargs):
    """Convert `variant` into the list of BigQuery rows it produces.

    Args:
      variant: The variant to wrap in a processed variant (built with empty
        header fields) before row generation.
      schema_descriptor: Schema descriptor handed to the row generator. A
        falsy value is replaced by a fresh `MockSchemaDescriptor`.
      allow_incompatible_records: Passed through positionally to
        `bigquery_vcf_schema.get_rows_from_variant`.
      **kwargs: Forwarded unchanged to `get_rows_from_variant`.

    Returns:
      A list holding every row yielded by `get_rows_from_variant`.
    """
    # TODO(bashir2): To make this more of a "unit" test, we should create
    # ProcessedVariant instances directly (instead of Variant) and avoid calling
    # create_processed_variant here. Then we should also add cases that
    # have annotation fields.
    empty_header = vcf_header_parser.HeaderFields({}, {})
    variant_factory = processed_variant.ProcessedVariantFactory(empty_header)
    processed = variant_factory.create_processed_variant(variant)
    descriptor = (
        schema_descriptor
        or mock_bigquery_schema_descriptor.MockSchemaDescriptor())
    rows = bigquery_vcf_schema.get_rows_from_variant(
        processed,
        descriptor,
        self._conflict_resolver,
        allow_incompatible_records,
        **kwargs)
    return list(rows)
def test_info_and_format_header_fields(self):
    """Check schema generation when both INFO and FORMAT headers are present.

    The expected call fields are F1, F2 and FU only; the GT and PS FORMAT
    entries are not expected as separate call fields.
    """
    infos = OrderedDict()
    infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
    infos['IA'] = Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')

    # GT and PS should not be set as they're already included in special
    # 'genotype' and 'phaseset' fields.
    formats = OrderedDict()
    formats['F1'] = Format('F1', 1, 'String', 'desc')
    formats['F2'] = Format('F2', 2, 'Integer', 'desc')
    formats['FU'] = Format('FU', field_counts['.'], 'Float', 'desc')
    formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
    formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

    header_fields = vcf_header_parser.HeaderFields(infos, formats)
    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1', 'F2', 'FU'],
        info_fields=['I1'])
    variant_factory = processed_variant.ProcessedVariantFactory(header_fields)
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields, variant_factory)
    self._assert_fields_equal(expected_fields, actual_schema)
def test_create_processed_variant_annotation_alt_allele_num(self):
    """Check annotation-to-alternate matching driven by ALLELE_NUM.

    The variant carries six 'CSQ' annotations that all name 'T' as their
    allele. With `use_allele_num=True`, the annotations with ALLELE_NUM 1 and
    2 are expected on the first and second alternate respectively, and the
    remaining four (ALLELE_NUM '0', '3', 'TEST' and empty) are expected to be
    counted as incorrect.
    """
    csq_info = parser._Info(
        id=None,
        num='.',
        type=None,
        desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
        source=None,
        version=None)
    header_fields = vcf_header_parser.HeaderFields(
        infos={'CSQ': csq_info}, formats={})
    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        # The following represent a SNV and an insertion, resp.
        alternate_bases=['T', 'CT'],
        names=['rs1'],
        quality=2,
        filters=['PASS'],
        # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
        # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
        # But because there is ALLELE_NUM there should be no ambiguity.
        # The last four annotations have incorrect ALLELE_NUMs.
        info={
            'CSQ': vcfio.VariantInfo(
                data=[
                    'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                    'T|C5|I5|TEST', 'T|C6|I6|'
                ],
                field_count='.')
        })
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        use_allele_num=True,
        minimal_match=True,  # This should be ignored by the factory method.
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('T')
    alt1._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'T',
            'Consequence': 'C1',
            'IMPACT': 'I1',
            'ALLELE_NUM': '1'
        }]
    }
    alt2 = processed_variant.AlternateBaseData('CT')
    alt2._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'T',
            'Consequence': 'C2',
            'IMPACT': 'I2',
            'ALLELE_NUM': '2'
        }]
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `dict.has_key()` no longer exists in Python 3; a membership assertion
    # works on both major versions and reports the offending key on failure.
    self.assertNotIn('CSQ', proc_var.non_alt_info)

    # Expected counter values, checked in the order listed.
    expected_counts = [
        (CEnum.VARIANT, 1),
        (CEnum.ANNOTATION_ALT_MATCH, 2),
        (CEnum.ANNOTATION_ALT_MISMATCH, 0),
        (CEnum.ANNOTATION_ALT_MINIMAL_MATCH, 0),
        (CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS, 0),
        (CEnum.ALLELE_NUM_INCORRECT, 4),
    ]
    for counter_name, expected_value in expected_counts:
        self.assertEqual(
            counter_factory.counter_map[counter_name.value].get_value(),
            expected_value)