def test_generate_header_fields_from_schema_none_mode(self):
    """Schema fields created without a mode become single-valued INFO headers.

    The same check is made for a name treated as non-reserved ('field') and
    for one treated as reserved ('AA').
    """
    string_type = bigquery_util.TableFieldConstants.TYPE_STRING
    for field_name in ('field', 'AA'):
        # `mode` is deliberately not passed to the schema field.
        table_schema = bigquery.TableSchema()
        table_schema.fields.append(
            bigquery.TableFieldSchema(
                name=field_name, type=string_type, description='desc'))

        actual_header = schema_converter.generate_header_fields_from_schema(
            table_schema)

        wanted_header = vcf_header_io.VcfHeader(
            infos=OrderedDict([
                (field_name,
                 createInfo(field_name, 1, 'String', 'desc', None, None)),
            ]),
            formats=OrderedDict())
        self.assertEqual(actual_header, wanted_header)
def _get_sample_variant_and_header_with_csq(self, additional_infos=None):
    """Provides a simple `Variant` and `VcfHeader` with info fields.

    The variant comes from `self._get_sample_variant()` and gets three
    pipe-delimited entries stored under the 'CSQ' info key. The header
    declares the info fields 'A1', 'A2' and 'CSQ'.

    Args:
      additional_infos: A list of tuples of the format (key, `Info`) to be
        added to the `VcfHeader`. A key that is already present is
        overwritten. `None` (the default) adds nothing.

    Returns:
      A `(variant, header_fields)` tuple.
    """
    variant = self._get_sample_variant()
    variant.info['CSQ'] = [
        'A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3'
    ]
    infos = OrderedDict([
        ('A1', createInfo('A1', 1, '.', 'desc', None, None)),
        ('A2', createInfo('A2', 'A', '.', 'desc', None, None)),
        ('CSQ', createInfo(
            'CSQ', '.', '.',
            'some desc Allele|Consequence|IMPACT|SYMBOL|Gene',
            None, None)),
    ])
    if additional_infos is not None:
        for key, value in additional_infos:
            infos[key] = value
    header_fields = vcf_header_io.VcfHeader(infos=infos)
    return variant, header_fields
def test_add_info_fields_reserved_field(self):
    """Checks description handling for the 'AA' field in `_add_info_fields`.

    A non-empty BigQuery description is expected verbatim in the resulting
    info; an empty one is expected to be replaced by 'Ancestral allele'.
    """
    constants = bigquery_util.TableFieldConstants

    # Case 1: the schema field carries its own description.
    described_field = bigquery.TableFieldSchema(
        name='AA',
        type=constants.TYPE_STRING,
        mode=constants.MODE_NULLABLE,
        description='bigquery desc')
    collected = OrderedDict()
    schema_converter._add_info_fields(described_field, collected)
    wanted = OrderedDict([
        ('AA', createInfo('AA', 1, 'String', 'bigquery desc', None, None))
    ])
    self.assertEqual(collected, wanted)

    # Case 2: the schema field has an empty description.
    undescribed_field = bigquery.TableFieldSchema(
        name='AA',
        type=constants.TYPE_STRING,
        mode=constants.MODE_NULLABLE,
        description='')
    collected = OrderedDict()
    schema_converter._add_info_fields(undescribed_field, collected)
    wanted = OrderedDict([
        ('AA', createInfo('AA', 1, 'String', 'Ancestral allele', None, None))
    ])
    self.assertEqual(collected, wanted)
def test_infer_annotation_pipeline(self):
    """Runs `InferHeaderFields` on two variants and checks annotation types.

    Only annotation type inference is requested (`infer_headers=False`, with
    'CSQ' passed as `annotation_fields_to_infer`). The expected header holds
    one `CSQ_<name>_TYPE` info per annotation sub-field: Float for 'Gene',
    String for 'Position' and Float for 'Score'.
    """
    anno_fields = ['CSQ']
    header = self._get_sample_header_fields(with_annotation=True)
    variant1 = self._get_sample_variant_1()
    variant1.info['CSQ'] = [
        'A|1|100|1.2', 'A|2|101|1.3', 'A|12|start|0', 'TT|13|end|7'
    ]
    variant2 = self._get_sample_variant_1()
    variant2.info['CSQ'] = [
        'A|1|100|', 'A|2|101|', 'A|1.2|102|0', 'TT|1.3|103|7'
    ]
    desc = 'Inferred type field for annotation {}.'
    expected = vcf_header_io.VcfHeader(
        infos={
            'CSQ_Gene_TYPE': createInfo(
                'CSQ_Gene_TYPE', 1, 'Float', desc.format('Gene')),
            'CSQ_Position_TYPE': createInfo(
                'CSQ_Position_TYPE', 1, 'String', desc.format('Position')),
            'CSQ_Score_TYPE': createInfo(
                'CSQ_Score_TYPE', 1, 'Float', desc.format('Score')),
        })

    with TestPipeline() as p:
        inferred_headers = (
            p
            | Create([variant1, variant2])
            | 'InferAnnotationTypes' >> infer_headers.InferHeaderFields(
                defined_headers=header,
                infer_headers=False,
                annotation_fields_to_infer=anno_fields))
        assert_that(inferred_headers,
                    asserts.header_fields_equal_ignore_order([expected]))
        # No explicit p.run(): leaving the `with` block already runs the
        # pipeline, so calling run() as well would execute it a second time.
def test_variant_merger_modify_schema(self):
    """Validates the schema generated when a variant merger is supplied.

    Besides the header-defined 'I1', the expected info fields include
    'ADDED_BY_MERGER' -- presumably contributed by
    `_DummyVariantMergeStrategy` (defined outside this block).
    """
    header_fields = vcf_header_io.VcfHeader(
        infos=OrderedDict([
            ('I1', createInfo('I1', 1, 'String', 'desc', 'src', 'v')),
            ('IA', createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')),
        ]),
        formats=OrderedDict([
            ('F1', createFormat('F1', 1, 'String', 'desc')),
        ]))

    wanted_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1'],
        info_fields=['I1', 'ADDED_BY_MERGER'])
    generated_schema = schema_converter.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields),
        variant_merger=_DummyVariantMergeStrategy())

    self._validate_schema(wanted_fields, generated_schema)
def test_generate_header_fields_from_schema(self):
    """Converts the sample table schema and compares the resulting header."""
    actual_header = schema_converter.generate_header_fields_from_schema(
        bigquery_schema_util.get_sample_table_schema())

    # Insertion order below mirrors the order expected in the header.
    wanted_infos = OrderedDict()
    wanted_infos['AF'] = createInfo('AF', 'A', 'Float', 'desc', None, None)
    wanted_infos['AA'] = createInfo('AA', 1, 'String', 'desc', None, None)
    wanted_infos['IFR'] = createInfo('IFR', '.', 'Float', 'desc', None, None)
    wanted_infos['IS'] = createInfo('IS', 1, 'String', 'desc', None, None)

    wanted_formats = OrderedDict()
    wanted_formats['FB'] = createFormat('FB', 1, 'String', 'desc')
    wanted_formats['GQ'] = createFormat('GQ', 1, 'Integer', 'desc')

    wanted_header = vcf_header_io.VcfHeader(
        infos=wanted_infos, formats=wanted_formats)
    self.assertEqual(actual_header, wanted_header)
def test_report_multiple_files(self):
    """Checks the conflict report when one definition is shared by two files.

    'NS' is defined as Float in 'file1' and 'file2' and as Integer in
    'file3'; the expected report has one row per file.
    """
    definitions = VcfHeaderDefinitions()
    definitions._infos = {
        'NS': {
            Definition(1, 'Float'): ['file1', 'file2'],
            Definition(1, 'Integer'): ['file3'],
        }
    }
    resolved = VcfHeader(infos=OrderedDict([
        ('NS', createInfo('NS', 1, 'Float', 'Number samples', None, None)),
    ]))

    delimiter = preprocess_reporter._DELIMITER
    report_lines = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        delimiter.join(
            ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n']),
        delimiter.join([' ', ' ', ' ', 'file2', ' \n']),
        delimiter.join([' ', ' ', 'num=1 type=Integer', 'file3', ' \n']),
        '\n',
    ]
    self._generate_report_and_assert_contents_equal(
        report_lines, definitions, resolved)
def test_infer_annotation_types_with_multiple_annotation_fields(self):
    """Infers annotation types for both 'CSQ' and 'CSQ_VT'.

    Both fields carry the same two annotation strings, and each is expected
    to produce Integer 'Gene', Integer 'Position' and Float 'Score' infos.
    """
    # NOTE(review): another method with this exact name is visible further
    # down in this source; if both live in one class the later definition
    # shadows this one -- confirm.
    annotation_values = ['A|1|100|1.2', 'A|2|101|1.3']

    header_infos = self._get_annotation_infos()
    header_infos['CSQ_VT'] = createInfo(
        'CSQ_VT', 'A', 'String',
        'Annotations from VEP. Format: Allele|Gene|Position|Score',
        'source', 'v')

    variant = self._get_sample_variant_1()
    # Separate list objects per key, as in a hand-written fixture.
    variant.info['CSQ_VT'] = list(annotation_values)
    variant.info['CSQ'] = list(annotation_values)

    actual = infer_headers_util.infer_info_fields(
        variant,
        vcf_header_io.VcfHeader(infos=header_infos),
        False,
        ['CSQ', 'CSQ_VT'])

    wanted = {
        'CSQ_Gene_TYPE':
            self._get_inferred_info('CSQ', 'Gene', 'Integer'),
        'CSQ_Position_TYPE':
            self._get_inferred_info('CSQ', 'Position', 'Integer'),
        'CSQ_Score_TYPE':
            self._get_inferred_info('CSQ', 'Score', 'Float'),
        'CSQ_VT_Gene_TYPE':
            self._get_inferred_info('CSQ_VT', 'Gene', 'Integer'),
        'CSQ_VT_Position_TYPE':
            self._get_inferred_info('CSQ_VT', 'Position', 'Integer'),
        'CSQ_VT_Score_TYPE':
            self._get_inferred_info('CSQ_VT', 'Score', 'Float'),
    }
    self.assertDictEqual(wanted, actual)
def test_report_conflicted_and_inferred_headers(self):
    """Checks a report that has both a conflict and an inferred-header part.

    'NS' conflicts between 'file1' (Float) and 'file2' (Integer); the FORMAT
    field 'DP' is passed as an inferred header.
    """
    definitions = VcfHeaderDefinitions()
    definitions._infos = {
        'NS': {
            Definition(1, 'Float'): ['file1'],
            Definition(1, 'Integer'): ['file2'],
        }
    }
    ns_infos = OrderedDict([
        ('NS', createInfo('NS', 1, 'Float', 'Number samples', None, None)),
    ])
    dp_formats = OrderedDict([
        ('DP', createFormat('DP', 2, 'Float', 'Total Depth')),
    ])
    # The same formats mapping backs both the resolved and inferred headers.
    resolved = VcfHeader(infos=ns_infos, formats=dp_formats)
    inferred = VcfHeader(formats=dp_formats)

    delimiter = preprocess_reporter._DELIMITER
    report_lines = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        delimiter.join(
            ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n']),
        delimiter.join([' ', ' ', 'num=1 type=Integer', 'file2', ' \n']),
        '\n',
        preprocess_reporter._InconsistencyType.INFERRED_HEADERS + '\n',
        preprocess_reporter._HeaderLine.INFERRED_FIELD_HEADER + '\n',
        delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
        '\n',
    ]
    self._generate_report_and_assert_contents_equal(
        report_lines, definitions, resolved, inferred)
def test_infer_annotation_types_with_multiple_annotation_fields(self):
    """Infers annotation types for 'CSQ' and 'CSQ_VT' via `_InferHeaderFields`.

    The header comes from `_get_sample_header_fields` with an extra 'CSQ_VT'
    info, and both annotation fields are given the same two values. The
    inferred infos must be exactly the six `*_TYPE` entries listed in
    `expected_types`, with the given types.
    """
    anno_fields = ['CSQ', 'CSQ_VT']
    csq_vt = [
        ('CSQ_VT', createInfo(
            'CSQ_VT', 'A', 'String',
            'Annotations from VEP. Format: Allele|Gene|Position|Score',
            'source', 'v'))
    ]
    header = self._get_sample_header_fields(with_annotation=csq_vt)
    variant = self._get_sample_variant_1()
    variant.info['CSQ_VT'] = ['A|1|100|1.2', 'A|2|101|1.3']
    variant.info['CSQ'] = ['A|1|100|1.2', 'A|2|101|1.3']

    infer_header_fields = infer_headers._InferHeaderFields(
        False, anno_fields)
    inferred_headers = next(infer_header_fields.process(variant, header))

    expected_types = {
        'CSQ_Gene_TYPE': 'Integer',
        'CSQ_Position_TYPE': 'Integer',
        'CSQ_Score_TYPE': 'Float',
        'CSQ_VT_Gene_TYPE': 'Integer',
        'CSQ_VT_Position_TYPE': 'Integer',
        'CSQ_VT_Score_TYPE': 'Float',
    }
    # A single dict comparison reports missing, unexpected and mistyped
    # fields together as an assertion failure, rather than raising KeyError
    # on an unexpected key or failing on a bare length mismatch.
    inferred_types = {
        key: info['type'] for key, info in inferred_headers.infos.items()
    }
    self.assertDictEqual(expected_types, inferred_types)
def test_add_info_fields_reserved_field_schema_compatibility(self):
    """Checks 'AA' schema conflicts in `_add_info_fields`.

    An INTEGER-typed 'AA' and a REPEATED-mode 'AA' must each raise
    `ValueError`. With `allow_incompatible_schema=True` the REPEATED field
    is accepted and yields an info with number '.'.
    """
    constants = bigquery_util.TableFieldConstants

    wrong_type_field = bigquery.TableFieldSchema(
        name='AA',
        type=constants.TYPE_INTEGER,
        mode=constants.MODE_NULLABLE,
        description='desc')
    self.assertRaises(
        ValueError,
        schema_converter._add_info_fields, wrong_type_field, OrderedDict())

    wrong_mode_field = bigquery.TableFieldSchema(
        name='AA',
        type=constants.TYPE_STRING,
        mode=constants.MODE_REPEATED,
        description='desc')
    self.assertRaises(
        ValueError,
        schema_converter._add_info_fields, wrong_mode_field, OrderedDict())

    # The same conflicting field goes through once the check is relaxed.
    relaxed_infos = OrderedDict()
    schema_converter._add_info_fields(
        wrong_mode_field, relaxed_infos, allow_incompatible_schema=True)
    wanted = OrderedDict([
        ('AA', createInfo('AA', '.', 'String', 'desc', None, None)),
    ])
    self.assertEqual(relaxed_infos, wanted)
def test_create_alt_bases_field_schema_types(self):
    """Checks the types of the 'CSQ' sub-fields in the alt-bases schema.

    Extra `CSQ_<name>_TYPE` infos are added to the sample header, and every
    sub-field of the generated 'CSQ' record is compared against a
    name -> BigQuery type map.
    """
    type_hints = [
        ('CSQ_Allele_TYPE', 'String'),
        ('CSQ_Consequence_TYPE', 'Integer'),
        ('CSQ_IMPACT_TYPE', 'Integer'),
        ('CSQ_SYMBOL_TYPE', 'Float'),
    ]
    extra_infos = [
        (info_id, createInfo(info_id, 1, info_type, 'desc', None, None))
        for info_id, info_type in type_hints
    ]
    header_fields = self._get_sample_variant_and_header_with_csq(
        additional_infos=extra_infos)[1]

    # Replace the '.' type placeholders in the sample header with 'String'.
    for header_info in header_fields.infos.values():
        if header_info['type'] == '.':
            header_info['type'] = 'String'

    variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'])
    alt_schema = variant_factory.create_alt_bases_field_schema()
    csq_record = [
        candidate for candidate in alt_schema.fields
        if candidate.name == 'CSQ'
    ][0]

    wanted_types = {
        'CSQ': 'RECORD',
        'allele': 'STRING',
        'Consequence': 'INTEGER',
        'IMPACT': 'INTEGER',
        'SYMBOL': 'FLOAT',
        'Gene': 'STRING',
    }
    for sub_field in csq_record.fields:
        self.assertEqual(sub_field.type, wanted_types[sub_field.name])
def test_header_fields_inferred_from_two_variants(self):
    """Infers all header fields from two variants with no defined headers.

    `InferHeaderFields` is applied with `defined_headers=None` and
    `infer_headers=True`; the single expected `VcfHeader` holds the infos and
    formats listed below, compared ignoring order.
    """
    with TestPipeline() as p:
        variant_1 = self._get_sample_variant_1()
        variant_2 = self._get_sample_variant_2()
        inferred_headers = (
            p
            | Create([variant_1, variant_2])
            | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                defined_headers=None, infer_headers=True))
        expected_infos = {
            'IS': createInfo('IS', 1, 'String', ''),
            'ISI': createInfo('ISI', 1, 'Integer', ''),
            'ISF': createInfo('ISF', 1, 'Float', ''),
            'IF': createInfo('IF', 1, 'Float', ''),
            'IB': createInfo('IB', 0, 'Flag', ''),
            'IA': createInfo('IA', '.', 'Integer', ''),
            'IS_2': createInfo('IS_2', 1, 'String', ''),
        }
        expected_formats = {
            'FI': createFormat('FI', 1, 'Integer', ''),
            'FU': createFormat('FU', '.', 'Float', ''),
            'FI_2': createFormat('FI_2', 1, 'Integer', ''),
        }
        expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                           formats=expected_formats)
        assert_that(inferred_headers,
                    asserts.header_fields_equal_ignore_order([expected]))
        # No explicit p.run(): leaving the `with` block already runs the
        # pipeline, so calling run() as well would execute it a second time.
def _get_sample_header_fields(self, with_annotation=False):
    """Builds a small `VcfHeader` containing info and format fields.

    Args:
      with_annotation: Either a bool or a list of (key, `Info`) tuples. Any
        truthy value adds a 'CSQ' annotation info; when a list is given, its
        entries are added as further info fields.

    Returns:
      A `VcfHeader` holding the sample infos and formats.
    """
    infos = OrderedDict()
    # NOTE(review): the 'IS' and 'IB' entries are created with ID 'I1', and
    # the 'IB' Flag uses num 1 -- looks like a copy/paste slip; confirm
    # before changing, since other tests may depend on these values.
    infos['IS'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
    infos['ISI'] = createInfo('ISI', 1, 'Integer', 'desc', 'src', 'v')
    infos['ISF'] = createInfo('ISF', 1, 'Float', 'desc', 'src', 'v')
    infos['IF'] = createInfo('IF', 1, 'Float', 'desc', 'src', 'v')
    infos['IB'] = createInfo('I1', 1, 'Flag', 'desc', 'src', 'v')
    infos['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

    if with_annotation:
        infos['CSQ'] = createInfo(
            'CSQ', '.', 'String',
            'Annotations from VEP. Format: Allele|Gene|Position|Score',
            'src', 'v')
        if isinstance(with_annotation, list):
            infos.update(with_annotation)

    formats = OrderedDict()
    formats['FS'] = createFormat('FS', 1, 'String', 'desc')
    formats['FI'] = createFormat('FI', 2, 'Integer', 'desc')
    formats['FU'] = createFormat('FU', '.', 'Float', 'desc')
    formats['GT'] = createFormat('GT', 2, 'Integer', 'Special GT key')
    formats['PS'] = createFormat('PS', 1, 'Integer', 'Special PS key')
    return vcf_header_io.VcfHeader(infos=infos, formats=formats)
def test_vcf_header_to_schema_to_vcf_header(self):
    """Round-trips a header through a schema and expects an equal header.

    All infos and formats in the starting header use number '.'.
    """
    source_infos = OrderedDict()
    source_infos['I1'] = createInfo('I1', '.', 'String', 'desc', None, None)
    source_infos['IA'] = createInfo('IA', '.', 'Integer', 'desc', None, None)

    source_formats = OrderedDict()
    source_formats['F1'] = createFormat('F1', '.', 'String', 'desc')
    source_formats['F2'] = createFormat('F2', '.', 'Integer', 'desc')
    source_formats['FU'] = createFormat('FU', '.', 'Float', 'desc')

    original_header = vcf_header_io.VcfHeader(
        infos=source_infos, formats=source_formats)

    # Header -> schema -> header.
    intermediate_schema = schema_converter.generate_schema_from_header_fields(
        original_header,
        processed_variant.ProcessedVariantFactory(original_header))
    round_tripped = schema_converter.generate_header_fields_from_schema(
        intermediate_schema)

    self.assertEqual(original_header, round_tripped)
def test_info_header_fields(self):
    """Validates schemas generated from a header that has only info fields.

    The schema is checked with the default factory, then with
    `split_alternate_allele_info_fields=False`; for the latter, field types
    and modes are verified as well.
    """
    header_fields = vcf_header_io.VcfHeader(infos=OrderedDict([
        ('I1', createInfo('I1', 1, 'String', 'desc', 'src', 'v')),
        ('I2', createInfo('I2', 2, 'Integer', 'desc', 'src', 'v')),
        ('IA', createInfo('IA', 'A', 'Float', 'desc', 'src', 'v')),
        ('IU', createInfo('IU', '.', 'Character', 'desc', 'src', 'v')),
        ('IG', createInfo('IG', 'G', 'String', 'desc', 'src', 'v')),
        ('I0', createInfo('I0', 0, 'Flag', 'desc', 'src', 'v')),
        ('IA2', createInfo('IA2', 'A', 'Float', 'desc', 'src', 'v')),
        # END should not be included in the generated schema.
        ('END',
         createInfo('END', 1, 'Integer', 'Special END key', 'src', 'v')),
    ]))

    # Default factory: 'IA' and 'IA2' are expected among the alt fields.
    self._validate_schema(
        self._generate_expected_fields(
            alt_fields=['IA', 'IA2'],
            info_fields=['I1', 'I2', 'IU', 'IG', 'I0']),
        schema_converter.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(header_fields)))

    # Test with split_alternate_allele_info_fields=False.
    unsplit_factory = processed_variant.ProcessedVariantFactory(
        header_fields, split_alternate_allele_info_fields=False)
    actual_schema = schema_converter.generate_schema_from_header_fields(
        header_fields, unsplit_factory)
    self._validate_schema(
        self._generate_expected_fields(
            info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2']),
        actual_schema)

    # Verify types and modes.
    wanted_type_modes = {
        'I1': (TableFieldConstants.TYPE_STRING,
               TableFieldConstants.MODE_NULLABLE),
        'I2': (TableFieldConstants.TYPE_INTEGER,
               TableFieldConstants.MODE_REPEATED),
        'IA': (TableFieldConstants.TYPE_FLOAT,
               TableFieldConstants.MODE_REPEATED),
        'IU': (TableFieldConstants.TYPE_STRING,
               TableFieldConstants.MODE_REPEATED),
        'IG': (TableFieldConstants.TYPE_STRING,
               TableFieldConstants.MODE_REPEATED),
        'I0': (TableFieldConstants.TYPE_BOOLEAN,
               TableFieldConstants.MODE_NULLABLE),
        'IA2': (TableFieldConstants.TYPE_FLOAT,
                TableFieldConstants.MODE_REPEATED),
    }
    for schema_field in actual_schema.fields:
        if schema_field.name not in wanted_type_modes:
            continue
        wanted_type, wanted_mode = wanted_type_modes[schema_field.name]
        self.assertEqual(wanted_type, schema_field.type)
        self.assertEqual(wanted_mode, schema_field.mode)
def test_add_info_fields_from_alternate_bases_reserved_field(self):
    """Checks description handling for 'AF' nested in the alt-bases record.

    A non-empty BigQuery description is expected verbatim; an empty one is
    expected to be replaced by the allele-frequency text asserted below.
    """
    constants = bigquery_util.TableFieldConstants

    def build_alt_bases_record(af_description):
        # Alternate-bases record holding a single nullable float 'AF' field.
        record = bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
            type=constants.TYPE_RECORD,
            mode=constants.MODE_REPEATED,
            description='One record for each alternate base (if any).')
        record.fields.append(
            bigquery.TableFieldSchema(
                name='AF',
                type=constants.TYPE_FLOAT,
                mode=constants.MODE_NULLABLE,
                description=af_description))
        return record

    # With a description on the nested field.
    record_with_desc = build_alt_bases_record('bigquery desc')
    infos_with_desc = OrderedDict()
    schema_converter._add_info_fields(record_with_desc, infos_with_desc)
    wanted = OrderedDict([
        ('AF', createInfo('AF', 'A', 'Float', 'bigquery desc', None, None))
    ])
    self.assertEqual(infos_with_desc, wanted)

    # With an empty description on the nested field.
    record_no_desc = build_alt_bases_record('')
    infos_no_desc = OrderedDict()
    schema_converter._add_info_fields(record_no_desc, infos_no_desc)
    wanted = OrderedDict([
        ('AF', createInfo(
            'AF', 'A', 'Float',
            'Allele frequency for each ALT allele in the same order '
            'as listed (estimated from primary data, not called genotypes',
            None, None))
    ])
    self.assertEqual(infos_no_desc, wanted)
def test_create_processed_variant_annotation_alt_allele_num(self):
    """Checks annotation-to-ALT matching when ALLELE_NUM is used.

    Six 'CSQ' annotations are supplied for two alternate bases. Only the two
    with ALLELE_NUM 1 and 2 are expected on the alternates, and the counters
    are expected to record 2 matches and 4 incorrect ALLELE_NUM values.
    """
    header_fields = vcf_header_io.VcfHeader(infos={
        'CSQ': createInfo(
            None, '.', '.',
            'some desc Allele|Consequence|IMPACT|ALLELE_NUM',
            source=None, version=None),
    })
    variant = vcfio.Variant(
        reference_name='19', start=11, end=12, reference_bases='C',
        # The following represent a SNV and an insertion, resp.
        alternate_bases=['T', 'CT'],
        names=['rs1'], quality=2,
        filters=['PASS'],
        # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
        # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
        # But because there is ALLELE_NUM there should be no ambiguity.
        # The last four annotations have incorrect ALLELE_NUMs.
        info={'CSQ': ['T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                      'T|C5|I5|TEST', 'T|C6|I6|']})
    spy_factory = _CounterSpyFactory()
    variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        use_allele_num=True,
        minimal_match=True,  # This should be ignored by the factory method.
        counter_factory=spy_factory)
    processed = variant_factory.create_processed_variant(variant)

    snv_alt = processed_variant.AlternateBaseData('T')
    snv_alt._info = {
        'CSQ': [
            {annotation_parser.ANNOTATION_ALT: 'T', 'Consequence': 'C1',
             'IMPACT': 'I1', 'ALLELE_NUM': '1'}]
    }
    insertion_alt = processed_variant.AlternateBaseData('CT')
    insertion_alt._info = {
        'CSQ': [
            {annotation_parser.ANNOTATION_ALT: 'T', 'Consequence': 'C2',
             'IMPACT': 'I2', 'ALLELE_NUM': '2'}]
    }
    self.assertEqual(processed.alternate_data_list, [snv_alt, insertion_alt])
    self.assertFalse('CSQ' in processed.non_alt_info)

    def counted(counter_name):
        # Value recorded by the spy counter registered under this name.
        return spy_factory.counter_map[counter_name].get_value()

    self.assertEqual(counted(CEnum.VARIANT.value), 1)
    self.assertEqual(counted(CEnum.ANNOTATION_ALT_MATCH.value), 2)
    self.assertEqual(counted(CEnum.ANNOTATION_ALT_MISMATCH.value), 0)
    self.assertEqual(
        counted(CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value), 0)
    self.assertEqual(counted(CEnum.ALLELE_NUM_INCORRECT.value), 4)
def test_add_info_fields_non_reserved_field(self):
    """A nullable string field with an empty description keeps it empty."""
    constants = bigquery_util.TableFieldConstants
    plain_field = bigquery.TableFieldSchema(
        name='non_reserved_info',
        type=constants.TYPE_STRING,
        mode=constants.MODE_NULLABLE,
        description='')

    collected = OrderedDict()
    schema_converter._add_info_fields(plain_field, collected)

    wanted = OrderedDict()
    wanted['non_reserved_info'] = createInfo(
        'non_reserved_info', 1, 'String', '', None, None)
    self.assertEqual(collected, wanted)
def test_info_and_format_header_fields(self):
    """Validates the schema for a header with both infos and formats.

    The expected call fields leave out 'GT' and 'PS' even though the header
    defines them.
    """
    header_infos = OrderedDict()
    header_infos['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
    header_infos['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

    # GT and PS should not be set as they're already included in special
    # 'genotype' and 'phaseset' fields.
    header_formats = OrderedDict()
    header_formats['F1'] = createFormat('F1', 1, 'String', 'desc')
    header_formats['F2'] = createFormat('F2', 2, 'Integer', 'desc')
    header_formats['FU'] = createFormat('FU', '.', 'Float', 'desc')
    header_formats['GT'] = createFormat('GT', 2, 'Integer', 'Special GT key')
    header_formats['PS'] = createFormat('PS', 1, 'Integer', 'Special PS key')

    header_fields = vcf_header_io.VcfHeader(
        infos=header_infos, formats=header_formats)

    wanted_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1', 'F2', 'FU'],
        info_fields=['I1'])
    generated_schema = schema_converter.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._validate_schema(wanted_fields, generated_schema)
def test_bigquery_field_name_sanitize(self):
    """Validates field names generated from awkward header keys.

    Header keys such as '_', '0a', 'A-B*C' and 'a^b' are expected to show up
    in the schema as 'field__', 'field_0a', 'A_B_C' and 'a_b'; keys such as
    'OK_info_09' are expected unchanged.
    """
    header_fields = vcf_header_io.VcfHeader(
        infos=OrderedDict([
            ('_', createInfo('_', 1, 'String', 'desc', 'src', 'v')),
            ('_A', createInfo('_A', 1, 'String', 'desc', 'src', 'v')),
            ('0a', createInfo('0a', 1, 'String', 'desc', 'src', 'v')),
            ('A-B*C', createInfo('A-B*C', 1, 'String', 'desc', 'src', 'v')),
            ('I-A', createInfo('I-A', 'A', 'Float', 'desc', 'src', 'v')),
            ('OK_info_09', createInfo('OK_info_09', 1, 'String', 'desc')),
        ]),
        formats=OrderedDict([
            ('a^b', createFormat('a^b', 1, 'String', 'desc')),
            ('OK_format_09',
             createFormat('OK_format_09', 1, 'String', 'desc')),
        ]))

    wanted_fields = self._generate_expected_fields(
        alt_fields=['I_A'],
        call_fields=['a_b', 'OK_format_09'],
        info_fields=[
            'field__', 'field__A', 'field_0a', 'A_B_C', 'OK_info_09'
        ])
    generated_schema = schema_converter.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._validate_schema(wanted_fields, generated_schema)
def _get_annotation_infos(self):
    """Returns an ordered mapping of sample infos, led by a 'CSQ' annotation.

    Returns:
      An `OrderedDict` from info key to the object built by `createInfo`.
    """
    infos = OrderedDict()
    infos['CSQ'] = createInfo(
        'CSQ', '.', 'String',
        'Annotations from VEP. Format: Allele|Gene|Position|Score',
        'src', 'v')
    # NOTE(review): the 'IS' and 'IB' entries are created with ID 'I1', and
    # the 'IB' Flag uses num 1 -- looks like a copy/paste slip; confirm
    # before changing, since other tests may depend on these values.
    infos['IS'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
    infos['ISI'] = createInfo('ISI', 1, 'Integer', 'desc', 'src', 'v')
    infos['ISF'] = createInfo('ISF', 1, 'Float', 'desc', 'src', 'v')
    infos['IF'] = createInfo('IF', 1, 'Float', 'desc', 'src', 'v')
    infos['IB'] = createInfo('I1', 1, 'Flag', 'desc', 'src', 'v')
    infos['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')
    return infos
def test_generate_header_fields_from_schema_invalid_description(self):
    """A newline in a schema description is expected to come back as a space."""
    constants = bigquery_util.TableFieldConstants
    table_schema = bigquery.TableSchema()
    table_schema.fields.append(
        bigquery.TableFieldSchema(
            name='invalid_description',
            type=constants.TYPE_STRING,
            mode=constants.MODE_NULLABLE,
            description='Desc\nThis is added intentionally.'))

    actual_header = schema_converter.generate_header_fields_from_schema(
        table_schema)

    wanted_infos = OrderedDict()
    wanted_infos['invalid_description'] = createInfo(
        'invalid_description', 1, 'String',
        'Desc This is added intentionally.', None, None)
    wanted_header = vcf_header_io.VcfHeader(
        infos=wanted_infos, formats=OrderedDict())
    self.assertEqual(actual_header, wanted_header)
def test_infer_info_fields_combined_conflicts(self):
    """Expects both an undefined field and a mismatched field to be inferred.

    'IF' is absent from the defined infos and 'IA' is defined as
    ('A', Integer); the result is expected to hold 'IF' as (1, Float) and
    'IA' as ('.', Float).
    """
    variant = self._get_sample_variant_info_ia_cardinality_mismatch()
    defined_specs = [
        ('IS', 1, 'String'),
        ('ISI', 1, 'Integer'),
        ('ISF', 1, 'Float'),
        ('IB', 0, 'Flag'),
        ('IA', 'A', 'Integer'),
    ]
    defined_infos = {
        key: createInfo(key, num, vcf_type, '')
        for key, num, vcf_type in defined_specs
    }

    actual = infer_headers_util.infer_info_fields(
        variant,
        vcf_header_io.VcfHeader(infos=defined_infos),
        infer_headers=True)

    wanted = {
        'IF': createInfo('IF', 1, 'Float', ''),
        'IA': createInfo('IA', '.', 'Float', ''),
    }
    self.assertEqual(wanted, actual)
def test_infer_mismatched_info_field_correct_type_list(self):
    """Expects 'IA' defined as ('.', Integer) to be corrected to Float."""
    variant = self._get_sample_variant_info_ia_float_in_list()
    defined_specs = [
        ('IS', 1, 'String'),
        ('ISI', 1, 'Integer'),
        ('ISF', 1, 'Float'),
        ('IF', 1, 'Float'),
        ('IB', 0, 'Flag'),
        ('IA', '.', 'Integer'),
    ]
    defined_infos = {
        key: createInfo(key, num, vcf_type, '')
        for key, num, vcf_type in defined_specs
    }

    ia_value = variant.info.get('IA')
    ia_definition = vcf_header_io.VcfHeader(
        infos=defined_infos).infos.get('IA')
    alt_count = len(variant.alternate_bases)
    corrected = infer_headers_util._infer_mismatched_info_field(
        'IA', ia_value, ia_definition, alt_count)

    wanted = createInfo('IA', '.', 'Float', '')
    self.assertEqual(wanted, corrected)
def test_add_info_fields_from_alternate_bases_non_reserved_field(self):
    """A field nested in the alt-bases record becomes an info with number 'A'."""
    constants = bigquery_util.TableFieldConstants
    nested_field = bigquery.TableFieldSchema(
        name='non_reserved',
        type=constants.TYPE_FLOAT,
        mode=constants.MODE_NULLABLE,
        description='bigquery desc')
    alt_bases_record = bigquery.TableFieldSchema(
        name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
        type=constants.TYPE_RECORD,
        mode=constants.MODE_REPEATED,
        description='One record for each alternate base (if any).')
    alt_bases_record.fields.append(nested_field)

    collected = OrderedDict()
    schema_converter._add_info_fields(alt_bases_record, collected)

    wanted = OrderedDict()
    wanted['non_reserved'] = createInfo(
        'non_reserved', 'A', 'Float', 'bigquery desc', None, None)
    self.assertEqual(collected, wanted)
def test_infer_info_fields_no_conflicts(self):
    """Expects no inferred infos for the sample variant and these definitions."""
    variant = self._get_sample_variant_1()
    defined_specs = [
        ('IS', 1, 'String'),
        ('ISI', 1, 'Integer'),
        ('ISF', 1, 'Float'),
        ('IF', 1, 'Float'),
        ('IB', 0, 'Flag'),
        ('IA', 'A', 'Float'),
    ]
    defined_infos = {
        key: createInfo(key, num, vcf_type, '')
        for key, num, vcf_type in defined_specs
    }

    actual = infer_headers_util.infer_info_fields(
        variant,
        vcf_header_io.VcfHeader(infos=defined_infos),
        infer_headers=True)

    self.assertEqual({}, actual)
def test_generate_header_fields_from_schema_schema_compatibility(self):
    """Checks an INTEGER 'AA' field with and without the compatibility check.

    Conversion must raise `ValueError` by default. With
    `allow_incompatible_schema=True` it must succeed and keep the Integer
    type in the resulting info.
    """
    constants = bigquery_util.TableFieldConstants
    conflicting_schema = bigquery.TableSchema()
    conflicting_schema.fields.append(
        bigquery.TableFieldSchema(
            name='AA',
            type=constants.TYPE_INTEGER,
            mode=constants.MODE_NULLABLE,
            description='desc'))

    self.assertRaises(
        ValueError,
        schema_converter.generate_header_fields_from_schema,
        conflicting_schema)

    relaxed_header = schema_converter.generate_header_fields_from_schema(
        conflicting_schema, allow_incompatible_schema=True)
    wanted_infos = OrderedDict()
    wanted_infos['AA'] = createInfo('AA', 1, 'Integer', 'desc', None, None)
    wanted_header = vcf_header_io.VcfHeader(
        infos=wanted_infos, formats=OrderedDict())
    self.assertEqual(relaxed_header, wanted_header)
def test_report_no_conflicts(self):
    """Expects the 'no conflicts' report when each key has one definition.

    'NS' has a single INFO definition ('file1') and a single FORMAT
    definition ('file2').
    """
    definitions = VcfHeaderDefinitions()
    definitions._infos = {'NS': {Definition(1, 'Float'): ['file1']}}
    definitions._formats = {'NS': {Definition(1, 'Float'): ['file2']}}

    resolved = VcfHeader(
        infos=OrderedDict([
            ('NS',
             createInfo('NS', 1, 'Integer', 'Number samples', None, None)),
        ]),
        formats=OrderedDict([
            ('NS', createFormat('NS', 1, 'Float', 'Number samples')),
        ]))

    report_lines = ['No Header Conflicts Found.\n', '\n']
    self._generate_report_and_assert_contents_equal(
        report_lines, definitions, resolved)
def test_defined_fields_filtered_two_variants(self):
    """Expects only fields missing from the defined headers to be inferred.

    The sample header is supplied to `InferHeaderFields` as a singleton side
    input; of the fields found in the two variants, only 'IS_2' (INFO) and
    'FI_2' (FORMAT) are expected in the inferred header.
    """
    # Only INFO and FORMAT in the first variants are already defined in the
    # header section of the VCF files.
    with TestPipeline() as p:
        vcf_headers = self._get_sample_header_fields()
        vcf_headers_side_input = p | 'vcf_header' >> Create([vcf_headers])
        variant_1 = self._get_sample_variant_1()
        variant_2 = self._get_sample_variant_2()
        inferred_headers = (
            p
            | Create([variant_1, variant_2])
            | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                pvalue.AsSingleton(vcf_headers_side_input),
                infer_headers=True))
        expected_infos = {'IS_2': createInfo('IS_2', 1, 'String', '')}
        expected_formats = {'FI_2': createFormat('FI_2', 1, 'Integer', '')}
        expected = vcf_header_io.VcfHeader(infos=expected_infos,
                                           formats=expected_formats)
        assert_that(inferred_headers,
                    asserts.header_fields_equal_ignore_order([expected]))
        # No explicit p.run(): leaving the `with` block already runs the
        # pipeline, so calling run() as well would execute it a second time.