def _get_sample_variant_and_header_with_csq(self, additional_infos=None):
    """Builds a sample `Variant` carrying `CSQ` data and a matching `VcfHeader`.

    The variant is obtained from `self._get_sample_variant()` and receives
    three `CSQ` annotation strings. The header defines the `A1`, `A2` and
    `CSQ` info fields.

    Args:
      additional_infos: An optional list of tuples of the format
        (key, `Info`) to be added to the `VcfHeader`. An entry whose key is
        already present overwrites the existing definition.

    Returns:
      A tuple of (variant, `VcfHeader`).
    """
    sample_variant = self._get_sample_variant()
    sample_variant.info['CSQ'] = [
        'A|C1|I1|S1|G1',
        'TT|C2|I2|S2|G2',
        'A|C3|I3|S3|G3',
    ]

    csq_description = 'some desc Allele|Consequence|IMPACT|SYMBOL|Gene'
    header_infos = OrderedDict()
    header_infos['A1'] = Info('A1', 1, None, '', None, None)
    header_infos['A2'] = Info(
        'A2', parser.field_counts['A'], None, '', None, None)
    header_infos['CSQ'] = Info(
        'CSQ', parser.field_counts['.'], None, csq_description, None, None)

    if additional_infos is not None:
        for extra_key, extra_info in additional_infos:
            header_infos[extra_key] = extra_info

    return sample_variant, vcf_header_io.VcfHeader(infos=header_infos)
def test_infer_annotation_pipeline(self):
    """Infers `CSQ` annotation types from two variants through a pipeline.

    Header inference itself is switched off (`infer_headers=False`), so only
    the annotation type fields are expected in the output.
    """
    annotation_fields = ['CSQ']
    defined_header = self._get_sample_header_fields(with_annotation=True)

    first_variant = self._get_sample_variant_1()
    first_variant.info['CSQ'] = [
        'A|1|100|1.2', 'A|2|101|1.3', 'A|12|start|0', 'TT|13|end|7'
    ]
    second_variant = self._get_sample_variant_1()
    second_variant.info['CSQ'] = [
        'A|1|100|', 'A|2|101|', 'A|1.2|102|0', 'TT|1.3|103|7'
    ]

    description = 'Inferred type field for annotation {}.'
    expected_infos = {}
    for info_id, info_type, annotation_name in (
            ('CSQ_Gene_TYPE', 'Float', 'Gene'),
            ('CSQ_Position_TYPE', 'String', 'Position'),
            ('CSQ_Score_TYPE', 'Float', 'Score')):
        expected_infos[info_id] = Info(
            info_id, 1, info_type, description.format(annotation_name),
            '', '')
    expected_header = vcf_header_io.VcfHeader(infos=expected_infos)

    with TestPipeline() as pipeline:
        variants = pipeline | Create([first_variant, second_variant])
        inferred_headers = (
            variants
            | 'InferAnnotationTypes' >> infer_headers.InferHeaderFields(
                defined_headers=defined_header,
                infer_headers=False,
                annotation_fields_to_infer=annotation_fields))
        assert_that(
            inferred_headers,
            asserts.header_fields_equal_ignore_order([expected_header]))
        pipeline.run()
def test_bigquery_field_name_sanitize(self):
    """Checks the field names the schema generator emits for unusual header ids.

    The expectations below pair ids such as `_`, `0a`, `A-B*C`, `I-A` and
    `a^b` with the names `field__`, `field_0a`, `A_B_C`, `I_A` and `a_b`,
    while `OK_info_09` and `OK_format_09` are expected unchanged.
    """
    infos = OrderedDict([
        ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
        ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
        ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
        ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
        ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src', 'v')),
        # This is an INFO entry, so it is built with `Info` (it used to be a
        # `Format`, inconsistent with every other entry of this mapping).
        ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v')),
    ])
    formats = OrderedDict([
        ('a^b', Format('a^b', 1, 'String', 'desc')),
        ('OK_format_09', Format('OK_format_09', 1, 'String', 'desc')),
    ])
    header_fields = vcf_header_parser.HeaderFields(infos, formats)

    expected_fields = self._generate_expected_fields(
        alt_fields=['I_A'],
        call_fields=['a_b', 'OK_format_09'],
        info_fields=['field__', 'field__A', 'field_0a', 'A_B_C',
                     'OK_info_09'])
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._assert_fields_equal(expected_fields, actual_schema)
def test_header_fields_inferred_from_two_variants(self):
    """Infers info and format headers from two variants with no defined header."""
    with TestPipeline() as pipeline:
        first_variant = self._get_sample_variant_1()
        second_variant = self._get_sample_variant_2()
        infer_transform = infer_undefined_headers.InferUndefinedHeaderFields(
            defined_headers=None)
        inferred_headers = (
            pipeline
            | Create([first_variant, second_variant])
            | 'InferUndefinedHeaderFields' >> infer_transform)

        expected_header = vcf_header_io.VcfHeader(
            infos={
                'IS': Info('IS', 1, 'String', '', '', ''),
                'IF': Info('IF', 0, 'Flag', '', '', ''),
                'IA': Info('IA', None, 'Float', '', '', ''),
                'IS_2': Info('IS_2', 1, 'String', '', '', ''),
            },
            formats={
                'FI': Format('FI', 1, 'Integer', ''),
                'FU': Format('FU', None, 'Float', ''),
                'FI_2': Format('FI_2', 1, 'Integer', ''),
            })

        assert_that(inferred_headers, equal_to([expected_header]))
        pipeline.run()
def _get_sample_header_fields(self):
    """Provides a simple `VcfHeader` with info and format fields.

    Returns:
      A `VcfHeader` defining the info fields `IS`, `IF` and `IA` and the
      format fields `FS`, `FI`, `FU`, `GT` and `PS`.
    """
    # Each `Info` id matches the key it is stored under; `IS` and `IF`
    # previously both carried the copy-pasted id 'I1'.
    # NOTE(review): `IF` is a Flag declared with count 1, whereas the VCF
    # spec gives flags a count of 0 -- left unchanged pending confirmation.
    infos = OrderedDict([
        ('IS', Info('IS', 1, 'String', 'desc', 'src', 'v')),
        ('IF', Info('IF', 1, 'Flag', 'desc', 'src', 'v')),
        ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')),
    ])
    formats = OrderedDict([
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ])
    return vcf_header_io.VcfHeader(infos=infos, formats=formats)
def test_create_alt_bases_field_schema_types(self):
    """Checks the types of the `CSQ` sub-fields in the alt-bases schema.

    The `CSQ_*_TYPE` info entries added to the header declare the type of
    each annotation; the sub-fields of the generated `CSQ` field are compared
    against the expected name-to-type map.
    """
    type_by_id = zip(
        ['CSQ_Allele_TYPE', 'CSQ_Consequence_TYPE', 'CSQ_IMPACT_TYPE',
         'CSQ_SYMBOL_TYPE'],
        ['String', 'Integer', 'Integer', 'Float'])
    extra_infos = [
        (info_id, Info(info_id, 1, info_type, '', None, None))
        for info_id, info_type in type_by_id
    ]
    _, header_fields = self._get_sample_variant_and_header_with_csq(
        additional_infos=extra_infos)

    # Header entries created without a type are given 'String' here.
    for header_info in header_fields.infos.values():
        if header_info['type'] is None:
            header_info['type'] = 'String'

    variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'])
    alt_schema = variant_factory.create_alt_bases_field_schema()
    csq_candidates = [
        candidate for candidate in alt_schema.fields
        if candidate.name == 'CSQ'
    ]
    csq_field = csq_candidates[0]

    expected_name_type_map = {
        'CSQ': 'RECORD',
        'allele': 'STRING',
        'Consequence': 'INTEGER',
        'IMPACT': 'INTEGER',
        'SYMBOL': 'FLOAT',
        'Gene': 'STRING',
    }
    for sub_field in csq_field.fields:
        expected_type = expected_name_type_map[sub_field.name]
        self.assertEqual(sub_field.type, expected_type)
def _infer_undefined_info_fields(self, variant, defined_headers):
    """Collects definitions for info fields that the headers do not define.

    Args:
      variant (:class:`vcfio.Variant`): variant obj.
      defined_headers (:class:`vcf_header_io.VcfHeader`): header fields
        defined in header section of VCF files. When falsy, every info field
        of `variant` is treated as undefined.

    Returns:
      A dict of (info_key(str), :class:`Info`) for any info field in
      `variant` that is not defined in the header.

    Raises:
      ValueError: if an undefined info key has already been collected for
        this variant.
    """
    undefined_infos = {}
    for key, info_entry in variant.info.iteritems():
        value = info_entry.data
        if defined_headers and key in defined_headers.infos:
            # Already described by the header; nothing to infer.
            continue
        if key in undefined_infos:
            raise ValueError(
                'Invalid VCF file. Duplicate INFO field in variant {}'.
                format(variant))
        inferred_count = self._get_field_count(value)
        inferred_type = self._get_field_type(value)
        undefined_infos[key] = Info(
            key,
            inferred_count,
            inferred_type,
            '',  # NO_DESCRIPTION
            '',  # UNKNOWN_SOURCE
            '')  # UNKNOWN_VERSION
    return undefined_infos
def test_report_multiple_files(self):
    """Checks the conflict report when one definition comes from two files."""
    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    ns_definitions = {
        Definition(1, 'Float'): ['file1', 'file2'],
        Definition(1, 'Integer'): ['file3'],
    }
    header_definitions._infos = {'NS': ns_definitions}

    resolved_infos = OrderedDict()
    resolved_infos['NS'] = Info(
        'NS', 1, 'Float', 'Number samples', None, None)
    resolved_headers = VcfHeader(infos=resolved_infos)

    delimiter = preprocess_reporter._DELIMITER
    conflict_rows = [
        ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n'],
        [' ', ' ', ' ', 'file2', ' \n'],
        [' ', ' ', 'num=1 type=Integer', 'file3', ' \n'],
    ]
    expected = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
    ]
    expected.extend(delimiter.join(row) for row in conflict_rows)
    expected.append('\n')

    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers)
def _infer_annotation_type_info_fields(self, variant, infos, defined_headers):
    # type: (vcfio.Variant, Dict[str, Info], vcf_header_io.VcfHeader) -> None
    """Adds inferred annotation type info fields to `infos` in place.

    For every annotation field listed in `self._annotation_fields_to_infer`
    that is present in `variant`, each annotation header is turned into an
    `Info` whose ID is derived from the annotation field and header name and
    whose TYPE is the type inferred from the values. A variant may carry
    several values per annotation header, so their types are merged through
    a conflict resolver before the `Info` is created.

    Args:
      variant: variant object
      infos: dict of (info_key, `Info`) for any info field in `variant` that
        is not defined in the header or the definition mismatches the field
        values. Updated by this method.
      defined_headers: header fields defined in header section of VCF files.

    Raises:
      ValueError: if the annotation names and the per-annotation value lists
        do not all have the same length.
    """

    def _assert_uniform_lengths(names, values):
        lengths = {len(v) for v in values}
        lengths.add(len(names))
        if len(lengths) == 1:
            return
        message = (
            'Annotation lists have inconsistent lengths: {}.\nnames={}\n'
            'values={}').format(lengths, names, values)
        raise ValueError(message)

    resolver = vcf_field_conflict_resolver.FieldConflictResolver(
        resolve_always=True)
    for field in self._annotation_fields_to_infer:
        if field not in variant.info:
            continue
        description = defined_headers.infos[field][_HeaderKeyConstants.DESC]
        annotation_names = annotation_parser.extract_annotation_names(
            description)
        # First element (ALT) is ignored, since its type is hard-coded as
        # string
        per_annotation_values = []
        for annotation in variant.info[field]:
            parsed = annotation_parser.extract_annotation_list_with_alt(
                annotation)
            per_annotation_values.append(parsed[1:])
        _assert_uniform_lengths(annotation_names, per_annotation_values)

        # Transpose so that each tuple holds all values of one header.
        values_by_name = zip(*per_annotation_values)
        for name, values in zip(annotation_names, values_by_name):
            merged_type = None
            for value in values:
                if not value:
                    # Empty values contribute nothing to the merged type.
                    continue
                merged_type = resolver.resolve_attribute_conflict(
                    _HeaderKeyConstants.TYPE,
                    merged_type,
                    self._get_field_type(value))
                if merged_type == _HeaderTypeConstants.STRING:
                    # Stop at String; remaining values are not inspected.
                    break
            key_id = get_inferred_annotation_type_header_key(field, name)
            infos[key_id] = Info(
                key_id,
                1,  # field count
                merged_type,
                'Inferred type field for annotation {}.'.format(name),
                '',  # UNKNOWN_SOURCE
                '')  # UNKNOWN_VERSION
def test_infer_annotation_types_with_multiple_annotation_fields(self):
    """Infers annotation types when two annotation fields are requested."""
    annotation_fields = ['CSQ', 'CSQ_VT']
    csq_vt_info = Info(
        'CSQ_VT', -1, 'String',
        'Annotations from VEP. Format: Allele|Gene|Position|Score',
        'source', 'v')
    defined_header = self._get_sample_header_fields(
        with_annotation=[('CSQ_VT', csq_vt_info)])

    variant = self._get_sample_variant_1()
    variant.info['CSQ_VT'] = ['A|1|100|1.2', 'A|2|101|1.3']
    variant.info['CSQ'] = ['A|1|100|1.2', 'A|2|101|1.3']

    infer_header_fields = infer_headers._InferHeaderFields(
        False, annotation_fields)
    inferred_headers = next(
        infer_header_fields.process(variant, defined_header))

    expected_types = {
        'CSQ_Gene_TYPE': 'Integer',
        'CSQ_Position_TYPE': 'Integer',
        'CSQ_Score_TYPE': 'Float',
        'CSQ_VT_Gene_TYPE': 'Integer',
        'CSQ_VT_Position_TYPE': 'Integer',
        'CSQ_VT_Score_TYPE': 'Float',
    }
    for info_key, info in inferred_headers.infos.iteritems():
        self.assertEqual(info['type'], expected_types[info_key])
    self.assertEqual(len(expected_types), len(inferred_headers.infos))
def test_report_conflicted_and_inferred_headers(self):
    """Checks a report holding both a header conflict and an undefined header."""
    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    ns_definitions = {
        Definition(1, 'Float'): ['file1'],
        Definition(1, 'Integer'): ['file2'],
    }
    header_definitions._infos = {'NS': ns_definitions}

    infos = OrderedDict()
    infos['NS'] = Info('NS', 1, 'Float', 'Number samples', None, None)
    formats = OrderedDict()
    formats['DP'] = Format('DP', 2, 'Float', 'Total Depth')
    resolved_headers = VcfHeader(infos=infos, formats=formats)
    inferred_headers = VcfHeader(formats=formats)

    delimiter = preprocess_reporter._DELIMITER
    conflicts_section = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        delimiter.join([
            'NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n']),
        delimiter.join([' ', ' ', 'num=1 type=Integer', 'file2', ' \n']),
        '\n',
    ]
    undefined_section = [
        preprocess_reporter._InconsistencyType.UNDEFINED_HEADERS + '\n',
        preprocess_reporter._HeaderLine.UNDEFINED_FIELD_HEADER + '\n',
        delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
        '\n',
    ]
    expected = conflicts_section + undefined_section

    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers, inferred_headers)
def test_info_header_fields(self):
    """Checks schema generation for info fields of various counts and types.

    Covers the default factory, the factory created with
    `split_alternate_allele_info_fields=False`, and the expected type and
    mode of each generated info field.
    """
    infos = OrderedDict([
        ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
        ('I2', Info('I2', 2, 'Integer', 'desc', 'src', 'v')),
        ('IA', Info('IA', field_counts['A'], 'Float', 'desc', 'src', 'v')),
        ('IU', Info('IU', field_counts['.'], 'Character', 'desc', 'src',
                    'v')),
        ('IG', Info('IG', field_counts['G'], 'String', 'desc', 'src', 'v')),
        ('I0', Info('I0', 0, 'Flag', 'desc', 'src', 'v')),
        ('IA2', Info('IA2', field_counts['A'], 'Float', 'desc', 'src', 'v')),
        # END should not be included in the generated schema.
        ('END', Info('END', 1, 'Integer', 'Special END key', 'src', 'v')),
    ])
    header_fields = vcf_header_parser.HeaderFields(infos, {})

    # Default factory: `IA` and `IA2` are expected under the alt fields.
    expected_default = self._generate_expected_fields(
        alt_fields=['IA', 'IA2'],
        info_fields=['I1', 'I2', 'IU', 'IG', 'I0'])
    default_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._assert_fields_equal(expected_default, default_schema)

    # Test with split_alternate_allele_info_fields=False.
    unsplit_factory = processed_variant.ProcessedVariantFactory(
        header_fields, split_alternate_allele_info_fields=False)
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields, unsplit_factory)
    expected_unsplit = self._generate_expected_fields(
        info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2'])
    self._assert_fields_equal(expected_unsplit, actual_schema)

    # Verify types and modes.
    nullable = TableFieldConstants.MODE_NULLABLE
    repeated = TableFieldConstants.MODE_REPEATED
    expected_type_modes = {
        'I1': (TableFieldConstants.TYPE_STRING, nullable),
        'I2': (TableFieldConstants.TYPE_INTEGER, repeated),
        'IA': (TableFieldConstants.TYPE_FLOAT, repeated),
        'IU': (TableFieldConstants.TYPE_STRING, repeated),
        'IG': (TableFieldConstants.TYPE_STRING, repeated),
        'I0': (TableFieldConstants.TYPE_BOOLEAN, nullable),
        'IA2': (TableFieldConstants.TYPE_FLOAT, repeated),
    }
    for schema_field in actual_schema.fields:
        if schema_field.name not in expected_type_modes:
            continue
        expected_type, expected_mode = expected_type_modes[schema_field.name]
        self.assertEqual(expected_type, schema_field.type)
        self.assertEqual(expected_mode, schema_field.mode)
def test_variant_merger_modify_schema(self):
    """Checks the schema generated when a variant merger is supplied.

    The expected fields include `ADDED_BY_MERGER` alongside the header's own
    info field.
    """
    infos = OrderedDict()
    infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
    infos['IA'] = Info(
        'IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')
    formats = OrderedDict()
    formats['F1'] = Format('F1', 1, 'String', 'desc')
    header_fields = vcf_header_parser.HeaderFields(infos, formats)

    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1'],
        info_fields=['I1', 'ADDED_BY_MERGER'])
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields),
        variant_merger=_DummyVariantMergeStrategy())
    self._assert_fields_equal(expected_fields, actual_schema)
def _get_sample_header_fields(self, with_annotation=False):
    """Provides a simple `VcfHeader` with info and format fields.

    Args:
      with_annotation: Can be bool or list of tuples. Any truthy value adds
        the `CSQ` annotation info field. Tuples should be additional
        annotation fields in the format (key, `Info`); they are added after
        `CSQ`.

    Returns:
      A `VcfHeader` with the sample info and format definitions.
    """
    # Each `Info` id matches the key it is stored under; `IS` and `IB`
    # previously both carried the copy-pasted id 'I1'.
    # NOTE(review): `ISI` is declared with type 'Int' rather than 'Integer',
    # and `IB` is a Flag with count 1 -- both left unchanged because tests
    # outside this view may depend on them; confirm before changing.
    infos = OrderedDict([
        ('IS', Info('IS', 1, 'String', 'desc', 'src', 'v')),
        ('ISI', Info('ISI', 1, 'Int', 'desc', 'src', 'v')),
        ('ISF', Info('ISF', 1, 'Float', 'desc', 'src', 'v')),
        ('IF', Info('IF', 1, 'Float', 'desc', 'src', 'v')),
        ('IB', Info('IB', 1, 'Flag', 'desc', 'src', 'v')),
        ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')),
    ])
    if with_annotation:
        infos['CSQ'] = Info(
            'CSQ', field_counts['.'], 'String',
            'Annotations from VEP. Format: Allele|Gene|Position|Score',
            'src', 'v')
        if isinstance(with_annotation, list):
            for key, value in with_annotation:
                infos[key] = value
    formats = OrderedDict([
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ])
    return vcf_header_io.VcfHeader(infos=infos, formats=formats)
def test_pipeline(self):
    """Runs header inference in a pipeline over two mismatching variants.

    The defined header omits `IF` and `FU`; the expected output holds those
    plus corrected definitions for `IA` and `FI`.
    """
    defined_infos = {
        'IS': Info('IS', 1, 'String', '', '', ''),
        'ISI': Info('ISI', 1, 'Integer', '', '', ''),
        'ISF': Info('ISF', 1, 'Float', '', '', ''),
        'IB': Info('IB', 0, 'Flag', '', '', ''),
        'IA': Info('IA', -1, 'Integer', '', '', ''),
    }
    defined_formats = OrderedDict([
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ])

    with TestPipeline() as pipeline:
        cardinality_variant = (
            self._get_sample_variant_info_ia_cardinality_mismatch())
        float_value_variant = self._get_sample_variant_format_fi_float_value()
        variants = pipeline | Create(
            [cardinality_variant, float_value_variant])
        inferred_headers = (
            variants
            | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                defined_headers=vcf_header_io.VcfHeader(
                    infos=defined_infos, formats=defined_formats),
                allow_incompatible_records=True))

        expected_header = vcf_header_io.VcfHeader(
            infos={
                'IA': Info('IA', None, 'Float', '', '', ''),
                'IF': Info('IF', 1, 'Float', '', '', ''),
            },
            formats={
                'FI': Format('FI', 2, 'Float', 'desc'),
                'FU': Format('FU', None, 'Float', ''),
            })
        assert_that(
            inferred_headers,
            asserts.header_fields_equal_ignore_order([expected_header]))
        pipeline.run()
def test_info_and_format_header_fields(self):
    """Checks schema generation when both info and format fields are defined."""
    infos = OrderedDict()
    infos['I1'] = Info('I1', 1, 'String', 'desc', 'src', 'v')
    infos['IA'] = Info(
        'IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')

    # GT and PS should not be set as they're already included in special
    # 'genotype' and 'phaseset' fields.
    formats = OrderedDict()
    formats['F1'] = Format('F1', 1, 'String', 'desc')
    formats['F2'] = Format('F2', 2, 'Integer', 'desc')
    formats['FU'] = Format('FU', field_counts['.'], 'Float', 'desc')
    formats['GT'] = Format('GT', 2, 'Integer', 'Special GT key')
    formats['PS'] = Format('PS', 1, 'Integer', 'Special PS key')

    header_fields = vcf_header_parser.HeaderFields(infos, formats)
    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1', 'F2', 'FU'],
        info_fields=['I1'])
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._assert_fields_equal(expected_fields, actual_schema)
def test_report_no_conflicts(self):
    """Checks the report when each header id has a single definition."""
    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    header_definitions._infos = {
        'NS': {Definition(1, 'Float'): ['file1']},
    }
    header_definitions._formats = {
        'NS': {Definition(1, 'Float'): ['file2']},
    }

    resolved_infos = OrderedDict()
    resolved_infos['NS'] = Info(
        'NS', 1, 'Integer', 'Number samples', None, None)
    resolved_formats = OrderedDict()
    resolved_formats['NS'] = Format('NS', 1, 'Float', 'Number samples')
    resolved_headers = VcfHeader(
        infos=resolved_infos, formats=resolved_formats)

    expected = ['No Header Conflicts Found.\n', '\n']
    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers)
def test_infer_info_fields_combined_conflicts(self):
    """Expects both an undefined field (`IF`) and a corrected field (`IA`)."""
    variant = self._get_sample_variant_info_ia_cardinality_mismatch()
    defined_infos = {
        'IS': Info('IS', 1, 'String', '', '', ''),
        'ISI': Info('ISI', 1, 'Integer', '', '', ''),
        'ISF': Info('ISF', 1, 'Float', '', '', ''),
        'IB': Info('IB', 0, 'Flag', '', '', ''),
        'IA': Info('IA', -1, 'Integer', '', '', ''),
    }
    infer_header_fields = infer_headers._InferHeaderFields()
    defined_header = vcf_header_io.VcfHeader(infos=defined_infos)

    inferred_infos = infer_header_fields._infer_info_fields(
        variant, defined_header)

    expected_infos = {
        'IF': Info('IF', 1, 'Float', '', '', ''),
        'IA': Info('IA', None, 'Float', '', '', ''),
    }
    self.assertEqual(expected_infos, inferred_infos)
def test_infer_mismatched_info_field_correct_num(self):
    """Expects the count of `IA` to be corrected to `None`."""
    variant = self._get_sample_variant_info_ia_cardinality_mismatch()
    defined_infos = {
        'IS': Info('IS', 1, 'String', '', '', ''),
        'ISI': Info('ISI', 1, 'Integer', '', '', ''),
        'ISF': Info('ISF', 1, 'Float', '', '', ''),
        'IF': Info('IF', 1, 'Float', '', '', ''),
        'IB': Info('IB', 0, 'Flag', '', '', ''),
        'IA': Info('IA', -1, 'Float', '', '', ''),
    }
    infer_header_fields = infer_headers._InferHeaderFields()

    ia_value = variant.info.get('IA')
    ia_definition = vcf_header_io.VcfHeader(
        infos=defined_infos).infos.get('IA')
    corrected_info = infer_header_fields._infer_mismatched_info_field(
        'IA', ia_value, ia_definition, len(variant.alternate_bases))

    expected = Info('IA', None, 'Float', '', '', '')
    self.assertEqual(expected, corrected_info)
def test_infer_info_fields_no_conflicts(self):
    """Expects no inferred info fields for the first sample variant."""
    variant = self._get_sample_variant_1()
    defined_infos = {
        'IS': Info('IS', 1, 'String', '', '', ''),
        'ISI': Info('ISI', 1, 'Integer', '', '', ''),
        'ISF': Info('ISF', 1, 'Float', '', '', ''),
        'IF': Info('IF', 1, 'Float', '', '', ''),
        'IB': Info('IB', 0, 'Flag', '', '', ''),
        'IA': Info('IA', -1, 'Float', '', '', ''),
    }
    infer_header_fields = infer_headers._InferHeaderFields()
    defined_header = vcf_header_io.VcfHeader(infos=defined_infos)

    inferred_infos = infer_header_fields._infer_info_fields(
        variant, defined_header)

    self.assertEqual({}, inferred_infos)
def _infer_info_fields(self, variant, defined_headers):
    # type: (vcfio.Variant, vcf_header_io.VcfHeader) -> Dict[str, Info]
    """Infers info field definitions from the values of `variant`.

    Two kinds of info fields are inferred:
    - Info fields that are not defined in the headers.
    - Info fields whose header definition does not match the field value.

    A warning is logged for every field that ends up in the result.

    Args:
      variant: variant obj.
      defined_headers: header fields defined in header section of VCF files.
        When falsy, every info field is treated as undefined.

    Returns:
      A dict of (info_key(str), :class:`Info`) for any info field in
      `variant` that is not defined in the header or the definition
      mismatches the field values.

    Raises:
      ValueError: if an undefined info key has already been collected for
        this variant.
    """
    inferred = {}
    for key, value in variant.info.iteritems():
        is_defined = bool(defined_headers) and key in defined_headers.infos
        if is_defined:
            header_definition = defined_headers.infos.get(key)
            corrected_info = self._infer_mismatched_info_field(
                key, value, header_definition, len(variant.alternate_bases))
            if corrected_info:
                logging.warning(
                    'Incorrect INFO field "%s". Defined as "type=%s,num=%s", '
                    'got "%s", in variant "%s"',
                    key,
                    header_definition.get(_HeaderKeyConstants.TYPE),
                    str(header_definition.get(_HeaderKeyConstants.NUM)),
                    str(value),
                    str(variant))
                inferred[key] = corrected_info
            continue

        # The field is missing from the headers: build its definition from
        # the value alone.
        if key in inferred:
            raise ValueError(
                'Duplicate INFO field "{}" in variant "{}"'.format(
                    key, variant))
        logging.warning(
            'Undefined INFO field "%s" in variant "%s"', key, str(variant))
        inferred[key] = Info(
            key,
            self._get_field_count(value),
            self._get_field_type(value),
            '',  # NO_DESCRIPTION
            '',  # UNKNOWN_SOURCE
            '')  # UNKNOWN_VERSION
    return inferred
def test_infer_mismatched_info_field_no_mismatches(self):
    """Expects no correction for `IA` with the given definition and value."""
    variant = self._get_sample_variant_info_ia_float_2_0_in_list()
    defined_infos = {
        'IS': Info('IS', 1, 'String', '', '', ''),
        'ISI': Info('ISI', 1, 'Integer', '', '', ''),
        'ISF': Info('ISF', 1, 'Float', '', '', ''),
        'IF': Info('IF', 1, 'Float', '', '', ''),
        'IB': Info('IB', 0, 'Flag', '', '', ''),
        'IA': Info('IA', 'A', 'Integer', '', '', ''),
    }
    infer_header_fields = infer_headers._InferHeaderFields()

    ia_value = variant.info.get('IA')
    ia_definition = vcf_header_io.VcfHeader(
        infos=defined_infos).infos.get('IA')
    corrected_info = infer_header_fields._infer_mismatched_info_field(
        'IA', ia_value, ia_definition, len(variant.alternate_bases))

    self.assertEqual(None, corrected_info)
def test_defined_fields_filtered_two_variants(self):
    """Expects only fields absent from the side-input header to be inferred."""
    # Only INFO and FORMAT in the first variants are already defined in the
    # header section of the VCF files.
    with TestPipeline() as pipeline:
        vcf_headers = self._get_sample_header_fields()
        vcf_headers_side_input = (
            pipeline | 'vcf_header' >> Create([vcf_headers]))
        first_variant = self._get_sample_variant_1()
        second_variant = self._get_sample_variant_2()

        variants = pipeline | Create([first_variant, second_variant])
        infer_transform = infer_undefined_headers.InferUndefinedHeaderFields(
            pvalue.AsSingleton(vcf_headers_side_input))
        inferred_headers = (
            variants | 'InferUndefinedHeaderFields' >> infer_transform)

        expected_header = vcf_header_io.VcfHeader(
            infos={'IS_2': Info('IS_2', 1, 'String', '', '', '')},
            formats={'FI_2': Format('FI_2', 1, 'String', '')})
        assert_that(inferred_headers, equal_to([expected_header]))
        pipeline.run()
def _infer_mismatched_info_field(
        field_key,  # type: str
        field_value,  # type: Any
        defined_header,  # type: Dict
        num_alternate_bases  # type: int
):
    # type: (...) -> Optional[Info]
    """Builds a corrected `Info` when the value disagrees with its definition.

    Two mismatches are handled:
    - Defined num is `A`, but the provided values do not have the same
      cardinality as the alternate bases. The num is replaced by
      `field_counts['.']`.
    - The type returned by `_get_corrected_type` for the value differs from
      the defined type (e.g. defined as `Integer`, but the provided value is
      float, giving `Float`).

    Args:
      field_key: the info field key.
      field_value: the value of the field key given in the variant.
      defined_header: The definition of `field_key` in the header.
      num_alternate_bases: number of the alternate bases.

    Returns:
      Corrected info definition if there are mismatches, otherwise `None`.
      Description, source and version are copied from `defined_header`.
    """
    defined_num = defined_header.get(_HeaderKeyConstants.NUM)
    num = defined_num
    per_alternate_allele = field_counts[_FIELD_COUNT_ALTERNATE_ALLELE]
    if num == per_alternate_allele and len(field_value) != num_alternate_bases:
        num = field_counts['.']

    defined_type = defined_header.get(_HeaderKeyConstants.TYPE)
    field_type = _get_corrected_type(defined_type, field_value)

    type_changed = field_type != defined_type
    if not (type_changed or num != defined_num):
        return None
    return Info(
        field_key,
        num,
        field_type,
        defined_header.get(_HeaderKeyConstants.DESC),
        defined_header.get(_HeaderKeyConstants.SOURCE),
        defined_header.get(_HeaderKeyConstants.VERSION))