def _infer_annotation_type_info_fields(self, variant, infos, defined_headers):
    # type: (vcfio.Variant, Dict[str, Info], vcf_header_io.VcfHeader) -> None
    """Adds an inferred-type `Info` entry to `infos` per annotation header.

    For every annotation field listed in `self._annotation_fields_to_infer`
    that is present in `variant.info`, the annotation names are read from the
    field's header description and one `Info` entry is written into `infos`
    for each name. The entry's ID is derived from the (field, name) pair and
    its TYPE is the type inferred from the values seen for that name in this
    variant. Because a variant can carry several annotation lists for one
    field, the types of all non-empty values of a name are merged through a
    conflict resolver before the entry is created.

    Args:
      variant: variant object whose `info` holds the annotation strings.
      infos: dict of (info_key, `Info`); updated in place with the inferred
        annotation type entries.
      defined_headers: header fields defined in header section of VCF files;
        the description of each annotation field is read from its `infos`.

    Raises:
      ValueError: if the annotation lists of a field do not all have the same
        number of elements as the list of annotation names.
    """

    def _check_annotation_lists_lengths(names, values):
        """Raises ValueError unless all `values` lists match `names` in size."""
        lengths = {len(value_list) for value_list in values}
        lengths.add(len(names))
        if len(lengths) == 1:
            return
        raise ValueError(
            ('Annotation lists have inconsistent lengths: {}.\nnames={}\n'
             'values={}').format(lengths, names, values))

    resolver = vcf_field_conflict_resolver.FieldConflictResolver(
        resolve_always=True)
    for field in self._annotation_fields_to_infer:
        if field not in variant.info:
            continue
        header_description = (
            defined_headers.infos[field][_HeaderKeyConstants.DESC])
        annotation_names = annotation_parser.extract_annotation_names(
            header_description)
        # The leading ALT element of every annotation list is dropped: its
        # type is hard-coded as string, so there is nothing to infer for it.
        rows = []
        for annotation in variant.info[field]:
            parsed = annotation_parser.extract_annotation_list_with_alt(
                annotation)
            rows.append(parsed[1:])
        _check_annotation_lists_lengths(annotation_names, rows)

        # Transpose so that each column holds all values of one annotation
        # name across the annotation lists of this variant.
        columns = zip(*rows)
        for name, column in zip(annotation_names, columns):
            merged_type = None
            for value in column:
                if not value:
                    # Empty values carry no type information.
                    continue
                merged_type = resolver.resolve_attribute_conflict(
                    _HeaderKeyConstants.TYPE,
                    merged_type,
                    self._get_field_type(value))
                if merged_type == _HeaderTypeConstants.STRING:
                    # Stop scanning this name once the merged type is string.
                    break
            key_id = get_inferred_annotation_type_header_key(field, name)
            description = (
                'Inferred type field for annotation {}.'.format(name))
            infos[key_id] = Info(
                key_id,
                1,  # field count
                merged_type,
                description,
                '',  # UNKNOWN_SOURCE
                '')  # UNKNOWN_VERSION
def _gen_annotation_name_key_pairs(self, annot_field):
    # type: (str) -> (str, str)
    """Yields (annotation name, inferred-type header key) pairs for a field.

    The annotation names are extracted from the description of `annot_field`
    in `self._header_fields.infos`; each name is paired with the key returned
    by `infer_headers_util.get_inferred_annotation_type_header_key`.

    Args:
      annot_field: name of the annotation INFO field to look up in the header.

    Yields:
      Tuples of (name, type_key), one per annotation name, in the order the
      names are returned by the parser.
    """
    field_header = self._header_fields.infos[annot_field]
    description = field_header[_HeaderKeyConstants.DESC]
    for annotation_name in annotation_parser.extract_annotation_names(
            description):
        inferred_type_key = (
            infer_headers_util.get_inferred_annotation_type_header_key(
                annot_field, annotation_name))
        yield annotation_name, inferred_type_key
def __init__(
        self,
        annotation_fields,  # type: List[str]
        header_fields,  # type: vcf_header_io.VcfHeader
        counter_factory,  # type: metrics_util.CounterFactoryInterface
        use_allele_num,  # type: bool
        minimal_match,  # type: bool
        infer_annotation_types,  # type: bool
):
    # type: (...) -> None
    """Creates an instance for adding annotations to `ProcessedVariant` objects.

    Note this class is intended to be an auxiliary for ProcessedVariantFactory
    and is used for creating annotation related parts of a `ProcessedVariant`
    object. So it is an implementation detail and not part of the public API.

    Args:
      annotation_fields: The list of INFO field names that store variant
        annotations. The format of how annotations are stored and their names
        are extracted from header_fields. May be None or empty, in which case
        no annotation names are registered.
      header_fields: The VCF header information.
      counter_factory: Factory used to create the counters for ALT match,
        ambiguous minimal ALT match, ALT mismatch, missing allele number and
        incorrect allele number.
      use_allele_num: Stored on the instance as `_use_allele_num`; how it is
        used is not visible in this constructor.
      minimal_match: Stored on the instance as `_minimal_match`; how it is
        used is not visible in this constructor.
      infer_annotation_types: If set, then warnings will be provided if header
        fields fail to contain Info type lines for annotation fields.

    Raises:
      ValueError: if any of `annotation_fields` is missing from
        `header_fields.infos`.
    """
    self._use_allele_num = use_allele_num
    self._minimal_match = minimal_match
    self._infer_annotation_types = infer_annotation_types
    self._header_fields = header_fields

    # Map every annotation field to the annotation names listed in its
    # header description; fail early on fields the header does not define.
    self._annotation_names_map = {}  # type: Dict[str, List[str]]
    for annotation_field in annotation_fields or []:
        if annotation_field not in header_fields.infos:
            raise ValueError(
                '{} INFO not found in the header'.format(annotation_field))
        description = (
            header_fields.infos[annotation_field][_HeaderKeyConstants.DESC])
        names = annotation_parser.extract_annotation_names(description)
        self._annotation_names_map[annotation_field] = names

    new_counter = counter_factory.create_counter
    self._alt_match_counter = new_counter(
        _CounterEnum.ANNOTATION_ALT_MATCH.value)
    self._alt_minimal_ambiguous_counter = new_counter(
        _CounterEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value)
    self._alt_mismatch_counter = new_counter(
        _CounterEnum.ANNOTATION_ALT_MISMATCH.value)
    self._allele_num_missing_counter = new_counter(
        _CounterEnum.ALLELE_NUM_MISSING.value)
    self._allele_num_incorrect_counter = new_counter(
        _CounterEnum.ALLELE_NUM_INCORRECT.value)
def test_extract_annotation_names_error(self):
    """Expects a ValueError for a description that uses '-' between names."""
    malformed_description = 'some desc-Consequence-IMPACT-SYMBOL-Gene'
    self.assertRaisesRegexp(
        ValueError,
        'Expected at least one.*',
        annotation_parser.extract_annotation_names,
        malformed_description)
def test_extract_annotation_names(self):
    """Expects the '|'-separated names after the leading text, in order."""
    expected_names = ['Consequence', 'IMPACT', 'SYMBOL', 'Gene']
    parsed_names = annotation_parser.extract_annotation_names(
        'some desc|Consequence|IMPACT|SYMBOL|Gene')
    self.assertEqual(parsed_names, expected_names)
def create_alt_bases_field_schema(self):
    # type: () -> bigquery.TableFieldSchema
    """Builds the repeated `alternate_bases` record for a BigQuery schema.

    The record always contains a nullable string sub-field for the alternate
    base itself. When `self._split_alternate_allele_info_fields` is set, one
    nullable sub-field is added for every header INFO field whose count equals
    the per-alternate-allele field count. Finally, one repeated sub-record is
    added for every annotation field in `self._annotation_field_set`, holding
    the ALT part of the annotation plus one nullable string column per
    annotation name found in the field's header description.

    Returns:
      The `alternate_bases` record, which the caller can add to a schema.

    Raises:
      ValueError: if an annotation field is not present in the header infos.
    """
    column_keys = bigquery_util.ColumnKeyConstants
    field_consts = bigquery_util.TableFieldConstants

    alt_record = bigquery.TableFieldSchema(
        name=column_keys.ALTERNATE_BASES,
        type=field_consts.TYPE_RECORD,
        mode=field_consts.MODE_REPEATED,
        description='One record for each alternate base (if any).')
    alt_base_column = bigquery.TableFieldSchema(
        name=column_keys.ALTERNATE_BASES_ALT,
        type=field_consts.TYPE_STRING,
        mode=field_consts.MODE_NULLABLE,
        description='Alternate base.')
    alt_record.fields.append(alt_base_column)

    if self._split_alternate_allele_info_fields:
        for key, field in self._header_fields.infos.iteritems():
            is_per_alternate = (
                field[_HeaderKeyConstants.NUM] ==
                vcf.parser.field_counts[_FIELD_COUNT_ALTERNATE_ALLELE])
            if not is_per_alternate:
                continue
            info_column = bigquery.TableFieldSchema(
                name=_BigQuerySchemaSanitizer.get_sanitized_field_name(key),
                type=bigquery_util.get_bigquery_type_from_vcf_type(
                    field[_HeaderKeyConstants.TYPE]),
                mode=field_consts.MODE_NULLABLE,
                description=_BigQuerySchemaSanitizer.get_sanitized_string(
                    field[_HeaderKeyConstants.DESC]))
            alt_record.fields.append(info_column)

    for annot_field in self._annotation_field_set:
        if annot_field not in self._header_fields.infos:
            raise ValueError(
                'Annotation field {} not found'.format(annot_field))
        header_description = (
            self._header_fields.infos[annot_field][_HeaderKeyConstants.DESC])
        annotation_names = annotation_parser.extract_annotation_names(
            header_description)
        known_descriptions = descriptions.VEP_DESCRIPTIONS

        annotation_record = bigquery.TableFieldSchema(
            name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                annot_field),
            type=field_consts.TYPE_RECORD,
            mode=field_consts.MODE_REPEATED,
            description='List of {} annotations for this alternate.'.format(
                annot_field))
        annotation_alt_column = bigquery.TableFieldSchema(
            name=annotation_parser.ANNOTATION_ALT,
            type=field_consts.TYPE_STRING,
            mode=field_consts.MODE_NULLABLE,
            description='The ALT part of the annotation field.')
        annotation_record.fields.append(annotation_alt_column)

        for annotation_name in annotation_names:
            # Names without a known description get an empty description.
            name_column = bigquery.TableFieldSchema(
                name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                    annotation_name),
                type=field_consts.TYPE_STRING,
                mode=field_consts.MODE_NULLABLE,
                description=known_descriptions.get(annotation_name, ''))
            annotation_record.fields.append(name_column)
        alt_record.fields.append(annotation_record)
    return alt_record