def _add_info_fields(field, infos, allow_incompatible_schema=False):
    # type: (bigquery.TableFieldSchema, Dict[str, _Info], bool) -> None
    """Adds the INFO header definition(s) derived from `field` to `infos`.

    Three cases are handled:
      - `field` is the alternate bases column: its nested fields are added
        through `_add_info_fields_from_alternate_bases`.
      - `field.name` is a key of `vcf_reserved_fields.INFO_FIELDS` and
        `allow_incompatible_schema` is False: the field is passed to
        `_validate_reserved_field` and the reserved num and type are used. The
        reserved description is the fallback when the field has none.
      - Anything else: num and type are derived from the BigQuery mode and type.

    Args:
      field: the BigQuery schema field to convert.
      infos: dict of (info_key, `Info`); updated in place.
      allow_incompatible_schema: if True, reserved fields are not validated and
        are converted like any other field.
    """
    if field.name == bigquery_util.ColumnKeyConstants.ALTERNATE_BASES:
        _add_info_fields_from_alternate_bases(
            field, infos, allow_incompatible_schema)
        return

    # Membership is tested on the mapping itself; materializing the key list
    # first would turn every check into a linear scan.
    reserved_fields = vcf_reserved_fields.INFO_FIELDS
    if field.name in reserved_fields and not allow_incompatible_schema:
        reserved_definition = reserved_fields[field.name]
        _validate_reserved_field(field, reserved_definition)
        num = reserved_definition.num
        field_type = reserved_definition.type
        description = field.description or reserved_definition.desc
    else:
        num = bigquery_util.get_vcf_num_from_bigquery_schema(
            field.mode, field.type)
        field_type = bigquery_util.get_vcf_type_from_bigquery_type(field.type)
        description = field.description

    infos[field.name] = vcf_header_io.CreateInfoField(
        field.name, num, field_type, _remove_special_characters(description))
def _add_info_fields_from_alternate_bases(schema,
                                          infos,
                                          allow_incompatible_schema=False):
    # type: (bigquery.TableFieldSchema, Dict[str, _Info], bool) -> None
    """Adds schema nested fields in alternate bases to `infos`.

    Notice that the validation of field mode is skipped for reserved fields
    since the mode (NULLABLE) of field in alternate bases is expected to be
    different from the mode (REPEATED) in reserved field definition.

    Any `Record` field within alternate bases is considered as an annotation
    field.

    Args:
      schema: the alternate bases schema field whose nested `fields` are
        converted.
      infos: dict of (info_key, `Info`); updated in place.
      allow_incompatible_schema: if True, reserved fields are not validated
        against their reserved definition and are converted like regular
        fields.
    """
    # Looked up once: the mapping does not change while iterating, and testing
    # membership on it directly avoids rebuilding a key list per field.
    reserved_fields = vcf_reserved_fields.INFO_FIELDS

    for field in schema.fields:
        if field.name in _CONSTANT_ALTERNATE_BASES_FIELDS:
            continue

        if field.type == bigquery_util.TableFieldConstants.TYPE_RECORD:
            # Record fields are treated as annotation fields.
            num = vcfio.MISSING_FIELD_VALUE
            field_type = bigquery_util._VcfHeaderTypeConstants.STRING
            description = _get_annotation_description(field)
        elif field.name in reserved_fields and not allow_incompatible_schema:
            reserved_definition = reserved_fields[field.name]
            # Only the type is validated here; see the note on mode above.
            _validate_reserved_field_type(field, reserved_definition)
            num = reserved_definition.num
            field_type = reserved_definition.type
            description = field.description or reserved_definition.desc
        else:
            num = vcf_parser.FIELD_COUNT_ALTERNATE_ALLELE
            field_type = bigquery_util.get_vcf_type_from_bigquery_type(
                field.type)
            description = field.description

        infos[field.name] = vcf_header_io.CreateInfoField(
            field.name,
            num,
            field_type,
            _remove_special_characters(description))
def make_header(header_num_dict):
    # type: (Dict[str, str]) -> VcfHeader
    """Creates a VcfHeader with one INFO field per entry of `header_num_dict`.

    A num value found in `vcf_header_io.HEADER_SPECIAL_NUMBERS` is used as is;
    any other value is converted with `int`. Every INFO field is created with
    type '.' and an empty description.

    Args:
      header_num_dict: a dictionary mapping info keys to string num values.

    Returns:
      A `VcfHeader` whose `infos` are built from `header_num_dict`.
    """

    def _as_num(raw_num):
        # Special numbers stay as given; everything else must parse as an int.
        if raw_num in vcf_header_io.HEADER_SPECIAL_NUMBERS:
            return raw_num
        return int(raw_num)

    info_fields = {
        info_key: vcf_header_io.CreateInfoField(
            info_key, _as_num(raw_num), '.', '')
        for info_key, raw_num in header_num_dict.items()
    }
    return vcf_header_io.VcfHeader(infos=info_fields)
def _infer_non_annotation_info_fields(
    variant,  # type: vcfio.Variant
    infos,  # type: Dict[str, vcf_header_io.VcfHeaderInfoField]
    defined_headers  # type: vcf_header_io.VcfHeader
):
    # type: (...) -> None
    """Adds inferred definitions for the INFO fields of `variant` to `infos`.

    A definition is inferred in two situations:
      - The INFO field has no definition in `defined_headers` (or no headers
        were given at all): count and type are inferred from the value.
      - The INFO field is defined, but `_infer_mismatched_info_field` returns
        a corrected definition for the value found in the variant.

    A warning is logged for every definition that gets added.

    Args:
      variant: variant object
      infos: dict of (info_key, `Info`) for any info field in `variant` that is
        not defined in the header or the definition mismatches the field
        values. Updated in place.
      defined_headers: header fields defined in header section of VCF files.

    Raises:
      ValueError: if an undefined INFO field of `variant` is already a key of
        `infos`.
    """
    for key, value in variant.info.items():
        has_definition = bool(defined_headers) and key in defined_headers.infos

        if has_definition:
            header_definition = defined_headers.infos.get(key)
            correction = _infer_mismatched_info_field(
                key,
                value,
                header_definition,
                len(variant.alternate_bases))
            if correction:
                logging.warning(
                    'Incorrect INFO field "%s". Defined as "type=%s,num=%s", '
                    'got "%s", in variant "%s"',
                    key,
                    header_definition.get(_HeaderKeyConstants.TYPE),
                    str(header_definition.get(_HeaderKeyConstants.NUM)),
                    str(value),
                    str(variant))
                infos[key] = correction
            continue

        # From here on the field is undefined in the headers.
        if key in infos:
            raise ValueError(
                'Duplicate INFO field "{}" in variant "{}"'.format(
                    key, variant))
        logging.warning('Undefined INFO field "%s" in variant "%s"',
                        key, str(variant))
        inferred_count = _get_field_count(value)
        inferred_type = _get_field_type(value)
        infos[key] = vcf_header_io.CreateInfoField(
            key, inferred_count, inferred_type)
def _infer_mismatched_info_field(
    field_key,  # type: str
    field_value,  # type: Any
    defined_header,  # type: Dict
    num_alternate_bases  # type: int
):
    # type: (...) -> Optional[vcf_header_io.VcfHeaderInfoField]
    """Builds a corrected INFO definition when the value contradicts the header.

    Two kinds of mismatch are corrected:
      - The header declares num `A`, but the number of provided values differs
        from the number of alternate bases. The num becomes `.`.
      - The type returned by `_get_corrected_type` for the value differs from
        the declared type (e.g. declared `Integer`, value is a float, which is
        corrected to `Float`).

    Description, source and version are copied from the header definition.

    Args:
      field_key: the info field key.
      field_value: the value of the field key given in the variant.
      defined_header: The definition of `field_key` in the header.
      num_alternate_bases: number of the alternate bases.

    Returns:
      The corrected info definition if num or type had to change, else None.
    """
    declared_num = defined_header.get(_HeaderKeyConstants.NUM)
    declared_type = defined_header.get(_HeaderKeyConstants.TYPE)

    # NOTE(review): `len(field_value)` presumes a sized value whenever the
    # header declares num `A` -- confirm against the parser's output.
    cardinality_mismatch = (
        declared_num == _FIELD_COUNT_ALTERNATE_ALLELE and
        len(field_value) != num_alternate_bases)
    new_num = '.' if cardinality_mismatch else declared_num
    new_type = _get_corrected_type(declared_type, field_value)

    needs_correction = (new_type != declared_type or new_num != declared_num)
    if not needs_correction:
        return None

    return vcf_header_io.CreateInfoField(
        field_key,
        new_num,
        new_type,
        defined_header.get(_HeaderKeyConstants.DESC),
        defined_header.get(_HeaderKeyConstants.SOURCE),
        defined_header.get(_HeaderKeyConstants.VERSION))
def _infer_annotation_type_info_fields(
    variant,  # type: vcfio.Variant
    infos,  # type: Dict[str, vcf_header_io.VcfHeaderInfoField]
    defined_headers,  # type: vcf_header_io.VcfHeader
    annotation_fields_to_infer  # type: List[str]
):
    # type: (...) -> None
    """Adds inferred-type INFO definitions for annotation fields to `infos`.

    For every annotation field present in the variant, each annotation name
    listed in the field's header description yields one Info header line. Its
    ID is built from the annotation field and the annotation name, its count
    is 1, and its TYPE is inferred from the values found in the variant.

    A variant may carry several values for the same annotation name, so the
    value types are merged per name before the header line is created. Empty
    values are ignored, and merging stops once the type is string.

    Args:
      variant: variant object
      infos: dict of (info_key, `Info`) for any info field in `variant` that is
        not defined in the header or the definition mismatches the field
        values. Updated in place.
      defined_headers: header fields defined in header section of VCF files.
        Each annotation field present in the variant is looked up in its
        `infos` by subscript.
      annotation_fields_to_infer: list of info fields treated as annotation
        fields (e.g. ['CSQ', 'CSQ_VT']).

    Raises:
      ValueError: if the annotation names and the annotation value lists of a
        field do not all have the same length.
    """
    type_resolver = vcf_field_conflict_resolver.FieldConflictResolver(
        resolve_always=True)

    def _require_equal_lengths(names, values):
        # All value lists and the name list must share one single length.
        distinct_lengths = {len(value_list) for value_list in values}
        distinct_lengths.add(len(names))
        if len(distinct_lengths) == 1:
            return
        raise ValueError(
            ('Annotation lists have inconsistent lengths: {}.\nnames={}\n'
             'values={}').format(distinct_lengths, names, values))

    def _merge_value_types(raw_values):
        # Folds the inferred type of each non-empty value into a single type.
        merged_type = '.'
        for raw_value in raw_values:
            if not raw_value:
                continue
            merged_type = type_resolver.resolve_attribute_conflict(
                _HeaderKeyConstants.TYPE,
                merged_type,
                _get_field_type(raw_value))
            if merged_type == _HeaderTypeConstants.STRING:
                # Per the early exit here, string is treated as final.
                break
        return merged_type

    for annotation_field in annotation_fields_to_infer:
        if annotation_field not in variant.info:
            continue

        header_description = (
            defined_headers.infos[annotation_field][_HeaderKeyConstants.DESC])
        names = annotation_parser.extract_annotation_names(header_description)

        # First element (ALT) is ignored, since its type is hard-coded as
        # string.
        values_per_annotation = []
        for annotation in variant.info[annotation_field]:
            with_alt = annotation_parser.extract_annotation_list_with_alt(
                annotation)
            values_per_annotation.append(with_alt[1:])

        _require_equal_lengths(names, values_per_annotation)

        # Transpose: one tuple of values per annotation name.
        values_per_name = zip(*values_per_annotation)
        for name, name_values in zip(names, values_per_name):
            inferred_type = _merge_value_types(name_values)
            header_key = get_inferred_annotation_type_header_key(
                annotation_field, name)
            infos[header_key] = vcf_header_io.CreateInfoField(
                header_key,
                1,  # field count
                inferred_type,
                'Inferred type field for annotation {}.'.format(name))