def _add_format_fields(schema, formats, allow_incompatible_schema=False):
    # type: (bigquery.TableFieldSchema, Dict[str, _Format], bool) -> None
    """Adds a FORMAT header definition to `formats` for each field of `schema`.

    Fields whose name is listed in `_CONSTANT_CALL_FIELDS` are skipped. For
    every other field one entry keyed by the field name is written into
    `formats` (an existing entry with the same key is overwritten):

    - If the name is a key of `vcf_reserved_fields.FORMAT_FIELDS` and
      `allow_incompatible_schema` is False, the field is passed to
      `_validate_reserved_field` together with the reserved definition, and
      the number/type of the reserved definition are used. The description
      is the field's own description, falling back to the reserved one when
      the field has none.
    - Otherwise the number and type are derived from the BigQuery mode/type
      of the field through `bigquery_util`.

    Args:
      schema: object whose `fields` attribute is iterated; each field must
        expose `name`, `description`, `mode` and `type`.
      formats: dict that is updated in place.
      allow_incompatible_schema: when True, reserved fields are neither
        validated nor given the reserved number/type; they are handled like
        any non-reserved field.
    """
    reserved_fields = vcf_reserved_fields.FORMAT_FIELDS
    for field in schema.fields:
        if field.name in _CONSTANT_CALL_FIELDS:
            continue
        # Test membership on the mapping itself rather than on a list copy of
        # its keys, which would be rebuilt and scanned for every field.
        if not allow_incompatible_schema and field.name in reserved_fields:
            reserved_definition = reserved_fields[field.name]
            _validate_reserved_field(field, reserved_definition)
            formats[field.name] = vcf_header_io.CreateFormatField(
                field.name,
                reserved_definition.num,
                reserved_definition.type,
                _remove_special_characters(
                    field.description or reserved_definition.desc))
        else:
            formats[field.name] = vcf_header_io.CreateFormatField(
                field.name,
                bigquery_util.get_vcf_num_from_bigquery_schema(
                    field.mode, field.type),
                bigquery_util.get_vcf_type_from_bigquery_type(field.type),
                _remove_special_characters(field.description))
def infer_format_fields(
        variant,  # type: vcfio.Variant
        defined_headers  # type: vcf_header_io.VcfHeader
):
    # type: (...) -> Dict[str, vcf_header_io.VcfHeaderFormatField]
    """Returns inferred format fields.

    Two types of format fields are inferred:
    - The format fields are undefined in the headers.
    - The format definition provided by the headers does not match the field
      values.

    Every inference is reported with `logging.warning`. Once a key has been
    inferred or corrected, later calls of the same variant are compared
    against that updated definition.

    Args:
      variant: variant object; `variant.calls` is iterated and each call's
        `info` mapping supplies the (format_key, value) pairs.
      defined_headers: header fields defined in header section of VCF files.
        May be None or have empty `formats`.

    Returns:
      A dict of (format_key, `Format`) for any format key in `variant` that is
      not defined in the header or the definition mismatches the field values.
    """
    # Working copy of the known definitions, seeded from the header.
    formats = {}
    if defined_headers and defined_headers.formats:
        for format_key, format_value in defined_headers.formats.items():
            formats[format_key] = vcf_header_io.CreateFormatField(
                format_key,
                format_value[_HeaderKeyConstants.NUM],
                format_value[_HeaderKeyConstants.TYPE],
                format_value[_HeaderKeyConstants.DESC],
            )

    updated_formats = {}
    for call in variant.calls:
        # `.items()` works on both Python 2 and Python 3 mappings, unlike the
        # Python 2-only `.iteritems()`.
        for format_key, format_value in call.info.items():
            if format_key not in formats:
                logging.warning(
                    'Undefined FORMAT field "%s" in variant "%s"',
                    format_key, str(variant))
                formats[format_key] = vcf_header_io.CreateFormatField(
                    format_key,
                    _get_field_count(format_value),
                    _get_field_type(format_value))
                updated_formats[format_key] = formats[format_key]
                continue

            defined_header = formats[format_key]
            corrected_format = _infer_mismatched_format_field(
                format_key, format_value, defined_header)
            if corrected_format:
                logging.warning(
                    'Adjusting FORMAT field "%s". Defined as "type=%s,num=%s", '
                    'got "%s" in variant "%s"',
                    format_key,
                    defined_header.record[_PysamHeaderKeyConstants.TYPE],
                    str(defined_header.record[_PysamHeaderKeyConstants.NUM]),
                    str(format_value),
                    str(variant))
                formats[format_key] = corrected_format
                updated_formats[format_key] = formats[format_key]
    return updated_formats
def _infer_mismatched_format_field(field_key, field_value, defined_header):
    # type: (str, Any, vcf_header_io.VcfHeaderFormatField) -> Optional[vcf_header_io.VcfHeaderFormatField]
    """Builds a corrected format definition when the declared type is wrong.

    The declared type is read from `defined_header.record` and handed, along
    with the observed value, to `_get_corrected_type`. If that helper returns
    a different type, a new format field is created that keeps the declared
    number and description but carries the corrected type.

    The mismatch this is meant to handle is a field declared as `Integer`
    whose value is a float, which is corrected to `Float`; the actual decision
    is delegated to `_get_corrected_type`.

    Args:
      field_key: the format field key.
      field_value: the value of the field key given in the variant.
      defined_header: the definition of `field_key` in the header; its
        `record` mapping is indexed by `_PysamHeaderKeyConstants` keys.

    Returns:
      The corrected format definition, or None when the declared type already
      agrees with the value.
    """
    record = defined_header.record
    declared_type = record[_PysamHeaderKeyConstants.TYPE]
    inferred_type = _get_corrected_type(declared_type, field_value)

    type_mismatch = inferred_type != declared_type
    if not type_mismatch:
        return None

    return vcf_header_io.CreateFormatField(
        field_key,
        record[_PysamHeaderKeyConstants.NUM],
        inferred_type,
        record[_PysamHeaderKeyConstants.DESC])