def _add_format_fields(schema, formats, allow_incompatible_schema=False):
    # type: (bigquery.TableFieldSchema, Dict[str, _Format], bool) -> None
    """Adds a FORMAT header definition to `formats` for each field in `schema`.

    Fields whose name is in `_CONSTANT_CALL_FIELDS` are skipped. For every
    other field, a definition built with `vcf_header_io.CreateFormatField` is
    stored in `formats` under the field name, replacing any existing entry.

    Args:
      schema: BigQuery schema record whose `fields` are converted.
      formats: dict of field name to format definition; updated in place.
      allow_incompatible_schema: when False, a field whose name is a reserved
        VCF FORMAT key (`vcf_reserved_fields.FORMAT_FIELDS`) is checked with
        `_validate_reserved_field` and takes its number and type from the
        reserved definition. When True, reserved names are treated like any
        other field and number/type are derived from the BigQuery mode/type.
    """
    reserved_fields = vcf_reserved_fields.FORMAT_FIELDS
    for field in schema.fields:
        if field.name in _CONSTANT_CALL_FIELDS:
            continue
        # Test membership on the mapping itself rather than on a list copy of
        # its keys rebuilt for every field.
        if not allow_incompatible_schema and field.name in reserved_fields:
            reserved_definition = reserved_fields[field.name]
            # NOTE(review): presumably raises when the field is incompatible
            # with the reserved definition -- confirm in the helper.
            _validate_reserved_field(field, reserved_definition)
            formats[field.name] = vcf_header_io.CreateFormatField(
                field.name,
                reserved_definition.num,
                reserved_definition.type,
                # Fall back to the reserved description when the schema field
                # carries none.
                _remove_special_characters(
                    field.description or reserved_definition.desc))
        else:
            formats[field.name] = vcf_header_io.CreateFormatField(
                field.name,
                bigquery_util.get_vcf_num_from_bigquery_schema(
                    field.mode, field.type),
                bigquery_util.get_vcf_type_from_bigquery_type(field.type),
                _remove_special_characters(field.description))
Ejemplo n.º 2
0
def infer_format_fields(
        variant,  # type: vcfio.Variant
        defined_headers  # type: vcf_header_io.VcfHeader
):
    # type: (...) -> Dict[str, vcf_header_io.VcfHeaderFormatField]
    """Returns inferred format fields.

    Two types of format fields are inferred:
    - The format fields are undefined in the headers.
    - The format definition provided by the headers does not match the field
      values.

    Once a key has been inferred or corrected, later calls of the same variant
    are compared against that new definition rather than the header's.

    Args:
      variant: variant object
      defined_headers: header fields defined in header section of VCF files.
        May be None or carry no formats, in which case every format key found
        in the variant is treated as undefined.

    Returns:
      A dict of (format_key, `Format`) for any format key in
      `variant` that is not defined in the header or the definition mismatches
      the field values.
    """
    # Working copy of the header definitions; extended/overridden while
    # scanning so each key is compared against its latest definition.
    formats = {}
    if defined_headers and defined_headers.formats:
        for format_key, format_value in defined_headers.formats.items():
            formats[format_key] = vcf_header_io.CreateFormatField(
                format_key,
                format_value[_HeaderKeyConstants.NUM],
                format_value[_HeaderKeyConstants.TYPE],
                format_value[_HeaderKeyConstants.DESC],
            )
    updated_formats = {}
    for call in variant.calls:
        # `items()` exists on both Python 2 and Python 3 mappings, whereas
        # `iteritems()` raises AttributeError on Python 3 dicts.
        for format_key, format_value in call.info.items():
            if format_key not in formats:
                logging.warning('Undefined FORMAT field "%s" in variant "%s"',
                                format_key, str(variant))
                formats[format_key] = vcf_header_io.CreateFormatField(
                    format_key, _get_field_count(format_value),
                    _get_field_type(format_value))
                updated_formats[format_key] = formats[format_key]
            else:
                defined_header = formats[format_key]
                corrected_format = _infer_mismatched_format_field(
                    format_key, format_value, defined_header)
                if corrected_format:
                    logging.warning(
                        'Adjusting FORMAT field "%s". Defined as "type=%s,num=%s", '
                        'got "%s" in variant "%s"', format_key,
                        defined_header.record[_PysamHeaderKeyConstants.TYPE],
                        str(defined_header.record[
                            _PysamHeaderKeyConstants.NUM]), str(format_value),
                        str(variant))
                    formats[format_key] = corrected_format
                    updated_formats[format_key] = formats[format_key]

    return updated_formats
def _infer_mismatched_format_field(
    field_key,  # type: str
    field_value,  # type: Any
    defined_header  # type: vcf_header_io.VcfHeaderFormatField
    ):
  # type: (...) -> Optional[vcf_header_io.VcfHeaderFormatField]
  """Returns a corrected format definition when the value mismatches it.

  The type correction itself is delegated to `_get_corrected_type`. The one
  mismatch handled is:
  - Defined type is `Integer`, but the provided value is float. The type is
    corrected to `Float`.

  Args:
    field_key: the format field key.
    field_value: the value of the field key given in the variant.
    defined_header: the current definition of `field_key`; its `record` is
      read for the declared type, number and description.

  Returns:
    A new format definition carrying the corrected type (number and
    description are copied from `defined_header`), or None when the declared
    type already fits `field_value`.
  """
  header_record = defined_header.record
  declared_type = header_record[_PysamHeaderKeyConstants.TYPE]
  inferred_type = _get_corrected_type(declared_type, field_value)
  type_changed = inferred_type != declared_type
  if not type_changed:
    return None
  # Only the type is replaced; everything else is carried over unchanged.
  return vcf_header_io.CreateFormatField(
      field_key,
      header_record[_PysamHeaderKeyConstants.NUM],
      inferred_type,
      header_record[_PysamHeaderKeyConstants.DESC])