def _add_info_fields(field, infos, allow_incompatible_schema=False):
    # type: (bigquery.TableFieldSchema, Dict[str, _Info], bool) -> None
    """Adds the INFO header definition(s) derived from `field` to `infos`.

    Three cases are handled:
    - `field` is the alternate bases record: its nested fields are handled by
      `_add_info_fields_from_alternate_bases`.
    - `field` is a reserved INFO field (a key of
      `vcf_reserved_fields.INFO_FIELDS`) and `allow_incompatible_schema` is
      False: the field is passed to `_validate_reserved_field` and the reserved
      num and type are used. The reserved description is used only when the
      field has no description of its own.
    - Otherwise: num and type are derived from the BigQuery mode and type of
      the field.

    Args:
      field: the BigQuery schema field to convert.
      infos: dict of (info key, info definition); updated in place.
      allow_incompatible_schema: if True, reserved fields are not validated
        and are converted like any other field.
    """
    if field.name == bigquery_util.ColumnKeyConstants.ALTERNATE_BASES:
        _add_info_fields_from_alternate_bases(field, infos,
                                              allow_incompatible_schema)
        return

    # Membership is tested on the mapping itself rather than on a list of its
    # keys rebuilt for every call.
    reserved_fields = vcf_reserved_fields.INFO_FIELDS
    if not allow_incompatible_schema and field.name in reserved_fields:
        reserved_definition = reserved_fields[field.name]
        _validate_reserved_field(field, reserved_definition)
        infos[field.name] = vcf_header_io.CreateInfoField(
            field.name, reserved_definition.num, reserved_definition.type,
            _remove_special_characters(field.description
                                       or reserved_definition.desc))
    else:
        infos[field.name] = vcf_header_io.CreateInfoField(
            field.name,
            bigquery_util.get_vcf_num_from_bigquery_schema(
                field.mode, field.type),
            bigquery_util.get_vcf_type_from_bigquery_type(field.type),
            _remove_special_characters(field.description))
def _add_info_fields_from_alternate_bases(schema,
                                          infos,
                                          allow_incompatible_schema=False):
    # type: (bigquery.TableFieldSchema, Dict[str, _Info], bool) -> None
    """Adds schema nested fields in alternate bases to `infos`.

    Notice that the validation of field mode is skipped for reserved fields
    since the mode (NULLABLE) of field in alternate bases is expected to be
    different from the mode (REPEATED) in reserved field definition.

    Any `Record` field within alternate bases is considered as an annotation
    field.

    Args:
      schema: the alternate bases record; its nested `fields` are converted.
      infos: dict of (info key, info definition); updated in place.
      allow_incompatible_schema: if True, reserved fields are not validated
        and are converted like any other non-record field.
    """
    # Fetched once; membership is tested on the mapping itself rather than on
    # a list of its keys rebuilt for every nested field.
    reserved_fields = vcf_reserved_fields.INFO_FIELDS
    for field in schema.fields:
        if field.name in _CONSTANT_ALTERNATE_BASES_FIELDS:
            continue
        if field.type == bigquery_util.TableFieldConstants.TYPE_RECORD:
            # Annotation field: exposed as a string with a missing count.
            infos[field.name] = vcf_header_io.CreateInfoField(
                field.name, vcfio.MISSING_FIELD_VALUE,
                bigquery_util._VcfHeaderTypeConstants.STRING,
                _remove_special_characters(
                    _get_annotation_description(field)))
        elif not allow_incompatible_schema and field.name in reserved_fields:
            reserved_definition = reserved_fields[field.name]
            _validate_reserved_field_type(field, reserved_definition)
            # The reserved description is used only when the field has none.
            infos[field.name] = vcf_header_io.CreateInfoField(
                field.name, reserved_definition.num,
                reserved_definition.type,
                _remove_special_characters(field.description
                                           or reserved_definition.desc))
        else:
            infos[field.name] = vcf_header_io.CreateInfoField(
                field.name, vcf_parser.FIELD_COUNT_ALTERNATE_ALLELE,
                bigquery_util.get_vcf_type_from_bigquery_type(field.type),
                _remove_special_characters(field.description))
def make_header(header_num_dict):
    # type: (Dict[str, str]) -> VcfHeader
    """Creates a VcfHeader whose infos come from `header_num_dict`.

    Every key becomes an info field with type '.' and an empty description.

    Args:
      header_num_dict: a dictionary mapping info keys to string num values.
        A value listed in `vcf_header_io.HEADER_SPECIAL_NUMBERS` is kept as
        is; any other value is converted with `int`.
    """
    def _to_num(raw_num):
        # Special numbers stay as given; everything else must be an integer.
        if raw_num in vcf_header_io.HEADER_SPECIAL_NUMBERS:
            return raw_num
        return int(raw_num)

    info_fields = {
        key: vcf_header_io.CreateInfoField(key, _to_num(raw_num), '.', '')
        for key, raw_num in header_num_dict.items()
    }
    return vcf_header_io.VcfHeader(infos=info_fields)
def _infer_non_annotation_info_fields(
    variant,  # type: vcfio.Variant
    infos,  # type: Dict[str, vcf_header_io.VcfHeaderInfoField]
    defined_headers  # type: vcf_header_io.VcfHeader
    ):
  # type: (...) -> None
  """Infers info field definitions from `variant` and stores them in `infos`.

  A definition is added to `infos` in two situations:
  - The info field is not defined in the headers (or no headers are given):
    its count and type are inferred from the value.
  - The info field is defined, but `_infer_mismatched_info_field` returns a
    corrected definition for the value.

  A warning is logged for every definition that is added.

  Args:
    variant: variant object whose `info` items are inspected.
    infos: dict of (info_key, `Info`), updated in place with any info field in
      `variant` that is not defined in the header or whose definition
      mismatches the field values.
    defined_headers: header fields defined in header section of VCF files.

  Raises:
    ValueError: if an undefined info field is already present in `infos`.
  """
  for key, value in variant.info.items():
    if defined_headers and key in defined_headers.infos:
      # Known field: only record it when the header definition needs fixing.
      header_definition = defined_headers.infos.get(key)
      fixed_definition = _infer_mismatched_info_field(
          key, value, header_definition, len(variant.alternate_bases))
      if fixed_definition:
        logging.warning(
            'Incorrect INFO field "%s". Defined as "type=%s,num=%s", '
            'got "%s", in variant "%s"',
            key, header_definition.get(_HeaderKeyConstants.TYPE),
            str(header_definition.get(_HeaderKeyConstants.NUM)),
            str(value), str(variant))
        infos[key] = fixed_definition
      continue

    # Unknown field: it must not have been inferred already.
    if key in infos:
      message = 'Duplicate INFO field "{}" in variant "{}"'.format(
          key, variant)
      raise ValueError(message)
    logging.warning('Undefined INFO field "%s" in variant "%s"',
                    key, str(variant))
    infos[key] = vcf_header_io.CreateInfoField(
        key, _get_field_count(value), _get_field_type(value))
def _infer_mismatched_info_field(field_key,  # type: str
                                 field_value,  # type: Any
                                 defined_header,  # type: Dict
                                 num_alternate_bases  # type: int
                                ):
  # type: (...) -> Optional[vcf_header_io.VcfHeaderInfoField]
  """Builds a corrected info definition when the value disagrees with it.

  Two kinds of mismatch are corrected:
  - The defined num is `A`, but the number of provided values differs from the
    number of alternate bases. The num becomes `.`.
  - The defined type does not fit the value according to
    `_get_corrected_type` (e.g. defined as `Integer`, but the value is float,
    so the type becomes `Float`).

  Args:
    field_key: the info field key.
    field_value: the value of the field key given in the variant.
    defined_header: The definition of `field_key` in the header.
    num_alternate_bases: number of the alternate bases.

  Returns:
    The corrected info definition, keeping the defined description, source and
    version, or None when neither num nor type had to change.
  """
  defined_num = defined_header.get(_HeaderKeyConstants.NUM)
  num = defined_num
  if num == _FIELD_COUNT_ALTERNATE_ALLELE:
    if len(field_value) != num_alternate_bases:
      num = '.'

  defined_type = defined_header.get(_HeaderKeyConstants.TYPE)
  value_type = _get_corrected_type(defined_type, field_value)

  has_mismatch = value_type != defined_type or num != defined_num
  if not has_mismatch:
    return None

  description = defined_header.get(_HeaderKeyConstants.DESC)
  source = defined_header.get(_HeaderKeyConstants.SOURCE)
  version = defined_header.get(_HeaderKeyConstants.VERSION)
  return vcf_header_io.CreateInfoField(
      field_key, num, value_type, description, source, version)
# Example #6
# 0
def _infer_annotation_type_info_fields(
    variant,  # type: vcfio.Variant
    infos,  # type: Dict[str, vcf_header_io.VcfHeaderInfoField]
    defined_headers,  # type: vcf_header_io.VcfHeader
    annotation_fields_to_infer  # type: List[str]
):
    # type: (...) -> None
    """Adds inferred annotation type info fields to `infos`.

    For every annotation field present in the variant, each annotation header
    listed in the field's description gets its own Info header line. The ID is
    built from the annotation field and the header name, the count is 1 and
    the TYPE is inferred from the values. A variant may carry several values
    for one annotation header, so their types are merged into a single type
    before the info field is created.

    Args:
      variant: variant object
      infos: dict of (info_key, `Info`), updated in place with the inferred
        annotation type fields.
      defined_headers: header fields defined in header section of VCF files.
      annotation_fields_to_infer: list of info fields treated as annotation
        fields (e.g. ['CSQ', 'CSQ_VT']).

    Raises:
      ValueError: if the annotation names and the annotation value lists of a
        field do not all have the same length.
    """
    resolver = vcf_field_conflict_resolver.FieldConflictResolver(
        resolve_always=True)

    def _check_annotation_lists_lengths(names, values):
        lengths = {len(value_list) for value_list in values}
        lengths.add(len(names))
        if len(lengths) == 1:
            return
        error = (
            'Annotation lists have inconsistent lengths: {}.\nnames={}\n'
            'values={}').format(lengths, names, values)
        raise ValueError(error)

    def _merge_value_types(values):
        merged_type = '.'
        for value in values:
            if value:
                merged_type = resolver.resolve_attribute_conflict(
                    _HeaderKeyConstants.TYPE, merged_type,
                    _get_field_type(value))
                # Nothing can change the merged type once it is a string.
                if merged_type == _HeaderTypeConstants.STRING:
                    break
        return merged_type

    for field in annotation_fields_to_infer:
        if field not in variant.info:
            continue
        description = defined_headers.infos[field][_HeaderKeyConstants.DESC]
        annotation_names = annotation_parser.extract_annotation_names(
            description)
        # First element (ALT) is ignored, since its type is hard-coded as string
        rows = []
        for annotation in variant.info[field]:
            parsed = annotation_parser.extract_annotation_list_with_alt(
                annotation)
            rows.append(parsed[1:])
        _check_annotation_lists_lengths(annotation_names, rows)
        # Transpose so that each entry holds all values of one annotation name.
        columns = zip(*rows)
        for name, column in zip(annotation_names, columns):
            inferred_type = _merge_value_types(column)
            key_id = get_inferred_annotation_type_header_key(field, name)
            infos[key_id] = vcf_header_io.CreateInfoField(
                key_id,
                1,  # field count
                inferred_type,
                'Inferred type field for annotation {}.'.format(name))