def examples_to_variants(examples_path, max_records=None):
  """Yields Variant protos from the examples in examples_path.

  This function reads in tf.Examples produced by DeepVariant from examples_path,
  which may contain a sharded spec, sorts them, selects a representive example
  when there are multiple versions representing different alt_alleles, and
  yields the example_variant field from those examples.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Maximum number of records to read, or None, to
      read all of the records.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if we find a Variant in any example that doesn't have genotypes.
  """
  examples = io_utils.read_tfrecords(examples_path, max_records=max_records)
  variants = sorted(
      (tf_utils.example_variant(example) for example in examples),
      key=variant_utils.variant_range_tuple)

  for _, group in itertools.groupby(variants,
                                    variant_utils.variant_range_tuple):
    variant = next(group)
    if not variantcall_utils.has_genotypes(variant_utils.only_call(variant)):
      raise ValueError((
          'Variant {} does not have any genotypes. This tool only works with '
          'variants that have been labeled.').format(
              variant_utils.variant_key(variant)))
    yield variant
def examples_to_variants(examples_path, max_records=None):
    """Yields Variant protos from the examples in examples_path.

  This function reads in tf.Examples produced by DeepVariant from examples_path,
  which may contain a sharded spec, sorts them, selects a representive example
  when there are multiple versions representing different alt_alleles, and
  yields the example_variant field from those examples.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Maximum number of records to read, or None, to
      read all of the records.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if we find a Variant in any example that doesn't have genotypes.
  """
    examples = io_utils.read_tfrecords(examples_path, max_records=max_records)
    variants = sorted(
        (tf_utils.example_variant(example) for example in examples),
        key=variant_utils.variant_range_tuple)

    for _, group in itertools.groupby(variants,
                                      variant_utils.variant_range_tuple):
        variant = next(group)
        if not variantcall_utils.has_genotypes(
                variant_utils.only_call(variant)):
            raise ValueError((
                'Variant {} does not have any genotypes. This tool only works with '
                'variants that have been labeled.').format(
                    variant_utils.variant_key(variant)))
        yield variant
Example #3
0
def is_variant_call(variant,
                    require_non_ref_genotype=True,
                    no_calls_are_variant=False,
                    call_indices=None):
    """Is variant a non-reference call?

  A Variant proto doesn't always imply that there's a variant present in the
  genome. The call may not have alternate bases, may be filtered, may a have
  hom-ref genotype, etc. This function looks for all of those configurations
  and returns true iff the variant is asserting that a mutation is present
  in the same.

  Note that this code allows a variant without a calls field to be variant,
  but one with a genotype call must have a non-reference genotype to be
  considered variant (if require_non_ref_genotype is True, the default). If
  False, a variant that passes all of the site-level requirements for being
  a variant_call will return a True value, regardless of the genotypes, which
  means that we'll consider a site with a sample with a hom-ref or no-call site
  a variant call.

  Args:
    variant: nucleus.genomics.v1.Variant.
    require_non_ref_genotype: Should we require a site with a genotype call to
      have a non-reference (het, hom-var) genotype for the site to be considered
      a variant call?
    no_calls_are_variant: If a site has genotypes, should we consider no_call
      genotypes as being variant or not?
    call_indices: A list of 0-based indices. If specified, only the calls
      at the given indices will be considered. The function will return
      True if any of those calls are variant.

  Returns:
    True if variant is really a mutation call.
  """
    if not variant.alternate_bases:
        return False
    elif is_filtered(variant):
        return False
    elif not variant.calls or not require_non_ref_genotype:
        return True
    # All tests after this point should only look at genotype-based fields, as
    # we may have aborted out in the prev. line due to require_non_ref_genotype.
    else:
        if call_indices is None:
            call_indices = range(len(variant.calls))
        return any(
            any(g > 0 for g in variant.calls[i].genotype) or (
                no_calls_are_variant
                and not variantcall_utils.has_genotypes(variant.calls[i]))
            for i in call_indices)
def _genotype_from_matched_truth(candidate_variant, truth_variant):
    """Gets the diploid genotype for candidate_variant from matched truth_variant.

  This method figures out the genotype for candidate_variant by matching alleles
  in candidate_variant with those used by the genotype assigned to
  truth_variant. For example, if candidate is A/C and truth is A/C with a 0/1
  genotype, then this function would return (0, 1) indicating that there's one
  copy of the A allele and one of C in truth. If the true genotype is 1/1, then
  this routine would return (1, 1).

  The routine allows candidate_variant and truth_variant to differ in both
  the number of alternate alleles, and even in the representation of the same
  alleles due to those differences. For example, candidate could be:

      AGT/A/AGTGT => 2 bp deletion and 2 bp insertion

  and truth could have:

      A/AGT => just the simplified 2 bp insertion

  And this routine will correctly equate the AGT/AGTGT allele in candidate
  with the A/AGT in truth and use the number of copies of AGT in truth to
  compute the number of copies of AGTGT when determining the returned genotype.

  Args:
    candidate_variant: Our candidate third_party.nucleus.protos.Variant variant.
    truth_variant: Our third_party.nucleus.protos.Variant truth variant
      containing true alleles and genotypes.

  Returns:
    A tuple genotypes with the same semantics at the genotype field of the
    VariantCall proto.

  Raises:
    ValueError: If candidate_variant is None, truth_variant is None, or
      truth_variant doesn't have genotypes.
  """
    if candidate_variant is None:
        raise ValueError('candidate_variant cannot be None')
    if truth_variant is None:
        raise ValueError('truth_variant cannot be None')
    if not variantcall_utils.has_genotypes(
            variant_utils.only_call(truth_variant)):
        raise ValueError(
            'truth_variant needs genotypes to be used for labeling',
            truth_variant)

    def _match_one_allele(true_allele):
        if true_allele == truth_variant.reference_bases:
            return 0
        else:
            simplified_true_allele = variant_utils.simplify_alleles(
                truth_variant.reference_bases, true_allele)
            for alt_index, alt_allele in enumerate(
                    candidate_variant.alternate_bases):
                simplified_alt_allele = variant_utils.simplify_alleles(
                    candidate_variant.reference_bases, alt_allele)
                if simplified_true_allele == simplified_alt_allele:
                    return alt_index + 1
            # If nothing matched, we don't have this alt, so the alt allele index for
            # should be 0 (i.e., not any alt).
            return 0

    # If our candidate_variant is a reference call, return a (0, 0) genotype.
    if variant_utils.is_ref(candidate_variant):
        return (0, 0)
    else:
        return tuple(
            sorted(
                _match_one_allele(true_allele) for true_allele in
                variant_utils.genotype_as_alleles(truth_variant)))
Example #5
0
def _genotype_from_matched_truth(candidate_variant, truth_variant):
  """Gets the diploid genotype for candidate_variant from matched truth_variant.

  This method figures out the genotype for candidate_variant by matching alleles
  in candidate_variant with those used by the genotype assigned to
  truth_variant. For example, if candidate is A/C and truth is A/C with a 0/1
  genotype, then this function would return (0, 1) indicating that there's one
  copy of the A allele and one of C in truth. If the true genotype is 1/1, then
  this routine would return (1, 1).

  The routine allows candidate_variant and truth_variant to differ in both
  the number of alternate alleles, and even in the representation of the same
  alleles due to those differences. For example, candidate could be:

      AGT/A/AGTGT => 2 bp deletion and 2 bp insertion

  and truth could have:

      A/AGT => just the simplified 2 bp insertion

  And this routine will correctly equate the AGT/AGTGT allele in candidate
  with the A/AGT in truth and use the number of copies of AGT in truth to
  compute the number of copies of AGTGT when determining the returned genotype.

  Args:
    candidate_variant: Our candidate third_party.nucleus.protos.Variant variant.
    truth_variant: Our third_party.nucleus.protos.Variant truth variant
      containing true alleles and genotypes.

  Returns:
    A tuple genotypes with the same semantics at the genotype field of the
    VariantCall proto.

  Raises:
    ValueError: If candidate_variant is None, truth_variant is None, or
      truth_variant doesn't have genotypes.
  """
  if candidate_variant is None:
    raise ValueError('candidate_variant cannot be None')
  if truth_variant is None:
    raise ValueError('truth_variant cannot be None')
  if not variantcall_utils.has_genotypes(
      variant_utils.only_call(truth_variant)):
    raise ValueError('truth_variant needs genotypes to be used for labeling',
                     truth_variant)

  def _match_one_allele(true_allele):
    if true_allele == truth_variant.reference_bases:
      return 0
    else:
      simplifed_true_allele = variant_utils.simplify_alleles(
          truth_variant.reference_bases, true_allele)
      for alt_index, alt_allele in enumerate(candidate_variant.alternate_bases):
        simplifed_alt_allele = variant_utils.simplify_alleles(
            candidate_variant.reference_bases, alt_allele)
        if simplifed_true_allele == simplifed_alt_allele:
          return alt_index + 1
      # If nothing matched, we don't have this alt, so the alt allele index for
      # should be 0 (i.e., not any alt).
      return 0

  # If our candidate_variant is a reference call, return a (0, 0) genotype.
  if variant_utils.is_ref(candidate_variant):
    return (0, 0)
  else:
    return tuple(
        sorted(
            _match_one_allele(true_allele) for true_allele in
            variant_utils.genotype_as_alleles(truth_variant)))
 def test_has_genotypes(self, genotype, expected):
     call = variants_pb2.VariantCall(genotype=genotype)
     actual = variantcall_utils.has_genotypes(call)
     self.assertEqual(actual, expected)
 def test_has_genotypes(self, genotype, expected):
   call = variants_pb2.VariantCall(genotype=genotype)
   actual = variantcall_utils.has_genotypes(call)
   self.assertEqual(actual, expected)