Example #1
0
def _get_variant_stats(variant, vaf_available=False, vcf_reader=None):
  """Builds the VariantStats record summarizing a single variant.

  Args:
    variant: The variant to summarize. Its single call is retrieved with
      variant_utils.only_call.
    vaf_available: bool. When true the VAF is looked up through _get_vaf;
      otherwise the vaf field is left as None.
    vcf_reader: Forwarded to _get_vaf when vaf_available is true.

  Returns:
    A VariantStats object populated from the variant and its call.
  """
  variant_type = _get_variant_type(variant)
  transition, transversion = _tstv(variant, variant_type)
  vaf = _get_vaf(variant, vcf_reader) if vaf_available else None

  is_variant = variant_utils.is_variant_call(variant)
  # Fetch the call once and reuse it for every per-call statistic.
  call = variant_utils.only_call(variant)
  depth = variantcall_utils.get_format(call, 'DP')
  genotype_quality = variantcall_utils.get_gq(call)
  genotype = str(sorted(variantcall_utils.get_gt(call)))

  return VariantStats(
      reference_name=variant.reference_name,
      # The reported position is variant.start shifted by one.
      position=(variant.start + 1),
      reference_bases=variant.reference_bases,
      alternate_bases=list(variant.alternate_bases),
      variant_type=variant_type,
      is_transition=transition,
      is_transversion=transversion,
      is_variant=is_variant,
      depth=depth,
      genotype_quality=genotype_quality,
      genotype=genotype,
      vaf=vaf,
      qual=variant.quality)
Example #2
0
 def test_invalid_only_call(self, num_calls):
   # only_call must raise a ValueError when the variant holds num_calls calls
   # (the test is fed call counts for which that error is expected).
   calls = [
       variants_pb2.VariantCall(call_set_name=str(x)) for x in range(num_calls)
   ]
   variant = variants_pb2.Variant(calls=calls)
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported spelling.
   with self.assertRaisesRegex(ValueError,
                               'Expected exactly one VariantCall'):
     variant_utils.only_call(variant)
Example #3
0
 def test_invalid_only_call(self, num_calls):
   # only_call must raise a ValueError when the variant holds num_calls calls
   # (the test is fed call counts for which that error is expected).
   calls = [
       variants_pb2.VariantCall(call_set_name=str(x)) for x in range(num_calls)
   ]
   variant = variants_pb2.Variant(calls=calls)
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported spelling.
   with self.assertRaisesRegex(ValueError,
                               'Expected exactly one VariantCall'):
     variant_utils.only_call(variant)
Example #4
0
  def test_add_label_to_example(self, label, expected_label_value):
    # Checks that add_label_to_example labels the example in place, leaves all
    # other features untouched, and stores the label's genotype in the variant.
    example = self._example_for_variant(label.variant)
    labeled = copy.deepcopy(example)
    actual = self.processor.add_label_to_example(labeled, label)

    # The add_label_to_example command modifies labeled and returns it.
    self.assertIs(actual, labeled)

    # Check that all keys from example are present in labeled. Use items()
    # rather than the Python 2-only iteritems() so this runs on Python 3 too.
    for key, value in example.features.feature.items():
      if key != 'variant/encoded':  # Special case tested below.
        self.assertEqual(value, labeled.features.feature[key])

    # The genotype of our example_variant should be set to the true genotype
    # according to our label.
    self.assertEqual(expected_label_value, tf_utils.example_label(labeled))
    labeled_variant = tf_utils.example_variant(labeled)
    call = variant_utils.only_call(labeled_variant)
    self.assertEqual(tuple(call.genotype), label.genotype)

    # The original variant and labeled_variant from our tf.Example should be
    # equal except for the genotype field, since this is set by
    # add_label_to_example. Clear both genotypes before comparing.
    label.variant.calls[0].genotype[:] = []
    call.genotype[:] = []
    self.assertEqual(label.variant, labeled_variant)
Example #5
0
    def test_add_label_to_example(self, label, expected_label_value):
        # Checks that add_label_to_example labels the example in place, keeps
        # every other feature, and stores the label's genotype in the variant.
        original = self._example_for_variant(label.variant)
        labeled = copy.deepcopy(original)
        returned = self.processor.add_label_to_example(labeled, label)

        # The very object that was passed in must be the one handed back.
        self.assertIs(returned, labeled)

        # Every feature of the original must survive unchanged; the encoded
        # variant is the exception and is inspected separately below.
        for key, value in original.features.feature.items():
            if key == 'variant/encoded':
                continue
            self.assertEqual(value, labeled.features.feature[key])

        # The example label and the genotype stored in the variant must both
        # reflect the truth label.
        self.assertEqual(expected_label_value,
                         tf_utils.example_label(labeled))
        labeled_variant = tf_utils.example_variant(labeled)
        labeled_call = variant_utils.only_call(labeled_variant)
        self.assertEqual(tuple(labeled_call.genotype), label.genotype)

        # Apart from the genotype written by add_label_to_example, the
        # variant in the example must equal the label's variant, so clear
        # both genotypes before comparing.
        label.variant.calls[0].genotype[:] = []
        labeled_call.genotype[:] = []
        self.assertEqual(label.variant, labeled_variant)
def add_call_to_variant(variant, predictions, qual_filter=0, sample_name=None):
  """Fills in Variant record using the prediction probabilities.

  This function sets the genotype, GQ, and genotype likelihoods of the
  variant's single call, as well as the variant.filter and variant.quality
  fields, based on the genotype probabilities in predictions.

  Args:
    variant: third_party.nucleus.protos.Variant protobuf
      to be filled in with info derived from predictions.
    predictions: N element array-like. The real-space probabilities of each
      genotype state for this variant.
    qual_filter: float. If predictions implies that this isn't a reference call
      and the QUAL of the prediction isn't larger than qual_filter variant will
      be marked as FILTERed.
    sample_name: str or None. The name of the sample to assign to the call's
      call_set_name field. When None, the existing call_set_name is left
      untouched.

  Returns:
    The input Variant record, modified in place.

  Raises:
    ValueError: If variant doesn't have exactly one variant.call record.
  """
  call = variant_utils.only_call(variant)
  # The reference allele counts as one allele on top of the alternates.
  n_alleles = len(variant.alternate_bases) + 1
  index, genotype = most_likely_genotype(predictions, n_alleles=n_alleles)
  gq, variant.quality = compute_quals(predictions, index)
  # Assigning None to a protobuf string field raises TypeError, so only
  # overwrite the sample name when one was actually supplied.
  if sample_name is not None:
    call.call_set_name = sample_name
  variantcall_utils.set_gt(call, genotype)
  variantcall_utils.set_gq(call, gq)
  gls = [genomics_math.perror_to_bounded_log10_perror(gp) for gp in predictions]
  variantcall_utils.set_gl(call, gls)
  variant.filter[:] = compute_filter_fields(variant, qual_filter)
  return variant
def examples_to_variants(examples_path, max_records=None):
    """Yields one Variant proto per site from the examples in examples_path.

    Reads the tf.Examples found at examples_path (which may be a sharded
    spec), extracts their variants, sorts them by
    variant_utils.variant_range_tuple, and for every group of variants that
    share the same range yields the first one as the representative.

    Args:
      examples_path: str. Path, or sharded spec, to labeled tf.Examples
        produced by DeepVariant in training mode.
      max_records: int or None. Maximum number of records to read, or None
        to read all of the records.

    Yields:
      nucleus.protos.Variant protos in coordinate-sorted order.

    Raises:
      ValueError: if a representative Variant doesn't have genotypes.
    """
    examples = io_utils.read_tfrecords(examples_path, max_records=max_records)
    all_variants = (tf_utils.example_variant(example) for example in examples)
    ordered = sorted(all_variants, key=variant_utils.variant_range_tuple)

    # groupby only merges adjacent items, hence the sort on the same key.
    grouped = itertools.groupby(ordered, variant_utils.variant_range_tuple)
    for _, same_range in grouped:
        representative = next(same_range)
        call = variant_utils.only_call(representative)
        if not variantcall_utils.has_genotypes(call):
            message = (
                'Variant {} does not have any genotypes. This tool only works with '
                'variants that have been labeled.')
            raise ValueError(
                message.format(variant_utils.variant_key(representative)))
        yield representative
Example #8
0
def _extract_single_sample_name(record):
  """Determines the sample name to use for a CallVariantsOutput record.

  The --sample_name flag takes precedence when it is set; otherwise the name
  is read from the call_set_name of the record's single VariantCall.

  Args:
    record: A deepvariant_pb2.CallVariantsOutput record.

  Returns:
    The value of --sample_name if set, else the call_set_name of the only
    VariantCall in record.variant.

  Raises:
    ValueError: There is not exactly one VariantCall in the proto or the
        call_set_name of the VariantCall is not populated.
  """
  flag_name = FLAGS.sample_name
  if flag_name:
    logging.info(
        '--sample_name is set in postprocess_variant. Using %s as the '
        'sample name.', flag_name)
    return flag_name

  name = variant_utils.only_call(record.variant).call_set_name
  if name:
    return name
  raise ValueError(
      'Error extracting name: no call_set_name set: {}'.format(record))
def examples_to_variants(examples_path, max_records=None):
  """Yields one Variant proto per site from the examples in examples_path.

  Reads the tf.Examples found at examples_path (which may be a sharded spec),
  extracts their variants, sorts them by variant_utils.variant_range_tuple,
  and for every group of variants that share the same range yields the first
  one as the representative.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Maximum number of records to read, or None, to
      read all of the records.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if a representative Variant doesn't have genotypes.
  """
  examples = io_utils.read_tfrecords(examples_path, max_records=max_records)
  all_variants = (tf_utils.example_variant(example) for example in examples)
  ordered = sorted(all_variants, key=variant_utils.variant_range_tuple)

  # groupby only merges adjacent items, hence the sort on the same key.
  grouped = itertools.groupby(ordered, variant_utils.variant_range_tuple)
  for _, same_range in grouped:
    representative = next(same_range)
    call = variant_utils.only_call(representative)
    if not variantcall_utils.has_genotypes(call):
      message = (
          'Variant {} does not have any genotypes. This tool only works with '
          'variants that have been labeled.')
      raise ValueError(
          message.format(variant_utils.variant_key(representative)))
    yield representative
Example #10
0
 def assertGVCF(self,
                gvcf,
                ref,
                gq,
                start,
                end,
                min_dp,
                chrom='chr1',
                gls=None,
                sample_name=None):
   # Asserts that gvcf looks like a gVCF record with the expected location,
   # reference bases, GQ and MIN_DP. chrom, gls and sample_name are only
   # checked when they are provided (truthy / not None).
   if chrom:
     self.assertEqual(gvcf.reference_name, chrom)
   only = variant_utils.only_call(gvcf)
   self.assertNotEmpty(gvcf.reference_name)
   self.assertEqual(gvcf.reference_bases, ref)
   self.assertEqual(gvcf.alternate_bases, ['<*>'])
   self.assertEqual(gvcf.start, start)
   # A falsy end means the record is expected to span a single base.
   expected_end = end or start + 1
   self.assertEqual(gvcf.end, expected_end)
   self.assertEqual(variantcall_utils.get_gq(only), gq)
   self.assertNotEmpty(only.genotype_likelihood)
   self.assertIn('MIN_DP', only.info)
   self.assertLen(only.info['MIN_DP'].values, 1)
   self.assertEqual(variantcall_utils.get_min_dp(only), min_dp)
   if gls is not None:
     observed_gls = list(gvcf.calls[0].genotype_likelihood)
     npt.assert_allclose(observed_gls, gls)
   if sample_name:
     observed_name = gvcf.calls[0].call_set_name
     self.assertEqual(observed_name, sample_name)
Example #11
0
 def test_modify_only_call(self):
   # Mutating the call returned by only_call must be visible through the
   # variant itself, without adding any extra calls.
   variant = variants_pb2.Variant(calls=[variants_pb2.VariantCall()])
   only = variant_utils.only_call(variant)
   only.call_set_name = 'name'
   only.genotype[:] = [0, 1]

   self.assertLen(variant.calls, 1)
   stored = variant.calls[0]
   self.assertEqual(stored.call_set_name, 'name')
   self.assertEqual(stored.genotype, [0, 1])
Example #12
0
 def test_modify_only_call(self):
   # Mutating the call returned by only_call must be visible through the
   # variant itself, without adding any extra calls.
   variant = variants_pb2.Variant(calls=[variants_pb2.VariantCall()])
   only = variant_utils.only_call(variant)
   only.call_set_name = 'name'
   only.genotype[:] = [0, 1]

   self.assertLen(variant.calls, 1)
   stored = variant.calls[0]
   self.assertEqual(stored.call_set_name, 'name')
   self.assertEqual(stored.genotype, [0, 1])
 def setUp(self):
     # Builds the shared fixture: a chr1 A->G variant with genotype 0/1,
     # GQ 59, and a DP format value of 20 on its single call.
     super(VcfStatsTest, self).setUp()
     variant = test_utils.make_variant(
         chrom='chr1', start=10, alleles=['A', 'G'], gt=[0, 1], gq=59)
     self.variant = variant
     call = variant_utils.only_call(variant)
     variantcall_utils.set_format(call, 'DP', 20)
Example #14
0
def uncall_homref_gt_if_lowqual(variant, min_homref_gq):
  """Sets the genotype to "./." for low-GQ variants filtered as RefCall.

  The genotype of the variant's single call is replaced by [-1, -1] when the
  variant's filter field is exactly the "RefCall" filter (meaning an example
  was created for this site but the CNN didn't call it as a variant) and the
  call's GQ is below min_homref_gq. Otherwise the variant is left unchanged.
  See http://internal for more info.

  Args:
    variant: third_party.nucleus.protos.Variant proto. Modified in place.
    min_homref_gq: float. GQ threshold below which a RefCall is un-called.
  """
  vcall = variant_utils.only_call(variant)
  is_ref_call = variant.filter == [dv_vcf_constants.DEEP_VARIANT_REF_FILTER]
  if not is_ref_call:
    return
  if variantcall_utils.get_gq(vcall) < min_homref_gq:
    vcall.genotype[:] = [-1, -1]
def _zero_scale_gl(variant):
    """Shifts the call's GLs so the largest becomes zero.

    When writing variants using VcfWriter, GLs are converted to PLs, which is
    an integer format scaled so the most likely genotype has value 0. This
    function modifies the input variant to mimic that GL -> PL -> GL round
    trip by subtracting the maximum GL from every GL.

    Args:
      variant: Variant proto. The variant to scale; modified in place.

    Returns:
      variant: Variant proto. The input variant with its GLs modified.
    """
    call = variant_utils.only_call(variant)
    largest = max(call.genotype_likelihood)
    shifted = [gl - largest for gl in call.genotype_likelihood]
    call.genotype_likelihood[:] = shifted
    return variant
Example #16
0
def _extract_single_sample_name(record):
    """Reads the sample name from a CallVariantsOutput record.

    Args:
      record: A deepvariant_pb2.CallVariantsOutput record.

    Returns:
      The call_set_name of the only VariantCall in record.variant.

    Raises:
      ValueError: There is not exactly one VariantCall in the proto or the
          call_set_name of the VariantCall is not populated.
    """
    sample_name = variant_utils.only_call(record.variant).call_set_name
    if sample_name:
        return sample_name
    raise ValueError(
        'Error extracting name: no call_set_name set: {}'.format(record))
def _extract_single_sample_name(record):
  """Reads the sample name from a CallVariantsOutput record.

  Args:
    record: A deepvariant_pb2.CallVariantsOutput record.

  Returns:
    The call_set_name of the only VariantCall in record.variant.

  Raises:
    ValueError: There is not exactly one VariantCall in the proto or the
        call_set_name of the VariantCall is not populated.
  """
  sample_name = variant_utils.only_call(record.variant).call_set_name
  if sample_name:
    return sample_name
  raise ValueError(
      'Error extracting name: no call_set_name set: {}'.format(record))
Example #18
0
  def verify_variants(self, variants, region, options, is_gvcf):
    # Sanity-checks every Variant proto in variants: it must lie on the
    # region's chromosome, start within the region, have reference and
    # alternate bases, and carry exactly one call named after the configured
    # sample. When is_gvcf is true the call is additionally held to the
    # expectations for gVCF records.
    for variant in variants:
      self.assertEqual(variant.reference_name, region.reference_name)
      self.assertNotEqual(variant.reference_bases, '')
      self.assertGreater(len(variant.alternate_bases), 0)
      self.assertGreaterEqual(variant.start, region.start)
      self.assertLessEqual(variant.start, region.end)
      self.assertEqual(len(variant.calls), 1)

      only = variant_utils.only_call(variant)
      self.assertEqual(only.call_set_name,
                       options.variant_caller_options.sample_name)
      if not is_gvcf:
        continue
      # GVCF records should have 0/0 genotypes as they are reference sites,
      # have genotype likelihoods and a GQ value.
      self.assertEqual(only.genotype, [0, 0])
      self.assertEqual(len(only.genotype_likelihood), 3)
      self.assertGreaterEqual(variantcall_utils.get_gq(only), 0)
def add_call_to_variant(variant, predictions, qual_filter=0, sample_name=None):
    """Fills in Variant record using the prediction probabilities.

    This function sets the genotype, GQ, and genotype likelihoods of the
    variant's single call, as well as the variant.filter and variant.quality
    fields, based on the genotype probabilities in predictions. Finally, the
    genotype is un-called through uncall_homref_gt_if_lowqual using the
    --cnn_homref_call_min_gq threshold.

    Args:
      variant: third_party.nucleus.protos.Variant protobuf
        to be filled in with info derived from predictions.
      predictions: N element array-like. The real-space probabilities of each
        genotype state for this variant.
      qual_filter: float. If predictions implies that this isn't a reference
        call and the QUAL of the prediction isn't larger than qual_filter
        variant will be marked as FILTERed.
      sample_name: str or None. The name of the sample to assign to the call's
        call_set_name field. When None, the existing call_set_name is left
        untouched.

    Returns:
      The input Variant record, modified in place.

    Raises:
      ValueError: If variant doesn't have exactly one variant.call record.
    """
    call = variant_utils.only_call(variant)
    # The reference allele counts as one allele on top of the alternates.
    n_alleles = len(variant.alternate_bases) + 1
    index, genotype = most_likely_genotype(predictions, n_alleles=n_alleles)
    gq, variant.quality = compute_quals(predictions, index)
    # Assigning None to a protobuf string field raises TypeError, so only
    # overwrite the sample name when one was actually supplied.
    if sample_name is not None:
        call.call_set_name = sample_name
    variantcall_utils.set_gt(call, genotype)
    variantcall_utils.set_gq(call, gq)
    gls = [
        genomics_math.perror_to_bounded_log10_perror(gp) for gp in predictions
    ]
    variantcall_utils.set_gl(call, gls)
    variant.filter[:] = compute_filter_fields(variant, qual_filter)
    # Must run after the filter is computed: it inspects variant.filter.
    uncall_homref_gt_if_lowqual(variant, FLAGS.cnn_homref_call_min_gq)
    return variant
Example #20
0
def _transform_to_gvcf_record(variant):
  """Adds the gVCF alt allele and matching per-allele values to a variant.

  If the gVCF allele is already among the alternate bases the variant is
  returned untouched. Otherwise the allele is appended, the call's genotype
  likelihoods are extended for the new genotypes, and the 'AD' and 'VAF' info
  entries (when present) each receive a zero for the new allele.

  Args:
    variant: third_party.nucleus.protos.Variant. The Variant to modify.

  Returns:
    The variant after applying the modification to its alleles and
    allele-related FORMAT fields.
  """
  gvcf_allele = vcf_constants.GVCF_ALT_ALLELE
  if gvcf_allele in variant.alternate_bases:
    return variant

  variant.alternate_bases.append(gvcf_allele)
  # One new GL pairs the gVCF allele with each of the other alleles, plus one
  # more for the homozygous gVCF genotype.
  extra_gl_count = len(variant.alternate_bases) + 1
  call = variant_utils.only_call(variant)
  call.genotype_likelihood.extend([_GVCF_ALT_ALLELE_GL] * extra_gl_count)

  info = call.info
  if info and 'AD' in info:
    info['AD'].values.extend([struct_pb2.Value(int_value=0)])
  if info and 'VAF' in info:
    info['VAF'].values.extend([struct_pb2.Value(number_value=0)])
  return variant
def _transform_to_gvcf_record(variant):
  """Adds the gVCF alt allele and matching per-allele values to a variant.

  If the gVCF allele is already among the alternate bases the variant is
  returned untouched. Otherwise the allele is appended, the call's genotype
  likelihoods are extended for the new genotypes, and the 'AD' and 'VAF' info
  entries (when present) each receive a zero for the new allele.

  Args:
    variant: third_party.nucleus.protos.Variant. The Variant
      to modify.

  Returns:
    The variant after applying the modification to its alleles and
    allele-related FORMAT fields.
  """
  gvcf_allele = vcf_constants.GVCF_ALT_ALLELE
  if gvcf_allele in variant.alternate_bases:
    return variant

  variant.alternate_bases.append(gvcf_allele)
  # One new GL pairs the gVCF allele with each of the other alleles, plus one
  # more for the homozygous gVCF genotype.
  extra_gl_count = len(variant.alternate_bases) + 1
  call = variant_utils.only_call(variant)
  call.genotype_likelihood.extend([_GVCF_ALT_ALLELE_GL] * extra_gl_count)

  info = call.info
  if info and 'AD' in info:
    info['AD'].values.extend([struct_pb2.Value(int_value=0)])
  if info and 'VAF' in info:
    info['VAF'].values.extend([struct_pb2.Value(number_value=0)])
  return variant
Example #22
0
  def verify_variants(self, variants, region, options, is_gvcf):
    # Sanity-checks every Variant proto in variants. When a region is given,
    # the variant must lie on its chromosome and start within it. Each variant
    # must have reference and alternate bases and exactly one call named after
    # the first configured sample. When is_gvcf is true the call is
    # additionally held to the expectations for gVCF records.
    for variant in variants:
      if region:
        self.assertEqual(variant.reference_name, region.reference_name)
        self.assertGreaterEqual(variant.start, region.start)
        self.assertLessEqual(variant.start, region.end)
      self.assertNotEqual(variant.reference_bases, '')
      self.assertNotEmpty(variant.alternate_bases)
      self.assertLen(variant.calls, 1)

      only = variant_utils.only_call(variant)
      actual_name = only.call_set_name
      expected_name = (
          options.sample_options[0].variant_caller_options.sample_name)
      self.assertEqual(actual_name, expected_name)
      if not is_gvcf:
        continue
      # GVCF records should have 0/0 or ./. (un-called) genotypes as they are
      # reference sites, have genotype likelihoods and a GQ value.
      self.assertIn(list(only.genotype), [[0, 0], [-1, -1]])
      self.assertLen(only.genotype_likelihood, 3)
      self.assertGreaterEqual(variantcall_utils.get_gq(only), 0)
Example #23
0
def _allele_indices_configuration_likelihood(variants, allele_indices_config):
  """Returns the joint likelihood of the alleles given to the variants.

  The result is the sum, over all variants, of
  variant_utils.genotype_likelihood for that variant's single call and its
  assigned allele indices (presumably log-space values, since they are added;
  confirm against genotype_likelihood).

  Args:
    variants: list(Variant). The variants with associated likelihoods.
    allele_indices_config: list((int, int)). The allele indices to assign to
      each variant.

  Returns:
    The joint likelihood of the particular allele configuration.

  Raises:
    ValueError: variants and allele_indices_config do not have the same length.
  """
  n_variants = len(variants)
  n_assignments = len(allele_indices_config)
  if n_variants != n_assignments:
    raise ValueError(
        'len(variants) must equal len(allele_indices_config): {} vs {}'.format(
            n_variants, n_assignments))

  # Accumulate left to right so the floating-point result is reproducible.
  total = 0
  for variant, allele_indices in zip(variants, allele_indices_config):
    call = variant_utils.only_call(variant)
    total = total + variant_utils.genotype_likelihood(call, allele_indices)
  return total
def _genotype_from_matched_truth(candidate_variant, truth_variant):
    """Derives the candidate's diploid genotype from a matched truth variant.

    Each allele in the truth genotype is mapped onto the candidate's alleles.
    For example, if candidate is A/C and truth is A/C with a 0/1 genotype,
    this returns (0, 1), meaning one copy of A and one of C. If the true
    genotype is 1/1, this returns (1, 1).

    The candidate and truth variants may differ in the number of alternate
    alleles and in how the same allele is represented. For example, the
    candidate could be:

        AGT/A/AGTGT => 2 bp deletion and 2 bp insertion

    and truth could have:

        A/AGT => just the simplified 2 bp insertion

    Alleles are compared after variant_utils.simplify_alleles, so the
    AGT/AGTGT allele in candidate is equated with A/AGT in truth, and the
    number of copies of AGT in truth determines the number of copies of AGTGT
    in the returned genotype. Truth alleles with no counterpart in the
    candidate map to index 0.

    Args:
      candidate_variant: Our candidate third_party.nucleus.protos.Variant
        variant.
      truth_variant: Our third_party.nucleus.protos.Variant truth variant
        containing true alleles and genotypes.

    Returns:
      A sorted tuple of allele indices with the same semantics as the genotype
      field of the VariantCall proto.

    Raises:
      ValueError: If candidate_variant is None, truth_variant is None, or
        truth_variant doesn't have genotypes.
    """
    if candidate_variant is None:
        raise ValueError('candidate_variant cannot be None')
    if truth_variant is None:
        raise ValueError('truth_variant cannot be None')
    truth_call = variant_utils.only_call(truth_variant)
    if not variantcall_utils.has_genotypes(truth_call):
        raise ValueError(
            'truth_variant needs genotypes to be used for labeling',
            truth_variant)

    def _candidate_index_for(true_allele):
        # The truth reference allele always maps to index 0.
        if true_allele == truth_variant.reference_bases:
            return 0
        simplified_truth = variant_utils.simplify_alleles(
            truth_variant.reference_bases, true_allele)
        candidate_alts = candidate_variant.alternate_bases
        for alt_index, alt_allele in enumerate(candidate_alts):
            simplified_candidate = variant_utils.simplify_alleles(
                candidate_variant.reference_bases, alt_allele)
            if simplified_candidate == simplified_truth:
                return alt_index + 1
        # No candidate alt matches this truth allele, so report it as
        # index 0 (i.e., not any alt).
        return 0

    # A reference-call candidate is always labeled (0, 0).
    if variant_utils.is_ref(candidate_variant):
        return (0, 0)

    truth_alleles = variant_utils.genotype_as_alleles(truth_variant)
    indices = sorted(_candidate_index_for(allele) for allele in truth_alleles)
    return tuple(indices)
Example #25
0
def _genotype_from_matched_truth(candidate_variant, truth_variant):
  """Derives the candidate's diploid genotype from a matched truth variant.

  Each allele in the truth genotype is mapped onto the candidate's alleles.
  For example, if candidate is A/C and truth is A/C with a 0/1 genotype, this
  returns (0, 1), meaning one copy of A and one of C. If the true genotype is
  1/1, this returns (1, 1).

  The candidate and truth variants may differ in the number of alternate
  alleles and in how the same allele is represented. For example, the
  candidate could be:

      AGT/A/AGTGT => 2 bp deletion and 2 bp insertion

  and truth could have:

      A/AGT => just the simplified 2 bp insertion

  Alleles are compared after variant_utils.simplify_alleles, so the AGT/AGTGT
  allele in candidate is equated with A/AGT in truth, and the number of copies
  of AGT in truth determines the number of copies of AGTGT in the returned
  genotype. Truth alleles with no counterpart in the candidate map to index 0.

  Args:
    candidate_variant: Our candidate third_party.nucleus.protos.Variant variant.
    truth_variant: Our third_party.nucleus.protos.Variant truth variant
      containing true alleles and genotypes.

  Returns:
    A sorted tuple of allele indices with the same semantics as the genotype
    field of the VariantCall proto.

  Raises:
    ValueError: If candidate_variant is None, truth_variant is None, or
      truth_variant doesn't have genotypes.
  """
  if candidate_variant is None:
    raise ValueError('candidate_variant cannot be None')
  if truth_variant is None:
    raise ValueError('truth_variant cannot be None')
  truth_call = variant_utils.only_call(truth_variant)
  if not variantcall_utils.has_genotypes(truth_call):
    raise ValueError('truth_variant needs genotypes to be used for labeling',
                     truth_variant)

  def _candidate_index_for(true_allele):
    # The truth reference allele always maps to index 0.
    if true_allele == truth_variant.reference_bases:
      return 0
    simplified_truth = variant_utils.simplify_alleles(
        truth_variant.reference_bases, true_allele)
    candidate_alts = candidate_variant.alternate_bases
    for alt_index, alt_allele in enumerate(candidate_alts):
      simplified_candidate = variant_utils.simplify_alleles(
          candidate_variant.reference_bases, alt_allele)
      if simplified_candidate == simplified_truth:
        return alt_index + 1
    # No candidate alt matches this truth allele, so report it as index 0
    # (i.e., not any alt).
    return 0

  # A reference-call candidate is always labeled (0, 0).
  if variant_utils.is_ref(candidate_variant):
    return (0, 0)

  truth_alleles = variant_utils.genotype_as_alleles(truth_variant)
  indices = sorted(_candidate_index_for(allele) for allele in truth_alleles)
  return tuple(indices)
Example #26
0
 def test_only_call(self):
   # only_call must hand back the single call stored in the variant.
   expected_call = variants_pb2.VariantCall(
       call_set_name='name', genotype=[0, 1])
   variant = variants_pb2.Variant(calls=[expected_call])
   self.assertEqual(variant_utils.only_call(variant), expected_call)
Example #27
0
def _nonref_genotype_count(variant):
  """Counts the genotype entries of the variant's only call that exceed 0.

  Index 0 denotes the reference allele, so this is the number of
  non-reference alleles in the called genotype.
  """
  genotype = variant_utils.only_call(variant).genotype
  return len([allele for allele in genotype if allele > 0])
Example #28
0
def _resolve_overlapping_variants(overlapping_variants):
  """Yields variants with compatible haplotypes, if possible.

  The input variants are yielded unchanged when there is only one of them,
  when their called genotypes are already compatible, when there are more than
  _MAX_OVERLAPPING_VARIANTS_TO_RESOLVE of them, or when the joint and marginal
  likelihood calculations disagree. Only when those two calculations agree are
  modified deep copies yielded, carrying the resolved genotypes and rescaled
  genotype likelihoods; the input protos themselves are never mutated here.

  Args:
    overlapping_variants: list(Variant). A non-empty list of Variant protos in
      coordinate-sorted order that overlap on the reference genome and are
      predicted to contain alternate allele genotypes.

  Yields:
    Variant protos in coordinate-sorted order that try to resolve incompatible
    haplotypes.
  """
  # Short circuit the simplest case: A single variant in a region is compatible
  # with itself by definition.
  if len(overlapping_variants) == 1:
    yield overlapping_variants[0]
    return

  # If the actual genotype calls are compatible, we can safely return those
  # since they would be the most likely configuration also when restricting to
  # only valid configurations of genotype calls.
  calculator = _VariantCompatibilityCalculator(overlapping_variants)
  nonref_counts = [_nonref_genotype_count(v) for v in overlapping_variants]
  if calculator.all_variants_compatible(nonref_counts):
    logging.info('Overlapping variants are naturally compatible: %s',
                 overlapping_variants)
    for variant in overlapping_variants:
      yield variant
    return

  # The actual genotype calls produce an inconsistent haplotype. If the number
  # of affected variants is "too large", avoid processing since this is an
  # exponential process.
  if len(overlapping_variants) > _MAX_OVERLAPPING_VARIANTS_TO_RESOLVE:
    logging.warning(
        'Overlapping variants are not naturally compatible, and there are too '
        'many to exhaustively search (%s). Returning variants without '
        'modification, beginning with %s.', len(overlapping_variants),
        overlapping_variants[0])
    for variant in overlapping_variants:
      yield variant
    return

  # Otherwise, the actual genotype calls are incompatible. Since the genotype
  # likelihoods are generally well-calibrated, we examine all configurations of
  # genotypes that create compatible haplotypes and retain the single
  # configuration with the highest joint likelihood across all variants as the
  # proposed genotype assignment. Separately, we rescale the likelihood of each
  # individual variant using only the valid genotype configurations. If the
  # results are concordant (i.e., the genotype predicted by the marginal
  # likelihood for each variant is the same as the genotype predicted when
  # maximizing the joint likelihood across all variants), we return variants
  # with those calls and the rescaled likelihoods. Otherwise, we log a warning
  # and emit the original (incompatible) variants.
  #
  # For example, a biallelic deletion with probabilities of homref, het, homalt
  # = 0.01, 0.9, 0.09 and inside it a biallelic SNP with probs 0.02, 0.48, 0.5.
  # Naively this would be called as a heterozygous indel and a homozygous SNP,
  # which is impossible as that would require three alternate alleles in
  # total. The algorithm does the following:
  #
  #   Indel    SNP    Joint prob
  #   0/0      0/0    0.01 * 0.02 = 0.0002
  #   0/0      0/1    0.01 * 0.48 = 0.0048
  #   0/0      1/1    0.01 * 0.50 = 0.0050
  #   0/1      0/0    0.90 * 0.02 = 0.0180
  #   0/1      0/1    0.90 * 0.48 = 0.4320*
  #   0/1      1/1    <invalid>   = 0
  #   1/1      0/0    0.09 * 0.02 = 0.0018
  #   1/1      0/1    <invalid>   = 0
  #   1/1      1/1    <invalid>   = 0
  #
  #   So using the highest joint likelihood, we predict het indel and het SNP.
  #
  #   The marginal probability of each genotype for the indel is:
  #   0/0:  0.0002 + 0.0048 + 0.0050 = 0.01
  #   0/1:  0.0180 + 0.4320          = 0.45
  #   1/1:  0.0018                   = 0.0018
  #
  #   which after normalizing to sum to 1 is roughly 0.022, 0.974, 0.004.
  #   The marginal probability for the SNP, after performing similar
  #   calculations, is 0.043, 0.946, 0.011. So the marginals also predict a het
  #   indel and a het SNP. Since the two calculations agree, we use this
  #   genotype call and modified likelihoods.
  #
  # First, we find all non-reference count configurations that are compatible.
  # This represents each variant solely based on its number of non-reference
  # genotypes, and assumes that variants are compatible if the total number of
  # non-reference genotypes at a single position is at most two. By using
  # non-reference counts, we avoid testing multiple allele configurations that
  # will return the same result (e.g. a variant with two possible alternate
  # alleles has three allele configurations that are homozygous alternate
  # [1/1, 1/2, 2/2] and either all or none of them will be valid depending on
  # the variants it interacts with).
  valid_nonref_count_configurations = [
      conf for conf in itertools.product(
          [0, 1, 2], repeat=len(overlapping_variants))
      if calculator.all_variants_compatible(conf)
  ]

  # Next, we find the single compatible variant assignment with the individually
  # highest likelihood and track the total likelihood distributed to all variant
  # genotypes.
  likelihood_aggregators = [
      _LikelihoodAggregator(len(v.alternate_bases))
      for v in overlapping_variants
  ]
  most_likely_allele_indices_config = None
  most_likely_likelihood = None
  for nonref_count_config in valid_nonref_count_configurations:
    for allele_indices_config in _get_all_allele_indices_configurations(
        overlapping_variants, nonref_count_config):
      config_likelihood = _allele_indices_configuration_likelihood(
          overlapping_variants, allele_indices_config)
      # Track the best joint configuration seen so far; ties keep the first.
      if (most_likely_likelihood is None or
          config_likelihood > most_likely_likelihood):
        most_likely_likelihood = config_likelihood
        most_likely_allele_indices_config = allele_indices_config
      # Every valid configuration contributes to each variant's marginal.
      for aggregator, allele_indices in zip(likelihood_aggregators,
                                            allele_indices_config):
        aggregator.add(allele_indices, config_likelihood)

  # Compare the per-variant marginal prediction with the joint prediction.
  marginal_allele_indices_config = tuple(
      agg.most_likely_allele_indices() for agg in likelihood_aggregators)
  if marginal_allele_indices_config == most_likely_allele_indices_config:
    logging.info(
        'Overlapping variants are not naturally compatible, but the genotype '
        'configuration with the most likely joint likelihood is the same as '
        'that from the scaled marginal likelihoods: %s',
        overlapping_variants[0])
    # Collapse the probabilities of all configurations to a single GL for each
    # allele, independently for each variant.
    scaled_gls = [agg.scaled_likelihoods() for agg in likelihood_aggregators]

    for variant, allele_indices, gls in zip(
        overlapping_variants, most_likely_allele_indices_config, scaled_gls):
      # Work on a deep copy so the caller's variant is left untouched.
      newvariant = copy.deepcopy(variant)
      call = variant_utils.only_call(newvariant)
      call.genotype[:] = allele_indices
      call.genotype_likelihood[:] = gls
      yield newvariant
  else:
    logging.warning(
        'Overlapping variants are not naturally compatible, and the genotype '
        'configuration with the most likely joint likelihood is different from '
        'that using the scaled marginal likelihoods: %s',
        overlapping_variants[0])
    # The two predictions disagree, so fall back to emitting the original
    # variants without modification.
    for variant in overlapping_variants:
      yield variant
Example #29
0
def _get_vaf(variant, vcf_reader):
  """Gets the VAF (variant allele frequency).

  Returns the sum of the 'VAF' format values of the variant's only call.
  """
  call = variant_utils.only_call(variant)
  vaf_values = variantcall_utils.get_format(call, 'VAF', vcf_reader)
  return sum(vaf_values)
Example #30
0
 def test_only_call(self):
   # only_call must hand back the single call stored in the variant.
   expected_call = variants_pb2.VariantCall(
       call_set_name='name', genotype=[0, 1])
   variant = variants_pb2.Variant(calls=[expected_call])
   self.assertEqual(variant_utils.only_call(variant), expected_call)