def verify_examples(self, examples_filename, region, options,
                        verify_labels):
        """Runs structural sanity checks on the tf.Examples stored in a file.

        Args:
          examples_filename: path handed to io_utils.read_tfrecords.
          region: forwarded unchanged to self.verify_variants.
          options: forwarded unchanged to self.verify_variants.
          verify_labels: when truthy, additionally require the 'label' and
            'truth_variant/encoded' features and check that each example's
            variant and truth variant report the same variant_position.

        Returns:
          The list of examples read from examples_filename.
        """
        required_keys = [
            'variant/encoded', 'locus', 'image/format', 'image/encoded',
            'alt_allele_indices/encoded'
        ]
        if verify_labels:
            required_keys.extend(['label', 'truth_variant/encoded'])

        examples = list(io_utils.read_tfrecords(examples_filename))
        for record in examples:
            feature_map = record.features.feature
            for key in required_keys:
                self.assertIn(key, feature_map)

            # Every example must reference at least one alt allele.
            # pylint: disable=g-explicit-length-test
            alt_indices = tf_utils.example_alt_alleles_indices(record)
            self.assertGreater(len(alt_indices), 0)

            if not verify_labels:
                continue

            # The candidate variant and its truth variant must share a start.
            candidate_position = variantutils.variant_position(
                tf_utils.example_variant(record))
            truth_position = variantutils.variant_position(
                tf_utils.example_truth_variant(record))
            self.assertEqual(candidate_position, truth_position)

        # Finally make sure the variants carried by the examples are sane.
        self.verify_variants(
            [tf_utils.example_variant(record) for record in examples],
            region, options, is_gvcf=False)

        return examples
def examples_to_variants(examples_path, max_records=None):
    """Yields one Variant proto per site from the examples in examples_path.

    The tf.Examples are read from examples_path (which may be a sharded
    spec), their variants are sorted by variant_range_tuple, and variants
    sharing the same range tuple are collapsed to a single representative
    (the first one in sorted order), e.g. when several examples exist for
    different alt_alleles of the same site.

    Args:
      examples_path: str. Path, or sharded spec, to labeled tf.Examples
        produced by DeepVariant in training mode.
      max_records: int or None. Maximum number of records to read, or None
        to read all of the records.

    Yields:
      nucleus.protos.Variant protos in coordinate-sorted order.

    Raises:
      ValueError: if a representative Variant doesn't have genotypes.
    """
    records = io_utils.read_tfrecords(examples_path, max_records=max_records)
    ordered = sorted(
        map(tf_utils.example_variant, records),
        key=variant_utils.variant_range_tuple)

    grouped = itertools.groupby(ordered, variant_utils.variant_range_tuple)
    for _, same_site in grouped:
        # Only the first variant of each group is inspected and emitted.
        representative = next(same_site)
        call = variant_utils.only_call(representative)
        if not variantcall_utils.has_genotypes(call):
            message = (
                'Variant {} does not have any genotypes. This tool only works with '
                'variants that have been labeled.')
            raise ValueError(
                message.format(variant_utils.variant_key(representative)))
        yield representative
    def test_create_pileup_examples(self):
        """create_pileup_examples yields one tf.Example per pileup image.

        The pileup image creator and `_encode_tensor` are mocked so that two
        (alt_alleles, tensor) pairs come back for a single candidate call; the
        test then checks each resulting example carries the matching alt
        alleles, variant, encoded image, shape and format.
        """
        self.processor.pic = mock.Mock()
        self.add_mock('_encode_tensor',
                      side_effect=[
                          ('tensor1', self.default_shape, self.default_format),
                          ('tensor2', self.default_shape, self.default_format)
                      ])
        dv_call = mock.Mock()
        dv_call.variant = test_utils.make_variant(start=10,
                                                  alleles=['A', 'C', 'G'])
        alt1, alt2 = ['C'], ['G']
        expected_pairs = [(alt1, 'tensor1'), (alt2, 'tensor2')]
        # Hand the mock its own list so the expectations cannot be mutated
        # through the mocked return value.
        self.processor.pic.create_pileup_images.return_value = list(
            expected_pairs)

        actual = self.processor.create_pileup_examples(dv_call)

        self.processor.pic.create_pileup_images.assert_called_once_with(
            dv_call)

        # assertEqual, not the deprecated assertEquals alias (removed from
        # unittest in Python 3.12).
        self.assertEqual(len(actual), 2)
        for ex, (alt, img) in zip(actual, expected_pairs):
            self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
            self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
            self.assertEqual(tf_utils.example_encoded_image(ex), img)
            self.assertEqual(tf_utils.example_image_shape(ex),
                             self.default_shape)
            self.assertEqual(tf_utils.example_image_format(ex),
                             self.default_format)
Example #4
0
  def test_add_label_to_example(self, label, expected_label_value):
    """add_label_to_example labels the example in place and sets the genotype.

    Args:
      label: label object exposing `.variant` and `.genotype`; passed to
        add_label_to_example.
      expected_label_value: value that tf_utils.example_label must report for
        the labeled example.
    """
    example = self._example_for_variant(label.variant)
    labeled = copy.deepcopy(example)
    actual = self.processor.add_label_to_example(labeled, label)

    # The add_label_to_example command modifies labeled and returns it.
    self.assertIs(actual, labeled)

    # Check that all keys from example are present in labeled.
    # items() (not the Python-2-only iteritems()) works on both Python 2 and 3.
    for key, value in example.features.feature.items():
      if key != 'variant/encoded':  # Special case tested below.
        self.assertEqual(value, labeled.features.feature[key])

    # The genotype of our example_variant should be set to the true genotype
    # according to our label.
    self.assertEqual(expected_label_value, tf_utils.example_label(labeled))
    labeled_variant = tf_utils.example_variant(labeled)
    call = variant_utils.only_call(labeled_variant)
    self.assertEqual(tuple(call.genotype), label.genotype)

    # The original variant and labeled_variant from our tf.Example should be
    # equal except for the genotype field, since this is set by
    # add_label_to_example.
    label.variant.calls[0].genotype[:] = []
    call.genotype[:] = []
    self.assertEqual(label.variant, labeled_variant)
Example #5
0
    def test_add_label_to_example(self, label, expected_label_value):
        """add_label_to_example labels in place and records the genotype.

        Args:
          label: label object exposing `.variant` and `.genotype`; passed to
            add_label_to_example.
          expected_label_value: value that tf_utils.example_label must report
            for the labeled example.
        """
        original = self._example_for_variant(label.variant)
        labeled = copy.deepcopy(original)
        returned = self.processor.add_label_to_example(labeled, label)

        # add_label_to_example mutates its argument and hands it back.
        self.assertIs(returned, labeled)

        # Every feature except the encoded variant must be left untouched.
        for key, value in original.features.feature.items():
            if key == 'variant/encoded':  # Special case tested below.
                continue
            self.assertEqual(value, labeled.features.feature[key])

        # The label value and the genotype on the single call must both
        # reflect the truth carried by `label`.
        self.assertEqual(expected_label_value, tf_utils.example_label(labeled))
        labeled_variant = tf_utils.example_variant(labeled)
        labeled_call = variant_utils.only_call(labeled_variant)
        self.assertEqual(tuple(labeled_call.genotype), label.genotype)

        # Apart from the genotype (written by add_label_to_example) the
        # variants must be identical, so blank the genotypes and compare.
        label.variant.calls[0].genotype[:] = []
        labeled_call.genotype[:] = []
        self.assertEqual(label.variant, labeled_variant)
def examples_to_variants(examples_path, max_records=None):
  """Yields one Variant proto per site from the examples in examples_path.

  The tf.Examples are read from examples_path (which may be a sharded spec),
  their variants are sorted by variant_range_tuple, and variants sharing the
  same range tuple are collapsed to a single representative (the first one in
  sorted order), e.g. when several examples exist for different alt_alleles of
  the same site.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Maximum number of records to read, or None, to
      read all of the records.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if a representative Variant doesn't have genotypes.
  """
  records = io_utils.read_tfrecords(examples_path, max_records=max_records)
  ordered = sorted(
      map(tf_utils.example_variant, records),
      key=variant_utils.variant_range_tuple)

  grouped = itertools.groupby(ordered, variant_utils.variant_range_tuple)
  for _, same_site in grouped:
    # Only the first variant of each group is inspected and emitted.
    representative = next(same_site)
    call = variant_utils.only_call(representative)
    if not variantcall_utils.has_genotypes(call):
      message = (
          'Variant {} does not have any genotypes. This tool only works with '
          'variants that have been labeled.')
      raise ValueError(
          message.format(variant_utils.variant_key(representative)))
    yield representative
Example #7
0
  def testMakeExample(self):
    """make_example output can be read back through the tf_utils accessors."""
    built = tf_utils.make_example(
        self.variant, self.alts, self.encoded_image, self.default_shape,
        self.default_format)

    # Image payload and its declared format.
    stored_image = tf_utils.example_encoded_image(built)
    self.assertEqual(self.encoded_image, stored_image)
    format_values = built.features.feature['image/format'].bytes_list.value
    self.assertEqual('raw', format_values[0])

    # Variant and the fields derived from it.
    self.assertEqual(self.variant, tf_utils.example_variant(built))
    self.assertEqual('1:11-11', tf_utils.example_locus(built))
    self.assertEqual([0], tf_utils.example_alt_alleles_indices(built))
    self.assertEqual('1:11:C->A', tf_utils.example_key(built))
Example #8
0
  def verify_examples(self, examples_filename, region, options, verify_labels):
    """Runs structural sanity checks on the tf.Examples stored in a file.

    Args:
      examples_filename: path handed to io_utils.read_tfrecords.
      region: forwarded unchanged to self.verify_variants.
      options: forwarded unchanged to self.verify_variants.
      verify_labels: when truthy, the 'label' feature is required as well.

    Returns:
      The list of examples read from examples_filename.
    """
    required_keys = [
        'variant/encoded', 'locus', 'image/format', 'image/encoded',
        'alt_allele_indices/encoded'
    ]
    if verify_labels:
      required_keys.append('label')

    examples = list(io_utils.read_tfrecords(examples_filename))
    for record in examples:
      feature_map = record.features.feature
      for key in required_keys:
        self.assertIn(key, feature_map)
      # Every example must reference at least one alt allele.
      # pylint: disable=g-explicit-length-test
      alt_indices = tf_utils.example_alt_alleles_indices(record)
      self.assertGreater(len(alt_indices), 0)

    # Finally make sure the variants carried by the examples are sane.
    self.verify_variants(
        [tf_utils.example_variant(record) for record in examples],
        region, options, is_gvcf=False)

    return examples
Example #9
0
  def verify_examples(self, examples_filename, region, options, verify_labels):
    """Runs structural sanity checks on the tf.Examples stored in a file.

    Args:
      examples_filename: path handed to tfrecord.read_tfrecords.
      region: forwarded unchanged to self.verify_variants.
      options: forwarded unchanged to self.verify_variants.
      verify_labels: when truthy, the 'label' feature is required as well.

    Returns:
      The list of examples read from examples_filename.
    """
    required_keys = [
        'variant/encoded', 'locus', 'image/format', 'image/encoded',
        'alt_allele_indices/encoded'
    ]
    if verify_labels:
      required_keys.append('label')

    examples = list(tfrecord.read_tfrecords(examples_filename))
    for record in examples:
      feature_map = record.features.feature
      for key in required_keys:
        self.assertIn(key, feature_map)
      # Every example must reference at least one alt allele.
      alt_indices = tf_utils.example_alt_alleles_indices(record)
      self.assertNotEmpty(alt_indices)

    # Finally make sure the variants carried by the examples are sane.
    self.verify_variants(
        [tf_utils.example_variant(record) for record in examples],
        region, options, is_gvcf=False)

    return examples
Example #10
0
  def test_create_pileup_examples(self):
    """create_pileup_examples yields one tf.Example per pileup image.

    The pileup image creator and `_encode_tensor` are mocked so that two
    (alt_alleles, tensor) pairs come back for a single candidate call; the
    test then checks each resulting example carries the matching alt alleles,
    variant, encoded image, shape and format.
    """
    self.processor.pic = mock.Mock()
    self.add_mock(
        '_encode_tensor',
        side_effect=[('tensor1', self.default_shape, self.default_format),
                     ('tensor2', self.default_shape, self.default_format)])
    dv_call = mock.Mock()
    dv_call.variant = test_utils.make_variant(start=10, alleles=['A', 'C', 'G'])
    alt1, alt2 = ['C'], ['G']
    expected_pairs = [(alt1, 'tensor1'), (alt2, 'tensor2')]
    # Hand the mock its own list so the expectations cannot be mutated through
    # the mocked return value.
    self.processor.pic.create_pileup_images.return_value = list(expected_pairs)

    actual = self.processor.create_pileup_examples(dv_call)

    self.processor.pic.create_pileup_images.assert_called_once_with(dv_call)

    # assertEqual, not the deprecated assertEquals alias (removed from unittest
    # in Python 3.12).
    self.assertEqual(len(actual), 2)
    for ex, (alt, img) in zip(actual, expected_pairs):
      self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
      self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
      self.assertEqual(tf_utils.example_encoded_image(ex), img)
      self.assertEqual(tf_utils.example_image_shape(ex), self.default_shape)
      self.assertEqual(tf_utils.example_image_format(ex), self.default_format)
Example #11
0
    def test_create_pileup_examples(self):
        """create_pileup_examples yields one tf.Example per pileup image.

        The pileup image creator (including its get_reads) and
        `_encode_tensor` are mocked so that two (alt_alleles, tensor) pairs
        come back for a single candidate call; the test then checks the
        arguments passed to create_pileup_images and that each resulting
        example carries the matching alt alleles, variant, encoded image,
        shape and format.
        """
        self.processor.pic = mock.Mock()
        self.processor.pic.get_reads.return_value = []
        tensor1, tensor2 = six.b('tensor1'), six.b('tensor2')
        self.add_mock('_encode_tensor',
                      side_effect=[(tensor1, self.default_shape,
                                    self.default_format),
                                   (tensor2, self.default_shape,
                                    self.default_format)])
        dv_call = mock.Mock()
        dv_call.variant = test_utils.make_variant(start=10,
                                                  alleles=['A', 'C', 'G'])
        alt1, alt2 = ['C'], ['G']
        expected_pairs = [(alt1, tensor1), (alt2, tensor2)]
        # Hand the mock its own list so the expectations cannot be mutated
        # through the mocked return value.
        self.processor.pic.create_pileup_images.return_value = list(
            expected_pairs)

        actual = self.processor.create_pileup_examples(dv_call)

        self.processor.pic.create_pileup_images.assert_called_once_with(
            dv_call=dv_call,
            reads_for_samples=[[]],
            haplotype_alignments_for_samples=None,
            haplotype_sequences=None,
            sample_order=None)

        self.assertLen(actual, 2)
        for ex, (alt, img) in zip(actual, expected_pairs):
            self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
            self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
            self.assertEqual(tf_utils.example_encoded_image(ex), img)
            self.assertEqual(tf_utils.example_image_shape(ex),
                             self.default_shape)
            self.assertEqual(tf_utils.example_image_format(ex),
                             six.b(self.default_format))
Example #12
0
  def test_make_examples_end2end(self, mode, num_shards,
                                 labeler_algorithm=None):
    """Runs make_examples end to end and checks its outputs against goldens.

    Flags are configured on the global FLAGS object, make_examples_runner is
    run once per task, and the candidates, examples and (calling mode only)
    gVCF records it wrote are then verified.

    Args:
      mode: 'calling' or 'training' (asserted below); stored in FLAGS.mode.
      num_shards: shard count passed to _sharded for every output path; one
        make_examples task is run per shard, and at least one task is run.
      labeler_algorithm: if not None, stored in FLAGS.labeler_algorithm.
    """
    # unittest: maxDiff = None removes the length cap on failure diffs.
    self.maxDiff = None
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode additionally writes a gVCF; training mode needs truth data.
    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Options are rebuilt from FLAGS for every task because FLAGS.task changes;
    # the `options` of the last task is reused by the verifications below.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      make_examples.make_examples_runner(options)

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        io_utils.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants(
        [call.variant for call in candidates], region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(io_utils.read_tfrecords(golden_file)))

    if mode == 'calling':
      # Compare the called example variants with the truth VCF for the region.
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # Order-insensitive comparison against the golden gVCF records.
      self.assertItemsEqual(gvcfs, expected_gvcfs)
    def test_make_examples_end2end(self, mode, num_shards):
        """Runs make_examples end to end and checks its outputs.

        Flags are configured on the global FLAGS object, make_examples_runner
        is run once per task, and the candidates, examples and (calling mode
        only) gVCF records it wrote are then verified.

        Args:
          mode: 'calling' or 'training' (asserted below); stored in FLAGS.mode.
          num_shards: shard count passed to _sharded for every output path;
            one make_examples task is run per shard, and at least one task is
            run.
        """
        self.assertIn(mode, {'calling', 'training'})
        region = ranges.parse_literal('chr20:10,000,000-10,010,000')
        FLAGS.ref = test_utils.CHR20_FASTA
        FLAGS.reads = test_utils.CHR20_BAM
        FLAGS.candidates = test_utils.test_tmpfile(
            _sharded('vsc.tfrecord', num_shards))
        FLAGS.examples = test_utils.test_tmpfile(
            _sharded('examples.tfrecord', num_shards))
        FLAGS.regions = [ranges.to_literal(region)]
        FLAGS.partition_size = 1000
        FLAGS.mode = mode

        # Calling mode additionally writes a gVCF; training mode needs truth
        # data.
        if mode == 'calling':
            FLAGS.gvcf = test_utils.test_tmpfile(
                _sharded('gvcf.tfrecord', num_shards))
        else:
            FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
            FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED

        # Options are rebuilt from FLAGS for every task because FLAGS.task
        # changes; the `options` of the last task is reused by the
        # verifications below.
        for task_id in range(max(num_shards, 1)):
            FLAGS.task = task_id
            options = make_examples.default_options(add_flags=True)
            make_examples.make_examples_runner(options)

        # Test that our candidates are reasonable, calling specific helper functions
        # to check lots of properties of the output.
        candidates = _sort_candidates(
            io_utils.read_tfrecords(FLAGS.candidates,
                                    proto=deepvariant_pb2.DeepVariantCall))
        self.verify_deepvariant_calls(candidates, options)
        self.verify_variants([call.variant for call in candidates],
                             region,
                             options,
                             is_gvcf=False)

        # Verify that the variants in the examples are all good.
        examples = self.verify_examples(FLAGS.examples,
                                        region,
                                        options,
                                        verify_labels=mode == 'training')
        example_variants = [tf_utils.example_variant(ex) for ex in examples]
        self.verify_variants(example_variants, region, options, is_gvcf=False)

        # Verify the integrity of the examples and then check that they match our
        # golden labeled examples. Note we expect the order for both training and
        # calling modes to produce deterministic order because we fix the random
        # seed.
        if mode == 'calling':
            golden_file = _sharded(test_utils.GOLDEN_CALLING_EXAMPLES,
                                   num_shards)
        else:
            golden_file = _sharded(test_utils.GOLDEN_TRAINING_EXAMPLES,
                                   num_shards)
        self.assertDeepVariantExamplesEqual(
            examples, list(io_utils.read_tfrecords(golden_file)))

        if mode == 'calling':
            # Compare the called example variants with the truth VCF for the
            # region.
            nist_reader = genomics_io.make_vcf_reader(
                test_utils.TRUTH_VARIANTS_VCF)
            nist_variants = list(nist_reader.query(region))
            self.verify_nist_concordance(example_variants, nist_variants)

            # Check the quality of our generated gvcf file.
            gvcfs = _sort_variants(
                io_utils.read_tfrecords(FLAGS.gvcf,
                                        proto=variants_pb2.Variant))
            self.verify_variants(gvcfs, region, options, is_gvcf=True)
            self.verify_contiguity(gvcfs, region)
Example #14
0
 def setUpClass(cls):
   """Loads the golden calling examples, their variants and a model once."""
   golden_records = io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES)
   cls.examples = list(golden_records)
   # One variant per example, in the same order as cls.examples.
   cls.variants = list(map(tf_utils.example_variant, cls.examples))
   cls.model = modeling.get_model('random_guess')
Example #15
0
 def example_matches_call_variants_output(example,
                                          call_variants_output):
     """Tells whether `example` and `call_variants_output` describe one call.

     They match when the example's variant equals the output's variant and
     the example's alt allele indices equal the output's
     alt_allele_indices.indices. The indices are only compared when the
     variants already match.
     """
     same_variant = (
         tf_utils.example_variant(example) == call_variants_output.variant)
     if not same_variant:
         return same_variant
     example_indices = tf_utils.example_alt_alleles_indices(example)
     return example_indices == call_variants_output.alt_allele_indices.indices
    def test_call_end2end(self, model, shard_inputs, include_debug_info):
        """Runs call_variants on the golden calling examples and checks output.

        Args:
          model: model handed to call_variants; only a model whose name is
            'random_guess' is run over every batch, all others run one batch.
          shard_inputs: if truthy, the golden examples are first rewritten to
            a 3-way sharded temp file which is then used as the input.
          include_debug_info: stored in FLAGS.include_debug_info; selects
            which debug_info expectations are checked.
        """
        FLAGS.include_debug_info = include_debug_info
        examples = list(
            io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

        if shard_inputs:
            # Create a sharded version of our golden examples.
            source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
            io_utils.write_tfrecords(examples, source_path)
        else:
            source_path = testdata.GOLDEN_CALLING_EXAMPLES

        batch_size = 4
        if model.name == 'random_guess':
            # For the random guess model we can run everything.
            max_batches = None
        else:
            # For all other models we only run a single batch for inference.
            max_batches = 1

        outfile = test_utils.test_tmpfile('call_variants.tfrecord')
        call_variants.call_variants(
            examples_filename=source_path,
            checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
            model=model,
            output_file=outfile,
            batch_size=batch_size,
            max_batches=max_batches)

        call_variants_outputs = list(
            io_utils.read_tfrecords(outfile,
                                    deepvariant_pb2.CallVariantsOutput))

        # Check that we have the right number of output protos.
        self.assertEqual(
            len(call_variants_outputs),
            batch_size * max_batches if max_batches else len(examples))

        # Check that our CallVariantsOutput (CVO) have the following critical
        # properties:
        # - we have one CVO for each example we processed.
        # - the variant in the CVO is exactly what was in the example.
        # - the alt_allele_indices of the CVO match those of its corresponding
        #   example.
        # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
        # We can only do this test when processing all of the variants (max_batches
        # is None), since we processed all of the examples with that model.
        if max_batches is None:
            self.assertItemsEqual(
                [cvo.variant for cvo in call_variants_outputs],
                [tf_utils.example_variant(ex) for ex in examples])

        # Check the CVO debug_info: not filled if include_debug_info is False;
        # else, filled by logic based on CVO.
        if not include_debug_info:
            for cvo in call_variants_outputs:
                self.assertEqual(
                    cvo.debug_info,
                    deepvariant_pb2.CallVariantsOutput.DebugInfo())
        else:
            for cvo in call_variants_outputs:
                self.assertEqual(cvo.debug_info.has_insertion,
                                 variant_utils.has_insertion(cvo.variant))
                self.assertEqual(cvo.debug_info.has_deletion,
                                 variant_utils.has_deletion(cvo.variant))
                self.assertEqual(cvo.debug_info.is_snp,
                                 variant_utils.is_snp(cvo.variant))
                self.assertEqual(cvo.debug_info.predicted_label,
                                 np.argmax(cvo.genotype_probabilities))

        def example_matches_call_variants_output(example,
                                                 call_variants_output):
            """True when variant and alt allele indices are both equal."""
            return (tf_utils.example_variant(example)
                    == call_variants_output.variant
                    and tf_utils.example_alt_alleles_indices(example)
                    == call_variants_output.alt_allele_indices.indices)

        for call_variants_output in call_variants_outputs:
            # Find all matching examples.
            matches = [
                ex for ex in examples if example_matches_call_variants_output(
                    ex, call_variants_output)
            ]
            # We should have exactly one match.
            self.assertEqual(len(matches), 1)
            example = matches[0]
            # Check that we've faithfully copied in the alt alleles (though currently
            # as implemented we find our example using this information so it cannot
            # fail). Included here in case that changes in the future.
            self.assertEqual(
                list(tf_utils.example_alt_alleles_indices(example)),
                list(call_variants_output.alt_allele_indices.indices))
            # We should have exactly three genotype probabilities (assuming our
            # ploidy == 2).
            self.assertEqual(len(call_variants_output.genotype_probabilities),
                             3)
            # These are probabilities so they should be between 0 and 1.
            # all() is required: a bare generator expression is always truthy,
            # so assertTrue(<generator>) could never fail.
            self.assertTrue(
                all(0 <= gp <= 1
                    for gp in call_variants_output.genotype_probabilities))
def _example_sort_key(example):
  """Sort key for a tf.Example: the range tuple of the variant it carries."""
  variant = tf_utils.example_variant(example)
  return variant_utils.variant_range_tuple(variant)
def _example_sort_key(example):
    """Returns the key used to order `example`.

    The key is whatever variant_utils.variant_range_tuple yields for the
    variant extracted from the example by tf_utils.example_variant.
    """
    variant = tf_utils.example_variant(example)
    return variant_utils.variant_range_tuple(variant)
Example #19
0
  def test_call_end2end(self, model, shard_inputs, include_debug_info):
    """Runs call_variants over the golden examples and checks its outputs.

    Args:
      model: Model passed to call_variants.call_variants. A model whose name is
        'random_guess' is run over every example; any other model is limited
        to a single batch.
      shard_inputs: If truthy, the golden examples are first rewritten to a
        temporary 'sharded@3' file spec and read from there; otherwise the
        golden examples file is read directly.
      include_debug_info: Value assigned to FLAGS.include_debug_info; also
        selects whether each output's debug_info is expected to be empty or
        populated.
    """
    FLAGS.include_debug_info = include_debug_info
    examples = list(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

    if shard_inputs:
      # Create a sharded version of our golden examples.
      source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
      io_utils.write_tfrecords(examples, source_path)
    else:
      source_path = testdata.GOLDEN_CALLING_EXAMPLES

    batch_size = 4
    if model.name == 'random_guess':
      # For the random guess model we can run everything.
      max_batches = None
    else:
      # For all other models we only run a single batch for inference.
      max_batches = 1

    outfile = test_utils.test_tmpfile('call_variants.tfrecord')
    call_variants.call_variants(
        examples_filename=source_path,
        checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
        model=model,
        output_file=outfile,
        batch_size=batch_size,
        max_batches=max_batches)

    call_variants_outputs = list(
        io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

    # Check that we have the right number of output protos: one per example
    # when everything was processed, otherwise one per example in the batches
    # that were run.
    expected_count = batch_size * max_batches if max_batches else len(examples)
    self.assertEqual(len(call_variants_outputs), expected_count)

    # Check that our CallVariantsOutput (CVO) have the following critical
    # properties:
    # - we have one CVO for each example we processed.
    # - the variant in the CVO is exactly what was in the example.
    # - the alt_allele_indices of the CVO match those of its corresponding
    #   example.
    # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
    # We can only do this test when processing all of the variants (max_batches
    # is None), since we processed all of the examples with that model.
    if max_batches is None:
      self.assertItemsEqual([cvo.variant for cvo in call_variants_outputs],
                            [tf_utils.example_variant(ex) for ex in examples])

    # Check the CVO debug_info: not filled if include_debug_info is False;
    # else, filled by logic based on CVO.
    if not include_debug_info:
      for cvo in call_variants_outputs:
        self.assertEqual(cvo.debug_info,
                         deepvariant_pb2.CallVariantsOutput.DebugInfo())
    else:
      for cvo in call_variants_outputs:
        self.assertEqual(cvo.debug_info.has_insertion,
                         variant_utils.has_insertion(cvo.variant))
        self.assertEqual(cvo.debug_info.has_deletion,
                         variant_utils.has_deletion(cvo.variant))
        self.assertEqual(cvo.debug_info.is_snp, variant_utils.is_snp(
            cvo.variant))
        self.assertEqual(cvo.debug_info.predicted_label,
                         np.argmax(cvo.genotype_probabilities))

    def example_matches_call_variants_output(example, call_variants_output):
      return (tf_utils.example_variant(example) == call_variants_output.variant
              and tf_utils.example_alt_alleles_indices(
                  example) == call_variants_output.alt_allele_indices.indices)

    for call_variants_output in call_variants_outputs:
      # Find all matching examples.
      matches = [
          ex for ex in examples
          if example_matches_call_variants_output(ex, call_variants_output)
      ]
      # We should have exactly one match.
      self.assertEqual(len(matches), 1)
      example = matches[0]
      # Check that we've faithfully copied in the alt alleles (though currently
      # as implemented we find our example using this information so it cannot
      # fail). Included here in case that changes in the future.
      self.assertEqual(
          list(tf_utils.example_alt_alleles_indices(example)),
          list(call_variants_output.alt_allele_indices.indices))
      # We should have exactly three genotype probabilities (assuming our
      # ploidy == 2).
      self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
      # These are probabilities so they should be between 0 and 1. The
      # generator must be reduced with all(): a bare generator object is always
      # truthy, which would make this assertion pass unconditionally.
      self.assertTrue(
          all(0 <= gp <= 1
              for gp in call_variants_output.genotype_probabilities))
Example #20
0
    def test_call_end2end(self, model, shard_inputs, include_debug_info):
        """Runs call_variants on the golden examples and checks its outputs.

        The inference run itself is delegated to self._call_end2end_helper,
        whose result is unpacked as (call_variants_outputs, examples,
        batch_size, max_batches).

        Args:
          model: Model forwarded to the helper.
          shard_inputs: Forwarded to the helper.
          include_debug_info: Value assigned to FLAGS.include_debug_info; also
            selects whether each output's debug_info is expected to be empty
            or populated.
        """
        FLAGS.include_debug_info = include_debug_info
        (call_variants_outputs, examples, batch_size,
         max_batches) = self._call_end2end_helper(
             testdata.GOLDEN_CALLING_EXAMPLES, model, shard_inputs)
        # Check that we have the right number of output protos: one per
        # example when everything was processed, otherwise one per example in
        # the batches that were run.
        expected_count = (
            batch_size * max_batches if max_batches else len(examples))
        self.assertEqual(len(call_variants_outputs), expected_count)

        # Check that our CallVariantsOutput (CVO) have the following critical
        # properties:
        # - we have one CVO for each example we processed.
        # - the variant in the CVO is exactly what was in the example.
        # - the alt_allele_indices of the CVO match those of its corresponding
        #   example.
        # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
        # We can only do this test when processing all of the variants (max_batches
        # is None), since we processed all of the examples with that model.
        if max_batches is None:
            self.assertItemsEqual(
                [cvo.variant for cvo in call_variants_outputs],
                [tf_utils.example_variant(ex) for ex in examples])

        # Check the CVO debug_info: not filled if include_debug_info is False;
        # else, filled by logic based on CVO.
        if not include_debug_info:
            for cvo in call_variants_outputs:
                self.assertEqual(
                    cvo.debug_info,
                    deepvariant_pb2.CallVariantsOutput.DebugInfo())
        else:
            for cvo in call_variants_outputs:
                self.assertEqual(cvo.debug_info.has_insertion,
                                 variant_utils.has_insertion(cvo.variant))
                self.assertEqual(cvo.debug_info.has_deletion,
                                 variant_utils.has_deletion(cvo.variant))
                self.assertEqual(cvo.debug_info.is_snp,
                                 variant_utils.is_snp(cvo.variant))
                self.assertEqual(cvo.debug_info.predicted_label,
                                 np.argmax(cvo.genotype_probabilities))

        def example_matches_call_variants_output(example,
                                                 call_variants_output):
            return (tf_utils.example_variant(example)
                    == call_variants_output.variant
                    and tf_utils.example_alt_alleles_indices(example)
                    == call_variants_output.alt_allele_indices.indices)

        for call_variants_output in call_variants_outputs:
            # Find all matching examples.
            matches = [
                ex for ex in examples if example_matches_call_variants_output(
                    ex, call_variants_output)
            ]
            # We should have exactly one match.
            self.assertEqual(len(matches), 1)
            example = matches[0]
            # Check that we've faithfully copied in the alt alleles (though currently
            # as implemented we find our example using this information so it cannot
            # fail). Included here in case that changes in the future.
            self.assertEqual(
                list(tf_utils.example_alt_alleles_indices(example)),
                list(call_variants_output.alt_allele_indices.indices))
            # We should have exactly three genotype probabilities (assuming our
            # ploidy == 2).
            self.assertEqual(len(call_variants_output.genotype_probabilities),
                             3)
            # These are probabilities so they should be between 0 and 1. The
            # generator must be reduced with all(): a bare generator object is
            # always truthy, which would make this assertion pass
            # unconditionally.
            self.assertTrue(
                all(0 <= gp <= 1
                    for gp in call_variants_output.genotype_probabilities))
Example #21
0
 def example_matches_call_variants_output(example, call_variants_output):
   """Tells whether `call_variants_output` was produced from `example`.

   The two match when the example's variant equals the output's variant and
   the example's alt allele indices equal the output's
   alt_allele_indices.indices. The indices are only compared when the variants
   already agree.
   """
   same_variant = (
       tf_utils.example_variant(example) == call_variants_output.variant)
   if not same_variant:
     # Hand back the comparison result itself, exactly as `and` would.
     return same_variant
   expected_indices = call_variants_output.alt_allele_indices.indices
   return tf_utils.example_alt_alleles_indices(example) == expected_indices
Example #22
0
 def setUpClass(cls):
     """Caches the golden calling examples, their variants and a model on cls.

     Sets cls.examples to the records read from
     testdata.GOLDEN_CALLING_EXAMPLES, cls.variants to the variant extracted
     from each of those examples (same order), and cls.model to the
     'random_guess' model.
     """
     golden_examples = list(
         io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))
     cls.examples = golden_examples
     cls.variants = list(map(tf_utils.example_variant, golden_examples))
     cls.model = modeling.get_model('random_guess')
Example #23
0
  def test_make_examples_end2end(self,
                                 mode,
                                 num_shards,
                                 test_condition=TestConditions.USE_BAM,
                                 labeler_algorithm=None,
                                 use_fast_pass_aligner=True):
    """Runs make_examples on a chr20 region and checks everything it writes.

    The run is configured through global FLAGS, executed once per shard, and
    the resulting candidates, examples and (in calling mode) gVCF records are
    verified structurally and compared against golden files.

    Args:
      mode: 'calling' or 'training' (asserted). Calling mode also writes and
        checks gVCF output; training mode supplies truth variants and
        confident regions.
      num_shards: Shard count used to build the sharded output and golden file
        names. One task is run per shard, with a minimum of one task.
      test_condition: TestConditions value selecting the reads input: a BAM, a
        CRAM, or two comma-joined BAM halves.
      labeler_algorithm: If not None, assigned to FLAGS.labeler_algorithm.
      use_fast_pass_aligner: Assigned to FLAGS.use_fast_pass_aligner.
    """
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    # NOTE(review): FLAGS.reads is left untouched when test_condition is none
    # of the three values handled here.
    if test_condition == TestConditions.USE_BAM:
      FLAGS.reads = testdata.CHR20_BAM
    elif test_condition == TestConditions.USE_CRAM:
      FLAGS.reads = testdata.CHR20_CRAM
    elif test_condition == TestConditions.USE_MULTI_BAMS:
      FLAGS.reads = ','.join(
          [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

    # Outputs go to (possibly sharded) temporary files.
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Run one make_examples task per shard; max(..., 1) keeps a single task
    # when num_shards is 0. `options` and `run_info` from the last iteration
    # are reused by the checks after the loop.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      # We need to overwrite bam_fname for USE_CRAM test since Golden Set
      # generated from BAM file. BAM filename is stored in candidates. If we
      # don't overwrite default_options variants won't match and test fail.
      options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
      make_examples_core.make_examples_runner(options)

      # Check that our run_info proto contains the basic fields we'd expect:
      # (a) our options are written to the run_info.options field.
      run_info = make_examples_core.read_make_examples_run_info(
          options.run_info_filename)
      self.assertEqual(run_info.options, options)
      # (b) run_info.resource_metrics is present and contains our hostname.
      self.assertTrue(run_info.HasField('resource_metrics'))
      self.assertEqual(run_info.resource_metrics.host_name, platform.node())

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        tfrecord.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants([call.variant for call in candidates],
                         region,
                         options,
                         is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(tfrecord.read_tfrecords(golden_file)))

    if mode == 'calling':
      # Compare the example variants against the truth VCF records that fall
      # in the tested region.
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # Despite the name, assertCountEqual checks that all elements match.
      self.assertCountEqual(gvcfs, expected_gvcfs)

    # Labeling metrics are only compared for the unsharded training run.
    if (mode == 'training' and num_shards == 0 and
        labeler_algorithm != 'positional_labeler'):
      # The positional labeler doesn't track metrics, so don't try to read them
      # in when that's the mode.
      self.assertEqual(
          make_examples_core.read_make_examples_run_info(
              testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
          run_info.labeling_metrics)