def verify_examples(self, examples_filename, region, options, verify_labels):
  """Structurally validates the tf.Examples stored in examples_filename.

  Args:
    examples_filename: path handed to io_utils.read_tfrecords.
    region: forwarded unchanged to self.verify_variants.
    options: forwarded unchanged to self.verify_variants.
    verify_labels: if truthy, the label features must also be present and
      the variant and truth variant of each example must share a position.

  Returns:
    The list of examples that were read from examples_filename.
  """
  required_keys = [
      'variant/encoded', 'locus', 'image/format', 'image/encoded',
      'alt_allele_indices/encoded'
  ]
  if verify_labels:
    required_keys.extend(['label', 'truth_variant/encoded'])

  records = list(io_utils.read_tfrecords(examples_filename))
  for record in records:
    for key in required_keys:
      self.assertIn(key, record.features.feature)

    # Every example has to reference at least one alt allele.
    # pylint: disable=g-explicit-length-test
    alt_indices = tf_utils.example_alt_alleles_indices(record)
    self.assertGreater(len(alt_indices), 0)

    if not verify_labels:
      continue
    # The candidate variant and its truth variant must start at the same place.
    candidate_position = variantutils.variant_position(
        tf_utils.example_variant(record))
    truth_position = variantutils.variant_position(
        tf_utils.example_truth_variant(record))
    self.assertEqual(candidate_position, truth_position)

  # Finally run the generic variant checks over every example's variant.
  self.verify_variants(
      [tf_utils.example_variant(record) for record in records],
      region,
      options,
      is_gvcf=False)
  return records
def examples_to_variants(examples_path, max_records=None):
  """Yields one Variant proto per distinct range found in examples_path.

  The tf.Examples are read from examples_path, their variants are sorted by
  variant_utils.variant_range_tuple, and for every run of variants sharing
  the same range tuple only the first (a representative of the different
  alt_allele versions) is yielded.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Maximum number of records to read, or None, to
      read all of the records.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if a representative Variant doesn't have genotypes.
  """
  range_key = variant_utils.variant_range_tuple
  records = io_utils.read_tfrecords(examples_path, max_records=max_records)
  ordered = [tf_utils.example_variant(record) for record in records]
  ordered.sort(key=range_key)

  for _, same_range in itertools.groupby(ordered, range_key):
    # groupby never produces an empty group, so next() always succeeds.
    representative = next(same_range)
    only_call = variant_utils.only_call(representative)
    if not variantcall_utils.has_genotypes(only_call):
      message = (
          'Variant {} does not have any genotypes. This tool only works with '
          'variants that have been labeled.')
      raise ValueError(
          message.format(variant_utils.variant_key(representative)))
    yield representative
def test_create_pileup_examples(self):
  """Checks create_pileup_examples against a mocked pileup image creator.

  self.processor.pic is replaced by a Mock whose create_pileup_images returns
  two (alt_alleles, tensor) pairs, and '_encode_tensor' is registered through
  self.add_mock with two canned return tuples. The test then asserts that two
  examples come back and that each one carries the expected alt alleles,
  variant, encoded image, image shape and image format.
  """
  self.processor.pic = mock.Mock()
  self.add_mock(
      '_encode_tensor',
      side_effect=[('tensor1', self.default_shape, self.default_format),
                   ('tensor2', self.default_shape, self.default_format)])
  dv_call = mock.Mock()
  dv_call.variant = test_utils.make_variant(start=10, alleles=['A', 'C', 'G'])
  alt1, alt2 = ['C'], ['G']
  expected = [(alt1, 'tensor1'), (alt2, 'tensor2')]
  # Hand the mock its own copy so the expectations cannot be mutated by the
  # code under test.
  self.processor.pic.create_pileup_images.return_value = list(expected)

  actual = self.processor.create_pileup_examples(dv_call)

  self.processor.pic.create_pileup_images.assert_called_once_with(dv_call)

  # assertEqual rather than the deprecated assertEquals alias, which newer
  # Python versions no longer provide.
  self.assertEqual(len(actual), 2)
  for ex, (alt, img) in zip(actual, expected):
    self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
    self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
    self.assertEqual(tf_utils.example_encoded_image(ex), img)
    self.assertEqual(tf_utils.example_image_shape(ex), self.default_shape)
    self.assertEqual(tf_utils.example_image_format(ex), self.default_format)
def test_add_label_to_example(self, label, expected_label_value):
  """Checks that add_label_to_example labels an example in place.

  Args:
    label: label object providing `variant` and `genotype`; its variant is
      used to build the example and is compared with the labeled variant.
    expected_label_value: value expected from tf_utils.example_label on the
      labeled example.
  """
  example = self._example_for_variant(label.variant)
  labeled = copy.deepcopy(example)
  actual = self.processor.add_label_to_example(labeled, label)

  # The add_label_to_example command modifies labeled and returns it.
  self.assertIs(actual, labeled)

  # Check that all keys from example are present in labeled. items() is used
  # instead of iteritems() because the latter only exists on Python 2.
  for key, value in example.features.feature.items():
    if key != 'variant/encoded':  # Special case tested below.
      self.assertEqual(value, labeled.features.feature[key])

  # The genotype of our example_variant should be set to the true genotype
  # according to our label.
  self.assertEqual(expected_label_value, tf_utils.example_label(labeled))
  labeled_variant = tf_utils.example_variant(labeled)
  call = variant_utils.only_call(labeled_variant)
  self.assertEqual(tuple(call.genotype), label.genotype)

  # The original variant and labeled_variant from our tf.Example should be
  # equal except for the genotype field, since this is set by
  # add_label_to_example. Clear both genotypes before comparing.
  label.variant.calls[0].genotype[:] = []
  call.genotype[:] = []
  self.assertEqual(label.variant, labeled_variant)
def test_add_label_to_example(self, label, expected_label_value):
  """Verifies the in-place labeling done by add_label_to_example.

  Args:
    label: label object providing `variant` and `genotype`.
    expected_label_value: value expected from tf_utils.example_label on the
      labeled example.
  """
  unlabeled = self._example_for_variant(label.variant)
  labeled = copy.deepcopy(unlabeled)
  returned = self.processor.add_label_to_example(labeled, label)

  # Labeling mutates its argument and hands the very same object back.
  self.assertIs(returned, labeled)

  # Every feature except the encoded variant must survive untouched; the
  # encoded variant is examined separately further down.
  for name, feature in unlabeled.features.feature.items():
    if name == 'variant/encoded':
      continue
    self.assertEqual(feature, labeled.features.feature[name])

  # The label value and the genotype of the single call must follow the label.
  self.assertEqual(expected_label_value, tf_utils.example_label(labeled))
  variant_after = tf_utils.example_variant(labeled)
  only_call = variant_utils.only_call(variant_after)
  self.assertEqual(tuple(only_call.genotype), label.genotype)

  # With the genotypes cleared on both sides, the labeled variant has to be
  # identical to the variant the label was created for.
  label.variant.calls[0].genotype[:] = []
  only_call.genotype[:] = []
  self.assertEqual(label.variant, variant_after)
def examples_to_variants(examples_path, max_records=None):
  """Yields one Variant proto per distinct range found in examples_path.

  The tf.Examples are read from examples_path, their variants are sorted by
  variant_utils.variant_range_tuple, and for every run of variants sharing
  the same range tuple only the first (a representative of the different
  alt_allele versions) is yielded.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Maximum number of records to read, or None, to
      read all of the records.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if a representative Variant doesn't have genotypes.
  """
  range_key = variant_utils.variant_range_tuple
  records = io_utils.read_tfrecords(examples_path, max_records=max_records)
  ordered = [tf_utils.example_variant(record) for record in records]
  ordered.sort(key=range_key)

  for _, same_range in itertools.groupby(ordered, range_key):
    # groupby never produces an empty group, so next() always succeeds.
    representative = next(same_range)
    only_call = variant_utils.only_call(representative)
    if not variantcall_utils.has_genotypes(only_call):
      message = (
          'Variant {} does not have any genotypes. This tool only works with '
          'variants that have been labeled.')
      raise ValueError(
          message.format(variant_utils.variant_key(representative)))
    yield representative
def testMakeExample(self):
  """make_example output must expose the inputs through the tf_utils getters."""
  built = tf_utils.make_example(
      self.variant,
      self.alts,
      self.encoded_image,
      self.default_shape,
      self.default_format)

  # Image payload and its declared format.
  self.assertEqual(self.encoded_image, tf_utils.example_encoded_image(built))
  format_feature = built.features.feature['image/format']
  self.assertEqual('raw', format_feature.bytes_list.value[0])

  # Variant and the fields derived from it.
  self.assertEqual(self.variant, tf_utils.example_variant(built))
  self.assertEqual('1:11-11', tf_utils.example_locus(built))
  self.assertEqual([0], tf_utils.example_alt_alleles_indices(built))
  self.assertEqual('1:11:C->A', tf_utils.example_key(built))
def verify_examples(self, examples_filename, region, options, verify_labels):
  """Structurally validates the tf.Examples stored in examples_filename.

  Args:
    examples_filename: path handed to io_utils.read_tfrecords.
    region: forwarded unchanged to self.verify_variants.
    options: forwarded unchanged to self.verify_variants.
    verify_labels: if truthy, each example must also have a 'label' feature.

  Returns:
    The list of examples that were read from examples_filename.
  """
  required_keys = [
      'variant/encoded', 'locus', 'image/format', 'image/encoded',
      'alt_allele_indices/encoded'
  ]
  if verify_labels:
    required_keys.append('label')

  records = list(io_utils.read_tfrecords(examples_filename))
  for record in records:
    for key in required_keys:
      self.assertIn(key, record.features.feature)
    # Every example has to reference at least one alt allele.
    # pylint: disable=g-explicit-length-test
    alt_indices = tf_utils.example_alt_alleles_indices(record)
    self.assertGreater(len(alt_indices), 0)

  # Finally run the generic variant checks over every example's variant.
  self.verify_variants(
      [tf_utils.example_variant(record) for record in records],
      region,
      options,
      is_gvcf=False)
  return records
def verify_examples(self, examples_filename, region, options, verify_labels):
  """Structurally validates the tf.Examples stored in examples_filename.

  Args:
    examples_filename: path handed to tfrecord.read_tfrecords.
    region: forwarded unchanged to self.verify_variants.
    options: forwarded unchanged to self.verify_variants.
    verify_labels: if truthy, each example must also have a 'label' feature.

  Returns:
    The list of examples that were read from examples_filename.
  """
  required_keys = [
      'variant/encoded', 'locus', 'image/format', 'image/encoded',
      'alt_allele_indices/encoded'
  ]
  if verify_labels:
    required_keys.append('label')

  records = list(tfrecord.read_tfrecords(examples_filename))
  for record in records:
    for key in required_keys:
      self.assertIn(key, record.features.feature)
    # Every example has to reference at least one alt allele.
    self.assertNotEmpty(tf_utils.example_alt_alleles_indices(record))

  # Finally run the generic variant checks over every example's variant.
  self.verify_variants(
      [tf_utils.example_variant(record) for record in records],
      region,
      options,
      is_gvcf=False)
  return records
def test_create_pileup_examples(self):
  """Checks create_pileup_examples against a mocked pileup image creator.

  self.processor.pic is replaced by a Mock whose create_pileup_images returns
  two (alt_alleles, tensor) pairs, and '_encode_tensor' is registered through
  self.add_mock with two canned return tuples. The test then asserts that two
  examples come back and that each one carries the expected alt alleles,
  variant, encoded image, image shape and image format.
  """
  self.processor.pic = mock.Mock()
  self.add_mock(
      '_encode_tensor',
      side_effect=[('tensor1', self.default_shape, self.default_format),
                   ('tensor2', self.default_shape, self.default_format)])
  dv_call = mock.Mock()
  dv_call.variant = test_utils.make_variant(start=10, alleles=['A', 'C', 'G'])
  alt1, alt2 = ['C'], ['G']
  expected = [(alt1, 'tensor1'), (alt2, 'tensor2')]
  # Hand the mock its own copy so the expectations cannot be mutated by the
  # code under test.
  self.processor.pic.create_pileup_images.return_value = list(expected)

  actual = self.processor.create_pileup_examples(dv_call)

  self.processor.pic.create_pileup_images.assert_called_once_with(dv_call)

  # assertEqual rather than the deprecated assertEquals alias, which newer
  # Python versions no longer provide.
  self.assertEqual(len(actual), 2)
  for ex, (alt, img) in zip(actual, expected):
    self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
    self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
    self.assertEqual(tf_utils.example_encoded_image(ex), img)
    self.assertEqual(tf_utils.example_image_shape(ex), self.default_shape)
    self.assertEqual(tf_utils.example_image_format(ex), self.default_format)
def test_create_pileup_examples(self):
  """Checks create_pileup_examples against a mocked pileup image creator.

  self.processor.pic is replaced by a Mock: get_reads returns an empty list
  and create_pileup_images returns two (alt_alleles, encoded tensor) pairs.
  '_encode_tensor' is registered through self.add_mock with two canned return
  tuples. The test asserts the keyword arguments create_pileup_images was
  called with, that two examples come back, and that each one carries the
  expected alt alleles, variant, encoded image, image shape and image format.
  """
  self.processor.pic = mock.Mock()
  self.processor.pic.get_reads.return_value = []
  self.add_mock(
      '_encode_tensor',
      side_effect=[
          (six.b('tensor1'), self.default_shape, self.default_format),
          (six.b('tensor2'), self.default_shape, self.default_format),
      ])
  dv_call = mock.Mock()
  dv_call.variant = test_utils.make_variant(start=10, alleles=['A', 'C', 'G'])
  alt1, alt2 = ['C'], ['G']
  # Single source of truth for the expected (alt alleles, image) pairs.
  expected = [(alt1, six.b('tensor1')), (alt2, six.b('tensor2'))]
  # Hand the mock its own copy so the expectations cannot be mutated by the
  # code under test.
  self.processor.pic.create_pileup_images.return_value = list(expected)

  actual = self.processor.create_pileup_examples(dv_call)

  self.processor.pic.create_pileup_images.assert_called_once_with(
      dv_call=dv_call,
      reads_for_samples=[[]],
      haplotype_alignments_for_samples=None,
      haplotype_sequences=None,
      sample_order=None)

  self.assertLen(actual, 2)
  for ex, (alt, img) in zip(actual, expected):
    self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
    self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
    self.assertEqual(tf_utils.example_encoded_image(ex), img)
    self.assertEqual(tf_utils.example_image_shape(ex), self.default_shape)
    self.assertEqual(
        tf_utils.example_image_format(ex), six.b(self.default_format))
def test_make_examples_end2end(self, mode, num_shards, labeler_algorithm=None):
  """Runs make_examples over a small chr20 region and checks all outputs.

  Args:
    mode: 'calling' or 'training'; assigned to FLAGS.mode and used to pick
      the mode-specific flags, golden files and checks.
    num_shards: number of shards used for the output file specs; the runner
      is invoked once per task in range(max(num_shards, 1)).
    labeler_algorithm: if not None, assigned to FLAGS.labeler_algorithm.
  """
  # Lift unittest's diff truncation so failures show the full difference.
  self.maxDiff = None
  self.assertIn(mode, {'calling', 'training'})
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  # Calling mode writes a gVCF; training mode instead needs truth data.
  if mode == 'calling':
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  # Run once per task. Note that `options` from the last task is what the
  # checks below use.
  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    make_examples.make_examples_runner(options)

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = sorted(
      io_utils.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  self.verify_variants(
      [call.variant for call in candidates], region, options, is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if mode == 'calling':
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  self.assertDeepVariantExamplesEqual(
      examples, list(io_utils.read_tfrecords(golden_file)))

  if mode == 'calling':
    # Compare the example variants with the truth variants in the region.
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = variant_utils.sorted_variants(
        io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
    gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                num_shards)
    expected_gvcfs = list(
        io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
    self.assertItemsEqual(gvcfs, expected_gvcfs)
def test_make_examples_end2end(self, mode, num_shards):
  """Runs make_examples over a small chr20 region and checks all outputs.

  Args:
    mode: 'calling' or 'training'; assigned to FLAGS.mode and used to pick
      the mode-specific flags, golden files and checks.
    num_shards: number of shards used for the output file specs; the runner
      is invoked once per task in range(max(num_shards, 1)).
  """
  self.assertIn(mode, {'calling', 'training'})
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = test_utils.CHR20_FASTA
  FLAGS.reads = test_utils.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode

  # Calling mode writes a gVCF; training mode instead needs truth data.
  if mode == 'calling':
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED

  # Run once per task. Note that `options` from the last task is what the
  # checks below use.
  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    make_examples.make_examples_runner(options)

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = _sort_candidates(
      io_utils.read_tfrecords(FLAGS.candidates,
                              proto=deepvariant_pb2.DeepVariantCall))
  self.verify_deepvariant_calls(candidates, options)
  self.verify_variants([call.variant for call in candidates],
                       region,
                       options,
                       is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(FLAGS.examples,
                                  region,
                                  options,
                                  verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if mode == 'calling':
    golden_file = _sharded(test_utils.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(test_utils.GOLDEN_TRAINING_EXAMPLES, num_shards)
  self.assertDeepVariantExamplesEqual(
      examples, list(io_utils.read_tfrecords(golden_file)))

  if mode == 'calling':
    # Compare the example variants with the truth variants in the region.
    nist_reader = genomics_io.make_vcf_reader(
        test_utils.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = _sort_variants(
        io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
def setUpClass(cls):
  """Loads the golden calling examples, their variants and a test model."""
  golden_records = io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES)
  cls.examples = list(golden_records)
  cls.variants = list(map(tf_utils.example_variant, cls.examples))
  cls.model = modeling.get_model('random_guess')
def example_matches_call_variants_output(example, call_variants_output):
  """Tells whether example and call_variants_output describe the same call.

  Both the variant and the alt allele indices of the example have to equal
  the corresponding fields of call_variants_output.
  """
  same_variant = (
      tf_utils.example_variant(example) == call_variants_output.variant)
  if not same_variant:
    # Short-circuit exactly like `and` would: the indices are not inspected.
    return same_variant
  example_indices = tf_utils.example_alt_alleles_indices(example)
  return example_indices == call_variants_output.alt_allele_indices.indices
def test_call_end2end(self, model, shard_inputs, include_debug_info):
  """Runs call_variants on the golden calling examples and checks its output.

  Args:
    model: model object handed to call_variants.call_variants; its `name`
      decides whether every example or only a single batch is evaluated.
    shard_inputs: if truthy, the golden examples are first rewritten to a
      sharded temporary file which is then used as the input.
    include_debug_info: value assigned to FLAGS.include_debug_info; it also
      selects which debug_info expectations are checked.
  """
  FLAGS.include_debug_info = include_debug_info
  examples = list(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

  if shard_inputs:
    # Create a sharded version of our golden examples.
    source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
    io_utils.write_tfrecords(examples, source_path)
  else:
    source_path = testdata.GOLDEN_CALLING_EXAMPLES

  batch_size = 4
  if model.name == 'random_guess':
    # For the random guess model we can run everything.
    max_batches = None
  else:
    # For all other models we only run a single batch for inference.
    max_batches = 1

  outfile = test_utils.test_tmpfile('call_variants.tfrecord')
  call_variants.call_variants(
      examples_filename=source_path,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=model,
      output_file=outfile,
      batch_size=batch_size,
      max_batches=max_batches)

  call_variants_outputs = list(
      io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

  # Check that we have the right number of output protos.
  expected_count = batch_size * max_batches if max_batches else len(examples)
  self.assertEqual(len(call_variants_outputs), expected_count)

  # Check that our CallVariantsOutput (CVO) have the following critical
  # properties:
  # - we have one CVO for each example we processed.
  # - the variant in the CVO is exactly what was in the example.
  # - the alt_allele_indices of the CVO match those of its corresponding
  #   example.
  # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
  # We can only do this test when processing all of the variants (max_batches
  # is None), since we processed all of the examples with that model.
  if max_batches is None:
    self.assertItemsEqual(
        [cvo.variant for cvo in call_variants_outputs],
        [tf_utils.example_variant(ex) for ex in examples])

  # Check the CVO debug_info: not filled if include_debug_info is False;
  # else, filled by logic based on CVO.
  if not include_debug_info:
    for cvo in call_variants_outputs:
      self.assertEqual(
          cvo.debug_info, deepvariant_pb2.CallVariantsOutput.DebugInfo())
  else:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info.has_insertion,
                       variant_utils.has_insertion(cvo.variant))
      self.assertEqual(cvo.debug_info.has_deletion,
                       variant_utils.has_deletion(cvo.variant))
      self.assertEqual(cvo.debug_info.is_snp,
                       variant_utils.is_snp(cvo.variant))
      self.assertEqual(cvo.debug_info.predicted_label,
                       np.argmax(cvo.genotype_probabilities))

  def example_matches_call_variants_output(example, call_variants_output):
    return (tf_utils.example_variant(example) == call_variants_output.variant
            and tf_utils.example_alt_alleles_indices(example) ==
            call_variants_output.alt_allele_indices.indices)

  for call_variants_output in call_variants_outputs:
    # Find all matching examples.
    matches = [
        ex for ex in examples
        if example_matches_call_variants_output(ex, call_variants_output)
    ]
    # We should have exactly one match.
    self.assertEqual(len(matches), 1)
    example = matches[0]
    # Check that we've faithfully copied in the alt alleles (though currently
    # as implemented we find our example using this information so it cannot
    # fail). Included here in case that changes in the future.
    self.assertEqual(
        list(tf_utils.example_alt_alleles_indices(example)),
        list(call_variants_output.alt_allele_indices.indices))
    # We should have exactly three genotype probabilities (assuming our
    # ploidy == 2).
    self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
    # These are probabilities so they should be between 0 and 1. all() is
    # required: a bare generator expression is always truthy, which would make
    # assertTrue pass no matter what the values are.
    self.assertTrue(
        all(0 <= gp <= 1
            for gp in call_variants_output.genotype_probabilities))
def _example_sort_key(example):
  """Returns the variant range tuple of the variant stored in example."""
  variant = tf_utils.example_variant(example)
  return variant_utils.variant_range_tuple(variant)
def test_call_end2end(self, model, shard_inputs, include_debug_info):
  """Runs call_variants on the golden calling examples and checks its output.

  Args:
    model: model object handed to call_variants.call_variants; its `name`
      decides whether every example or only a single batch is evaluated.
    shard_inputs: if truthy, the golden examples are first rewritten to a
      sharded temporary file which is then used as the input.
    include_debug_info: value assigned to FLAGS.include_debug_info; it also
      selects which debug_info expectations are checked.
  """
  FLAGS.include_debug_info = include_debug_info
  examples = list(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

  if shard_inputs:
    # Create a sharded version of our golden examples.
    source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
    io_utils.write_tfrecords(examples, source_path)
  else:
    source_path = testdata.GOLDEN_CALLING_EXAMPLES

  batch_size = 4
  if model.name == 'random_guess':
    # For the random guess model we can run everything.
    max_batches = None
  else:
    # For all other models we only run a single batch for inference.
    max_batches = 1

  outfile = test_utils.test_tmpfile('call_variants.tfrecord')
  call_variants.call_variants(
      examples_filename=source_path,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=model,
      output_file=outfile,
      batch_size=batch_size,
      max_batches=max_batches)

  call_variants_outputs = list(
      io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

  # Check that we have the right number of output protos.
  expected_count = batch_size * max_batches if max_batches else len(examples)
  self.assertEqual(len(call_variants_outputs), expected_count)

  # Check that our CallVariantsOutput (CVO) have the following critical
  # properties:
  # - we have one CVO for each example we processed.
  # - the variant in the CVO is exactly what was in the example.
  # - the alt_allele_indices of the CVO match those of its corresponding
  #   example.
  # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
  # We can only do this test when processing all of the variants (max_batches
  # is None), since we processed all of the examples with that model.
  if max_batches is None:
    self.assertItemsEqual([cvo.variant for cvo in call_variants_outputs],
                          [tf_utils.example_variant(ex) for ex in examples])

  # Check the CVO debug_info: not filled if include_debug_info is False;
  # else, filled by logic based on CVO.
  if not include_debug_info:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info,
                       deepvariant_pb2.CallVariantsOutput.DebugInfo())
  else:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info.has_insertion,
                       variant_utils.has_insertion(cvo.variant))
      self.assertEqual(cvo.debug_info.has_deletion,
                       variant_utils.has_deletion(cvo.variant))
      self.assertEqual(cvo.debug_info.is_snp,
                       variant_utils.is_snp(cvo.variant))
      self.assertEqual(cvo.debug_info.predicted_label,
                       np.argmax(cvo.genotype_probabilities))

  def example_matches_call_variants_output(example, call_variants_output):
    return (tf_utils.example_variant(example) == call_variants_output.variant
            and tf_utils.example_alt_alleles_indices(example) ==
            call_variants_output.alt_allele_indices.indices)

  for call_variants_output in call_variants_outputs:
    # Find all matching examples.
    matches = [
        ex for ex in examples
        if example_matches_call_variants_output(ex, call_variants_output)
    ]
    # We should have exactly one match.
    self.assertEqual(len(matches), 1)
    example = matches[0]
    # Check that we've faithfully copied in the alt alleles (though currently
    # as implemented we find our example using this information so it cannot
    # fail). Included here in case that changes in the future.
    self.assertEqual(
        list(tf_utils.example_alt_alleles_indices(example)),
        list(call_variants_output.alt_allele_indices.indices))
    # We should have exactly three genotype probabilities (assuming our
    # ploidy == 2).
    self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
    # These are probabilities so they should be between 0 and 1. all() is
    # required: a bare generator expression is always truthy, which would make
    # assertTrue pass no matter what the values are.
    self.assertTrue(
        all(0 <= gp <= 1
            for gp in call_variants_output.genotype_probabilities))
def test_call_end2end(self, model, shard_inputs, include_debug_info):
  """Checks the CallVariantsOutputs produced for the golden calling examples.

  Args:
    model: forwarded to self._call_end2end_helper.
    shard_inputs: forwarded to self._call_end2end_helper.
    include_debug_info: value assigned to FLAGS.include_debug_info; it also
      selects which debug_info expectations are checked.
  """
  FLAGS.include_debug_info = include_debug_info
  (call_variants_outputs, examples, batch_size,
   max_batches) = self._call_end2end_helper(testdata.GOLDEN_CALLING_EXAMPLES,
                                            model, shard_inputs)

  # Check that we have the right number of output protos.
  expected_count = batch_size * max_batches if max_batches else len(examples)
  self.assertEqual(len(call_variants_outputs), expected_count)

  # Check that our CallVariantsOutput (CVO) have the following critical
  # properties:
  # - we have one CVO for each example we processed.
  # - the variant in the CVO is exactly what was in the example.
  # - the alt_allele_indices of the CVO match those of its corresponding
  #   example.
  # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
  # We can only do this test when processing all of the variants (max_batches
  # is None), since we processed all of the examples with that model.
  if max_batches is None:
    self.assertItemsEqual(
        [cvo.variant for cvo in call_variants_outputs],
        [tf_utils.example_variant(ex) for ex in examples])

  # Check the CVO debug_info: not filled if include_debug_info is False;
  # else, filled by logic based on CVO.
  if not include_debug_info:
    for cvo in call_variants_outputs:
      self.assertEqual(
          cvo.debug_info, deepvariant_pb2.CallVariantsOutput.DebugInfo())
  else:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info.has_insertion,
                       variant_utils.has_insertion(cvo.variant))
      self.assertEqual(cvo.debug_info.has_deletion,
                       variant_utils.has_deletion(cvo.variant))
      self.assertEqual(cvo.debug_info.is_snp,
                       variant_utils.is_snp(cvo.variant))
      self.assertEqual(cvo.debug_info.predicted_label,
                       np.argmax(cvo.genotype_probabilities))

  def example_matches_call_variants_output(example, call_variants_output):
    return (tf_utils.example_variant(example) == call_variants_output.variant
            and tf_utils.example_alt_alleles_indices(example) ==
            call_variants_output.alt_allele_indices.indices)

  for call_variants_output in call_variants_outputs:
    # Find all matching examples.
    matches = [
        ex for ex in examples
        if example_matches_call_variants_output(ex, call_variants_output)
    ]
    # We should have exactly one match.
    self.assertEqual(len(matches), 1)
    example = matches[0]
    # Check that we've faithfully copied in the alt alleles (though currently
    # as implemented we find our example using this information so it cannot
    # fail). Included here in case that changes in the future.
    self.assertEqual(
        list(tf_utils.example_alt_alleles_indices(example)),
        list(call_variants_output.alt_allele_indices.indices))
    # We should have exactly three genotype probabilities (assuming our
    # ploidy == 2).
    self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
    # These are probabilities so they should be between 0 and 1. all() is
    # required: a bare generator expression is always truthy, which would make
    # assertTrue pass no matter what the values are.
    self.assertTrue(
        all(0 <= gp <= 1
            for gp in call_variants_output.genotype_probabilities))
def example_matches_call_variants_output(example, call_variants_output):
  """Tells whether example and call_variants_output describe the same call.

  Both the variant and the alt allele indices of the example have to equal
  the corresponding fields of call_variants_output.
  """
  same_variant = (
      tf_utils.example_variant(example) == call_variants_output.variant)
  if not same_variant:
    # Short-circuit exactly like `and` would: the indices are not inspected.
    return same_variant
  example_indices = tf_utils.example_alt_alleles_indices(example)
  return example_indices == call_variants_output.alt_allele_indices.indices
def test_make_examples_end2end(self,
                               mode,
                               num_shards,
                               test_condition=TestConditions.USE_BAM,
                               labeler_algorithm=None,
                               use_fast_pass_aligner=True):
  """Runs make_examples over a small chr20 region and checks all outputs.

  Args:
    mode: 'calling' or 'training'; assigned to FLAGS.mode and used to pick
      the mode-specific flags, golden files and checks.
    num_shards: number of shards used for the output file specs; the runner
      is invoked once per task in range(max(num_shards, 1)).
    test_condition: TestConditions member selecting which reads input (BAM,
      CRAM, or two BAM halves) is assigned to FLAGS.reads.
    labeler_algorithm: if not None, assigned to FLAGS.labeler_algorithm.
    use_fast_pass_aligner: assigned to FLAGS.use_fast_pass_aligner.
  """
  self.assertIn(mode, {'calling', 'training'})
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  # NOTE(review): a test_condition outside these three members leaves
  # FLAGS.reads untouched - confirm that is intended.
  if test_condition == TestConditions.USE_BAM:
    FLAGS.reads = testdata.CHR20_BAM
  elif test_condition == TestConditions.USE_CRAM:
    FLAGS.reads = testdata.CHR20_CRAM
  elif test_condition == TestConditions.USE_MULTI_BAMS:
    FLAGS.reads = ','.join(
        [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  # Calling mode writes a gVCF; training mode instead needs truth data.
  if mode == 'calling':
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  # Run once per task. Note that `options` and `run_info` from the last task
  # are what the checks after the loop use.
  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    # We need to overwrite bam_fname for USE_CRAM test since Golden Set
    # generated from BAM file. BAM filename is stored in candidates. If we
    # don't overwrite default_options variants won't match and test fail.
    options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
    make_examples_core.make_examples_runner(options)

    # Check that our run_info proto contains the basic fields we'd expect:
    # (a) our options are written to the run_info.options field.
    run_info = make_examples_core.read_make_examples_run_info(
        options.run_info_filename)
    self.assertEqual(run_info.options, options)
    # (b) run_info.resource_metrics is present and contains our hostname.
    self.assertTrue(run_info.HasField('resource_metrics'))
    self.assertEqual(run_info.resource_metrics.host_name, platform.node())

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = sorted(
      tfrecord.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  self.verify_variants([call.variant for call in candidates],
                       region,
                       options,
                       is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if mode == 'calling':
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  self.assertDeepVariantExamplesEqual(
      examples, list(tfrecord.read_tfrecords(golden_file)))

  if mode == 'calling':
    # Compare the example variants with the truth variants in the region.
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = variant_utils.sorted_variants(
        tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
    gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                num_shards)
    expected_gvcfs = list(
        tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
    # Despite the name, assertCountEqual checks that all elements match.
    self.assertCountEqual(gvcfs, expected_gvcfs)

  if (mode == 'training' and num_shards == 0 and
      labeler_algorithm != 'positional_labeler'):
    # The positional labeler doesn't track metrics, so don't try to read them
    # in when that's the mode.
    self.assertEqual(
        make_examples_core.read_make_examples_run_info(
            testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
        run_info.labeling_metrics)