def verify_examples(self, examples_filename, region, options, verify_labels):
  """Runs structural sanity checks over the tf.Examples stored in a file.

  Args:
    examples_filename: Path passed to io_utils.read_tfrecords.
    region: Forwarded unchanged to self.verify_variants.
    options: Forwarded unchanged to self.verify_variants.
    verify_labels: If truthy, also require the 'label' and
      'truth_variant/encoded' features and check that each example's variant
      and truth variant have the same variant_position.

  Returns:
    The list of examples read from examples_filename.
  """
  required_keys = [
      'variant/encoded',
      'locus',
      'image/format',
      'image/encoded',
      'alt_allele_indices/encoded',
  ]
  if verify_labels:
    required_keys.extend(['label', 'truth_variant/encoded'])

  examples = list(io_utils.read_tfrecords(examples_filename))
  for ex in examples:
    # Every required feature key must be present in the example.
    for key in required_keys:
      self.assertIn(key, ex.features.feature)

    # Each example must reference at least one alt allele.
    # pylint: disable=g-explicit-length-test
    n_alt_indices = len(tf_utils.example_alt_alleles_indices(ex))
    self.assertGreater(n_alt_indices, 0)

    if not verify_labels:
      continue

    # The candidate variant and the truth variant must share a position.
    candidate_pos = variantutils.variant_position(
        tf_utils.example_variant(ex))
    truth_pos = variantutils.variant_position(
        tf_utils.example_truth_variant(ex))
    self.assertEqual(candidate_pos, truth_pos)

  # Finally, validate the variants carried by the examples themselves.
  decoded_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(decoded_variants, region, options, is_gvcf=False)
  return examples
def add_label_to_example(self, example, label):
  """Writes the labeled variant and the class label into an example.

  Args:
    example: A tf.Example proto. Its variant is replaced with the label's
      variant (after the labeled genotype is applied) and its label is set.
    label: A variant_labeler.Label object containing the labeling information
      to add to our example.

  Returns:
    The same example proto, with the label fields added.

  Raises:
    ValueError: if label isn't confident.
  """
  if not label.is_confident:
    raise ValueError('Cannot add a non-confident label to an example',
                     example, label)

  indices = tf_utils.example_alt_alleles_indices(example)

  # Stamp the labeled genotype onto the candidate variant and store it.
  labeled_variant = label.variant
  _set_variant_genotype(labeled_variant, label.genotype)
  tf_utils.example_set_variant(example, labeled_variant)

  # The example's label is the # alts implied by its alt_alleles_indices.
  class_label = label.label_for_alt_alleles(indices)
  tf_utils.example_set_label(example, class_label)
  return example
def testMakeExampleMultiAllelic(self):
  """make_example on a multi-allelic variant yields sorted alt indices."""
  self.variant.alternate_bases[:] = ['AA', 'CC', 'GG']

  # The alts are deliberately passed out of order (GG before AA) so the
  # assertions below confirm that the stored indices come back sorted.
  requested_alts = ['GG', 'AA']
  example = tf_utils.make_example(
      self.variant, requested_alts, 'foo', self.default_shape,
      self.default_format)

  observed_indices = tf_utils.example_alt_alleles_indices(example)
  self.assertEqual([0, 2], observed_indices)

  observed_alts = tf_utils.example_alt_alleles(example)
  self.assertEqual(['AA', 'GG'], observed_alts)

  observed_key = tf_utils.example_key(example)
  self.assertEqual('1:11:C->AA/GG', observed_key)
def testMakeExample(self):
  """make_example round-trips image, format, variant, locus, alts and key."""
  example = tf_utils.make_example(
      self.variant, self.alts, self.encoded_image, self.default_shape,
      self.default_format)

  # The encoded image must come back unchanged.
  stored_image = tf_utils.example_encoded_image(example)
  self.assertEqual(self.encoded_image, stored_image)

  # The image format is stored as the first bytes value of 'image/format'.
  format_feature = example.features.feature['image/format']
  self.assertEqual('raw', format_feature.bytes_list.value[0])

  stored_variant = tf_utils.example_variant(example)
  self.assertEqual(self.variant, stored_variant)

  stored_locus = tf_utils.example_locus(example)
  self.assertEqual('1:11-11', stored_locus)

  stored_indices = tf_utils.example_alt_alleles_indices(example)
  self.assertEqual([0], stored_indices)

  stored_key = tf_utils.example_key(example)
  self.assertEqual('1:11:C->A', stored_key)
def testAltAllelesWithVariant(self):
  """example_alt_alleles only decodes the variant when none is supplied."""
  expected_alts = list(self.variant.alternate_bases)
  example = tf_utils.make_example(
      self.variant, expected_alts, six.b('foo'), self.default_shape,
      self.default_format)
  self.assertEqual([0], tf_utils.example_alt_alleles_indices(example))

  patch_target = 'deepvariant.tf_utils.example_variant'
  with mock.patch(patch_target) as patched_example_variant:
    # Passing the variant explicitly must bypass example_variant() entirely.
    alts_with_variant = tf_utils.example_alt_alleles(
        example, variant=self.variant)
    self.assertEqual(expected_alts, alts_with_variant)
    patched_example_variant.assert_not_called()

    # Without an explicit variant the (mocked) loader is used, which also
    # shows that the patch is actually in effect.
    patched_example_variant.return_value = self.variant
    alts_without_variant = tf_utils.example_alt_alleles(example)
    self.assertEqual(expected_alts, alts_without_variant)
    patched_example_variant.assert_called_once_with(example)
def verify_examples(self, examples_filename, region, options, verify_labels):
  """Runs structural sanity checks over the tf.Examples stored in a file.

  Args:
    examples_filename: Path passed to io_utils.read_tfrecords.
    region: Forwarded unchanged to self.verify_variants.
    options: Forwarded unchanged to self.verify_variants.
    verify_labels: If truthy, the 'label' feature is required as well.

  Returns:
    The list of examples read from examples_filename.
  """
  required_keys = [
      'variant/encoded',
      'locus',
      'image/format',
      'image/encoded',
      'alt_allele_indices/encoded',
  ]
  if verify_labels:
    required_keys.append('label')

  examples = list(io_utils.read_tfrecords(examples_filename))
  for ex in examples:
    # Every required feature key must be present in the example.
    for key in required_keys:
      self.assertIn(key, ex.features.feature)

    # Each example must reference at least one alt allele.
    # pylint: disable=g-explicit-length-test
    n_alt_indices = len(tf_utils.example_alt_alleles_indices(ex))
    self.assertGreater(n_alt_indices, 0)

  # Finally, validate the variants carried by the examples themselves.
  decoded_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(decoded_variants, region, options, is_gvcf=False)
  return examples
def verify_examples(self, examples_filename, region, options, verify_labels):
  """Runs structural sanity checks over the tf.Examples stored in a file.

  Args:
    examples_filename: Path passed to tfrecord.read_tfrecords.
    region: Forwarded unchanged to self.verify_variants.
    options: Forwarded unchanged to self.verify_variants.
    verify_labels: If truthy, the 'label' feature is required as well.

  Returns:
    The list of examples read from examples_filename.
  """
  required_keys = [
      'variant/encoded',
      'locus',
      'image/format',
      'image/encoded',
      'alt_allele_indices/encoded',
  ]
  if verify_labels:
    required_keys.append('label')

  examples = list(tfrecord.read_tfrecords(examples_filename))
  for ex in examples:
    # Every required feature key must be present in the example.
    for key in required_keys:
      self.assertIn(key, ex.features.feature)

    # Each example must reference at least one alt allele.
    # pylint: disable=g-explicit-length-test
    alt_indices = tf_utils.example_alt_alleles_indices(ex)
    self.assertNotEmpty(alt_indices)

  # Finally, validate the variants carried by the examples themselves.
  decoded_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(decoded_variants, region, options, is_gvcf=False)
  return examples
def example_matches_call_variants_output(example, call_variants_output):
  """Tells whether an example and a CallVariantsOutput describe the same call.

  Args:
    example: A tf.Example readable by tf_utils.example_variant and
      tf_utils.example_alt_alleles_indices.
    call_variants_output: An object exposing `variant` and
      `alt_allele_indices.indices`.

  Returns:
    The result of comparing the variants and, only when those are equal, the
    result of comparing the alt allele indices.
  """
  if tf_utils.example_variant(example) == call_variants_output.variant:
    # Variants agree, so the outcome is decided by the alt allele indices.
    example_indices = tf_utils.example_alt_alleles_indices(example)
    return example_indices == call_variants_output.alt_allele_indices.indices
  return False
def test_call_end2end(self, model, shard_inputs, include_debug_info):
  """End-to-end check of call_variants output against the golden examples.

  Args:
    model: The model under test; forwarded to self._call_end2end_helper.
    shard_inputs: Forwarded to self._call_end2end_helper.
    include_debug_info: Value assigned to FLAGS.include_debug_info; also
      selects which debug_info expectations are checked.
  """
  FLAGS.include_debug_info = include_debug_info
  (call_variants_outputs, examples, batch_size,
   max_batches) = self._call_end2end_helper(
       testdata.GOLDEN_CALLING_EXAMPLES, model, shard_inputs)

  # Check that we have the right number of output protos.
  self.assertEqual(
      len(call_variants_outputs),
      batch_size * max_batches if max_batches else len(examples))

  # Check that our CallVariantsOutput (CVO) have the following critical
  # properties:
  # - we have one CVO for each example we processed.
  # - the variant in the CVO is exactly what was in the example.
  # - the alt_allele_indices of the CVO match those of its corresponding
  #   example.
  # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
  # We can only do this test when processing all of the variants (max_batches
  # is None), since we processed all of the examples with that model.
  if max_batches is None:
    self.assertItemsEqual(
        [cvo.variant for cvo in call_variants_outputs],
        [tf_utils.example_variant(ex) for ex in examples])

  # Check the CVO debug_info: not filled if include_debug_info is False;
  # else, filled by logic based on CVO.
  if not include_debug_info:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info,
                       deepvariant_pb2.CallVariantsOutput.DebugInfo())
  else:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info.has_insertion,
                       variant_utils.has_insertion(cvo.variant))
      self.assertEqual(cvo.debug_info.has_deletion,
                       variant_utils.has_deletion(cvo.variant))
      self.assertEqual(cvo.debug_info.is_snp,
                       variant_utils.is_snp(cvo.variant))
      self.assertEqual(cvo.debug_info.predicted_label,
                       np.argmax(cvo.genotype_probabilities))

  def example_matches_call_variants_output(example, call_variants_output):
    return (tf_utils.example_variant(example) == call_variants_output.variant
            and tf_utils.example_alt_alleles_indices(example) ==
            call_variants_output.alt_allele_indices.indices)

  for call_variants_output in call_variants_outputs:
    # Find all matching examples.
    matches = [
        ex for ex in examples
        if example_matches_call_variants_output(ex, call_variants_output)
    ]
    # We should have exactly one match.
    self.assertEqual(len(matches), 1)
    example = matches[0]
    # Check that we've faithfully copied in the alt alleles (though currently
    # as implemented we find our example using this information so it cannot
    # fail). Included here in case that changes in the future.
    self.assertEqual(
        list(tf_utils.example_alt_alleles_indices(example)),
        list(call_variants_output.alt_allele_indices.indices))
    # We should have exactly three genotype probabilities (assuming our
    # ploidy == 2).
    self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
    # These are probabilities so they should be between 0 and 1. The all()
    # is required: a bare generator expression is always truthy, which would
    # make this assertion pass no matter what the values are.
    self.assertTrue(
        all(0 <= gp <= 1
            for gp in call_variants_output.genotype_probabilities))
def test_call_end2end(self, model, shard_inputs, include_debug_info):
  """End-to-end check of call_variants output against the golden examples.

  Args:
    model: The model under test. Its `name` decides how many batches are run
      and it is passed to call_variants.call_variants.
    shard_inputs: If truthy, the golden examples are first rewritten to a
      sharded temporary file which is then used as the input.
    include_debug_info: Value assigned to FLAGS.include_debug_info; also
      selects which debug_info expectations are checked.
  """
  FLAGS.include_debug_info = include_debug_info
  examples = list(
      io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

  if shard_inputs:
    # Create a sharded version of our golden examples.
    source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
    io_utils.write_tfrecords(examples, source_path)
  else:
    source_path = testdata.GOLDEN_CALLING_EXAMPLES

  batch_size = 4
  if model.name == 'random_guess':
    # For the random guess model we can run everything.
    max_batches = None
  else:
    # For all other models we only run a single batch for inference.
    max_batches = 1

  outfile = test_utils.test_tmpfile('call_variants.tfrecord')
  call_variants.call_variants(
      examples_filename=source_path,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=model,
      output_file=outfile,
      batch_size=batch_size,
      max_batches=max_batches)

  call_variants_outputs = list(
      io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

  # Check that we have the right number of output protos.
  self.assertEqual(
      len(call_variants_outputs),
      batch_size * max_batches if max_batches else len(examples))

  # Check that our CallVariantsOutput (CVO) have the following critical
  # properties:
  # - we have one CVO for each example we processed.
  # - the variant in the CVO is exactly what was in the example.
  # - the alt_allele_indices of the CVO match those of its corresponding
  #   example.
  # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
  # We can only do this test when processing all of the variants (max_batches
  # is None), since we processed all of the examples with that model.
  if max_batches is None:
    self.assertItemsEqual(
        [cvo.variant for cvo in call_variants_outputs],
        [tf_utils.example_variant(ex) for ex in examples])

  # Check the CVO debug_info: not filled if include_debug_info is False;
  # else, filled by logic based on CVO.
  if not include_debug_info:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info,
                       deepvariant_pb2.CallVariantsOutput.DebugInfo())
  else:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info.has_insertion,
                       variant_utils.has_insertion(cvo.variant))
      self.assertEqual(cvo.debug_info.has_deletion,
                       variant_utils.has_deletion(cvo.variant))
      self.assertEqual(cvo.debug_info.is_snp,
                       variant_utils.is_snp(cvo.variant))
      self.assertEqual(cvo.debug_info.predicted_label,
                       np.argmax(cvo.genotype_probabilities))

  def example_matches_call_variants_output(example, call_variants_output):
    return (tf_utils.example_variant(example) == call_variants_output.variant
            and tf_utils.example_alt_alleles_indices(example) ==
            call_variants_output.alt_allele_indices.indices)

  for call_variants_output in call_variants_outputs:
    # Find all matching examples.
    matches = [
        ex for ex in examples
        if example_matches_call_variants_output(ex, call_variants_output)
    ]
    # We should have exactly one match.
    self.assertEqual(len(matches), 1)
    example = matches[0]
    # Check that we've faithfully copied in the alt alleles (though currently
    # as implemented we find our example using this information so it cannot
    # fail). Included here in case that changes in the future.
    self.assertEqual(
        list(tf_utils.example_alt_alleles_indices(example)),
        list(call_variants_output.alt_allele_indices.indices))
    # We should have exactly three genotype probabilities (assuming our
    # ploidy == 2).
    self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
    # These are probabilities so they should be between 0 and 1. The all()
    # is required: a bare generator expression is always truthy, which would
    # make this assertion pass no matter what the values are.
    self.assertTrue(
        all(0 <= gp <= 1
            for gp in call_variants_output.genotype_probabilities))
def example_matches_call_variants_output(example, call_variants_output):
  """Tells whether an example and a CallVariantsOutput describe the same call.

  Args:
    example: A tf.Example readable by tf_utils.example_variant and
      tf_utils.example_alt_alleles_indices.
    call_variants_output: An object exposing `variant` and
      `alt_allele_indices.indices`.

  Returns:
    The result of comparing the variants and, only when those are equal, the
    result of comparing the alt allele indices.
  """
  variants_agree = (
      tf_utils.example_variant(example) == call_variants_output.variant)
  if not variants_agree:
    # Mirror `and` short-circuiting: hand back the falsy comparison result
    # without looking at the alt allele indices.
    return variants_agree
  cvo_indices = call_variants_output.alt_allele_indices.indices
  return tf_utils.example_alt_alleles_indices(example) == cvo_indices
def test_call_end2end(self, model, shard_inputs, include_debug_info):
  """End-to-end check of call_variants output against the golden examples.

  Args:
    model: The model under test. Its `name` decides how many batches are run
      and it is passed to call_variants.call_variants.
    shard_inputs: If truthy, the golden examples are first rewritten to a
      sharded temporary file which is then used as the input.
    include_debug_info: Value assigned to FLAGS.include_debug_info; also
      selects which debug_info expectations are checked.
  """
  FLAGS.include_debug_info = include_debug_info
  examples = list(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

  if shard_inputs:
    # Create a sharded version of our golden examples.
    source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
    io_utils.write_tfrecords(examples, source_path)
  else:
    source_path = testdata.GOLDEN_CALLING_EXAMPLES

  batch_size = 4
  if model.name == 'random_guess':
    # For the random guess model we can run everything.
    max_batches = None
  else:
    # For all other models we only run a single batch for inference.
    max_batches = 1

  outfile = test_utils.test_tmpfile('call_variants.tfrecord')
  call_variants.call_variants(
      examples_filename=source_path,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=model,
      output_file=outfile,
      batch_size=batch_size,
      max_batches=max_batches)

  call_variants_outputs = list(
      io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

  # Check that we have the right number of output protos.
  self.assertEqual(
      len(call_variants_outputs),
      batch_size * max_batches if max_batches else len(examples))

  # Check that our CallVariantsOutput (CVO) have the following critical
  # properties:
  # - we have one CVO for each example we processed.
  # - the variant in the CVO is exactly what was in the example.
  # - the alt_allele_indices of the CVO match those of its corresponding
  #   example.
  # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
  # We can only do this test when processing all of the variants (max_batches
  # is None), since we processed all of the examples with that model.
  if max_batches is None:
    self.assertItemsEqual([cvo.variant for cvo in call_variants_outputs],
                          [tf_utils.example_variant(ex) for ex in examples])

  # Check the CVO debug_info: not filled if include_debug_info is False;
  # else, filled by logic based on CVO.
  if not include_debug_info:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info,
                       deepvariant_pb2.CallVariantsOutput.DebugInfo())
  else:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info.has_insertion,
                       variant_utils.has_insertion(cvo.variant))
      self.assertEqual(cvo.debug_info.has_deletion,
                       variant_utils.has_deletion(cvo.variant))
      self.assertEqual(cvo.debug_info.is_snp,
                       variant_utils.is_snp(cvo.variant))
      self.assertEqual(cvo.debug_info.predicted_label,
                       np.argmax(cvo.genotype_probabilities))

  def example_matches_call_variants_output(example, call_variants_output):
    return (tf_utils.example_variant(example) == call_variants_output.variant
            and tf_utils.example_alt_alleles_indices(example) ==
            call_variants_output.alt_allele_indices.indices)

  for call_variants_output in call_variants_outputs:
    # Find all matching examples.
    matches = [
        ex for ex in examples
        if example_matches_call_variants_output(ex, call_variants_output)
    ]
    # We should have exactly one match.
    self.assertEqual(len(matches), 1)
    example = matches[0]
    # Check that we've faithfully copied in the alt alleles (though currently
    # as implemented we find our example using this information so it cannot
    # fail). Included here in case that changes in the future.
    self.assertEqual(
        list(tf_utils.example_alt_alleles_indices(example)),
        list(call_variants_output.alt_allele_indices.indices))
    # We should have exactly three genotype probabilities (assuming our
    # ploidy == 2).
    self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
    # These are probabilities so they should be between 0 and 1. The all()
    # is required: a bare generator expression is always truthy, which would
    # make this assertion pass no matter what the values are.
    self.assertTrue(
        all(0 <= gp <= 1
            for gp in call_variants_output.genotype_probabilities))
def convert_to_class(self, example):
  """Returns the class id for an example.

  Args:
    example: A tf.Example from which the alt allele indices are read.

  Returns:
    Whatever self.label_for_alt_alleles returns for the example's alt allele
    indices, i.e. the label given those indices.
  """
  # The class is the # alts given the example's alt_alleles_indices.
  return self.label_for_alt_alleles(
      tf_utils.example_alt_alleles_indices(example))