def _get_variant_type(variant):
    """Classifies `variant` into one of the module's variant-type constants.

    Args:
      variant: the variant to classify; it is passed unchanged to the
        `variant_utils` predicates.

    Returns:
      REFCALL when `variant_utils.is_variant_call` is false. Otherwise one of
      the BIALLELIC_* constants (SNP, INSERTION, DELETION, MNP) when the
      variant is biallelic, or one of the MULTIALLELIC_* constants (SNP,
      INSERTION, DELETION, COMPLEX) when it is not.
    """
    if not variant_utils.is_variant_call(variant):
        return REFCALL

    # All four predicates are evaluated up front, in this order, before any
    # branching takes place.
    is_biallelic = variant_utils.is_biallelic(variant)
    is_snp = variant_utils.is_snp(variant)
    is_insertion = variant_utils.variant_is_insertion(variant)
    is_deletion = variant_utils.variant_is_deletion(variant)

    if is_biallelic:
        if is_snp:
            return BIALLELIC_SNP
        if is_insertion:
            return BIALLELIC_INSERTION
        if is_deletion:
            return BIALLELIC_DELETION
        # Neither SNP, insertion nor deletion.
        return BIALLELIC_MNP

    if is_snp:
        return MULTIALLELIC_SNP
    if is_insertion:
        return MULTIALLELIC_INSERTION
    if is_deletion:
        return MULTIALLELIC_DELETION
    # Neither SNP, insertion nor deletion.
    return MULTIALLELIC_COMPLEX
def generate_positions(vcf_reader, ref_reader, baseline_contig):
    """Collects every INDEL position plus matching SNP and baseline samples.

    All INDELs found in `vcf_reader` are kept. Up to the same number of SNPs
    is drawn at random from the SNPs in `vcf_reader`, and up to the same
    number of positions is drawn at random from `baseline_contig`.

    Args:
      vcf_reader: a nucleus.io.VcfReader.
      ref_reader: a nucleus.io.IndexedFastaReader.
      baseline_contig: contig from which to sample baseline positions.

    Returns:
      A sorted list of PositionWrapper.
    """
    all_variants = list(vcf_reader)

    indel_wrappers = []
    for record in all_variants:
        if variant_utils.is_indel(record):
            indel_wrappers.append(
                PositionWrapper(record.reference_name, record.start,
                                _INDEL_LABEL))
    target_count = len(indel_wrappers)

    # Sample at most as many SNPs as there are INDELs (fewer if the input
    # does not contain enough SNPs).
    snp_variants = [
        record for record in all_variants if variant_utils.is_snp(record)
    ]
    sampled_snps = random.sample(snp_variants,
                                 min(len(snp_variants), target_count))
    snp_wrappers = [
        PositionWrapper(record.reference_name, record.start, _SNP_LABEL)
        for record in sampled_snps
    ]

    contig_length = ref_reader.contig(baseline_contig).n_bases
    # NOTE: Though unlikely, these random positions can end up on actual
    # variants.
    sampled_offsets = random.sample(
        xrange(contig_length), min(contig_length, target_count))
    baseline_wrappers = [
        PositionWrapper(baseline_contig, offset, _REF_LABEL)
        for offset in sampled_offsets
    ]

    # We sort by position for better data locality (the ordering itself is
    # whatever PositionWrapper's comparison defines).
    combined = indel_wrappers + snp_wrappers + baseline_wrappers
    combined.sort()
    return combined
def _create_cvo_proto(encoded_variant,
                      gls,
                      encoded_alt_allele_indices,
                      true_labels=None,
                      logits=None,
                      prelogits=None):
    """Builds a CallVariantsOutput proto from the relevant input information.

    Args:
      encoded_variant: serialized bytes parsed with
        `variants_pb2.Variant.FromString`.
      gls: genotype probabilities; stored as `genotype_probabilities` and, when
        debug info is requested, their argmax becomes `predicted_label`.
      encoded_alt_allele_indices: serialized bytes parsed with
        `CallVariantsOutput.AltAlleleIndices.FromString`.
      true_labels: optional value stored as `DebugInfo.true_label`.
      logits: optional value stored as `DebugInfo.logits`.
      prelogits: optional array; when given it must have shape (1, 1, 2048)
        and its `[0][0]` slice is stored as `DebugInfo.prelogits`.

    Returns:
      A deepvariant_pb2.CallVariantsOutput. Its `debug_info` is only filled
      when FLAGS.include_debug_info or FLAGS.debugging_true_label_mode is set.
    """
    cvo_cls = deepvariant_pb2.CallVariantsOutput
    parsed_variant = variants_pb2.Variant.FromString(encoded_variant)
    parsed_indices = cvo_cls.AltAlleleIndices.FromString(
        encoded_alt_allele_indices)

    debug = None
    if FLAGS.include_debug_info or FLAGS.debugging_true_label_mode:
        flat_prelogits = prelogits
        if flat_prelogits is not None:
            # Drop the two leading singleton dimensions.
            assert flat_prelogits.shape == (1, 1, 2048)
            flat_prelogits = flat_prelogits[0][0]
        debug = cvo_cls.DebugInfo(
            has_insertion=variant_utils.has_insertion(parsed_variant),
            has_deletion=variant_utils.has_deletion(parsed_variant),
            is_snp=variant_utils.is_snp(parsed_variant),
            predicted_label=np.argmax(gls),
            true_label=true_labels,
            logits=logits,
            prelogits=flat_prelogits)

    return cvo_cls(
        variant=parsed_variant,
        alt_allele_indices=parsed_indices,
        genotype_probabilities=gls,
        debug_info=debug)
def _create_cvo_proto(encoded_variant, gls, encoded_alt_allele_indices):
    """Builds a CallVariantsOutput proto from the relevant input information.

    Args:
      encoded_variant: serialized bytes parsed with
        `variants_pb2.Variant.FromString`.
      gls: genotype probabilities; stored as `genotype_probabilities` and, when
        debug info is requested, their argmax becomes `predicted_label`.
      encoded_alt_allele_indices: serialized bytes parsed with
        `CallVariantsOutput.AltAlleleIndices.FromString`.

    Returns:
      A deepvariant_pb2.CallVariantsOutput. Its `debug_info` is only filled
      when FLAGS.include_debug_info is set.
    """
    cvo_cls = deepvariant_pb2.CallVariantsOutput
    parsed_variant = variants_pb2.Variant.FromString(encoded_variant)
    parsed_indices = cvo_cls.AltAlleleIndices.FromString(
        encoded_alt_allele_indices)

    debug = None
    if FLAGS.include_debug_info:
        debug = cvo_cls.DebugInfo(
            has_insertion=variant_utils.has_insertion(parsed_variant),
            has_deletion=variant_utils.has_deletion(parsed_variant),
            is_snp=variant_utils.is_snp(parsed_variant),
            predicted_label=np.argmax(gls))

    return cvo_cls(
        variant=parsed_variant,
        alt_allele_indices=parsed_indices,
        genotype_probabilities=gls,
        debug_info=debug)
def encoded_variant_type(variant):
    """Maps `variant` to the EncodedVariantType that best describes it.

    For example, a variant with `reference_bases = "A"` and
    `alternative_bases = ["C"]` maps to EncodedVariantType.SNP.

    Args:
      variant: nucleus.Variant proto. The variant whose EncodedVariantType we
        want to get.

    Returns:
      EncodedVariantType enum value: SNP if `variant_utils.is_snp` holds,
      otherwise INDEL if `variant_utils.is_indel` holds, otherwise UNKNOWN.
    """
    # The SNP check takes precedence over the indel check.
    if variant_utils.is_snp(variant):
        return EncodedVariantType.SNP
    if variant_utils.is_indel(variant):
        return EncodedVariantType.INDEL
    return EncodedVariantType.UNKNOWN
def test_is_snp(self, variant, expected):
    # `variant_utils.is_snp` must return `expected` for the given `variant`.
    actual = variant_utils.is_snp(variant)
    self.assertEqual(actual, expected)
def test_call_end2end(self, model, shard_inputs, include_debug_info):
    # End-to-end check of call_variants on the golden calling examples:
    # verifies the number of outputs, the debug_info contents, and that each
    # output corresponds to exactly one input example.
    FLAGS.include_debug_info = include_debug_info
    (call_variants_outputs, examples, batch_size,
     max_batches) = self._call_end2end_helper(
         testdata.GOLDEN_CALLING_EXAMPLES, model, shard_inputs)

    # Check that we have the right number of output protos.
    self.assertEqual(
        len(call_variants_outputs),
        batch_size * max_batches if max_batches else len(examples))

    # Check that our CallVariantsOutput (CVO) have the following critical
    # properties:
    # - we have one CVO for each example we processed.
    # - the variant in the CVO is exactly what was in the example.
    # - the alt_allele_indices of the CVO match those of its corresponding
    #   example.
    # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
    # We can only do this test when processing all of the variants
    # (max_batches is None), since we processed all of the examples with that
    # model.
    if max_batches is None:
        self.assertItemsEqual(
            [cvo.variant for cvo in call_variants_outputs],
            [tf_utils.example_variant(ex) for ex in examples])

    # Check the CVO debug_info: not filled if include_debug_info is False;
    # else, filled by logic based on CVO.
    if not include_debug_info:
        for cvo in call_variants_outputs:
            self.assertEqual(
                cvo.debug_info,
                deepvariant_pb2.CallVariantsOutput.DebugInfo())
    else:
        for cvo in call_variants_outputs:
            self.assertEqual(cvo.debug_info.has_insertion,
                             variant_utils.has_insertion(cvo.variant))
            self.assertEqual(cvo.debug_info.has_deletion,
                             variant_utils.has_deletion(cvo.variant))
            self.assertEqual(cvo.debug_info.is_snp,
                             variant_utils.is_snp(cvo.variant))
            self.assertEqual(cvo.debug_info.predicted_label,
                             np.argmax(cvo.genotype_probabilities))

    def example_matches_call_variants_output(example, call_variants_output):
        return (tf_utils.example_variant(example) ==
                call_variants_output.variant and
                tf_utils.example_alt_alleles_indices(example) ==
                call_variants_output.alt_allele_indices.indices)

    for call_variants_output in call_variants_outputs:
        # Find all matching examples.
        matches = [
            ex for ex in examples
            if example_matches_call_variants_output(ex, call_variants_output)
        ]
        # We should have exactly one match.
        self.assertEqual(len(matches), 1)
        example = matches[0]
        # Check that we've faithfully copied in the alt alleles (though
        # currently as implemented we find our example using this information
        # so it cannot fail). Included here in case that changes in the
        # future.
        self.assertEqual(
            list(tf_utils.example_alt_alleles_indices(example)),
            list(call_variants_output.alt_allele_indices.indices))
        # We should have exactly three genotype probabilities (assuming our
        # ploidy == 2).
        self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
        # These are probabilities so they should be between 0 and 1. Each
        # value is asserted individually: passing a generator expression to
        # assertTrue would always succeed because a generator object is
        # truthy regardless of what it yields.
        for gp in call_variants_output.genotype_probabilities:
            self.assertTrue(
                0 <= gp <= 1,
                'genotype probability {} is outside [0, 1]'.format(gp))
def test_is_snp_symbolic_allele(self, variant, exclude_alleles, expected):
    # `variant_utils.is_snp` must return `expected` when called with the
    # given `exclude_alleles`.
    actual = variant_utils.is_snp(variant, exclude_alleles=exclude_alleles)
    self.assertEqual(actual, expected)
def test_call_end2end(self, model, shard_inputs, include_debug_info):
    # End-to-end check of call_variants on the golden calling examples:
    # runs inference into a temporary file, then verifies the number of
    # outputs, the debug_info contents, and that each output corresponds to
    # exactly one input example.
    FLAGS.include_debug_info = include_debug_info
    examples = list(
        io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

    if shard_inputs:
        # Create a sharded version of our golden examples.
        source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
        io_utils.write_tfrecords(examples, source_path)
    else:
        source_path = testdata.GOLDEN_CALLING_EXAMPLES

    batch_size = 4
    if model.name == 'random_guess':
        # For the random guess model we can run everything.
        max_batches = None
    else:
        # For all other models we only run a single batch for inference.
        max_batches = 1

    outfile = test_utils.test_tmpfile('call_variants.tfrecord')
    call_variants.call_variants(
        examples_filename=source_path,
        checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
        model=model,
        output_file=outfile,
        batch_size=batch_size,
        max_batches=max_batches)

    call_variants_outputs = list(
        io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

    # Check that we have the right number of output protos.
    self.assertEqual(
        len(call_variants_outputs),
        batch_size * max_batches if max_batches else len(examples))

    # Check that our CallVariantsOutput (CVO) have the following critical
    # properties:
    # - we have one CVO for each example we processed.
    # - the variant in the CVO is exactly what was in the example.
    # - the alt_allele_indices of the CVO match those of its corresponding
    #   example.
    # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
    # We can only do this test when processing all of the variants
    # (max_batches is None), since we processed all of the examples with that
    # model.
    if max_batches is None:
        self.assertItemsEqual(
            [cvo.variant for cvo in call_variants_outputs],
            [tf_utils.example_variant(ex) for ex in examples])

    # Check the CVO debug_info: not filled if include_debug_info is False;
    # else, filled by logic based on CVO.
    if not include_debug_info:
        for cvo in call_variants_outputs:
            self.assertEqual(
                cvo.debug_info,
                deepvariant_pb2.CallVariantsOutput.DebugInfo())
    else:
        for cvo in call_variants_outputs:
            self.assertEqual(cvo.debug_info.has_insertion,
                             variant_utils.has_insertion(cvo.variant))
            self.assertEqual(cvo.debug_info.has_deletion,
                             variant_utils.has_deletion(cvo.variant))
            self.assertEqual(cvo.debug_info.is_snp,
                             variant_utils.is_snp(cvo.variant))
            self.assertEqual(cvo.debug_info.predicted_label,
                             np.argmax(cvo.genotype_probabilities))

    def example_matches_call_variants_output(example, call_variants_output):
        return (tf_utils.example_variant(example) ==
                call_variants_output.variant and
                tf_utils.example_alt_alleles_indices(example) ==
                call_variants_output.alt_allele_indices.indices)

    for call_variants_output in call_variants_outputs:
        # Find all matching examples.
        matches = [
            ex for ex in examples
            if example_matches_call_variants_output(ex, call_variants_output)
        ]
        # We should have exactly one match.
        self.assertEqual(len(matches), 1)
        example = matches[0]
        # Check that we've faithfully copied in the alt alleles (though
        # currently as implemented we find our example using this information
        # so it cannot fail). Included here in case that changes in the
        # future.
        self.assertEqual(
            list(tf_utils.example_alt_alleles_indices(example)),
            list(call_variants_output.alt_allele_indices.indices))
        # We should have exactly three genotype probabilities (assuming our
        # ploidy == 2).
        self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
        # These are probabilities so they should be between 0 and 1. Each
        # value is asserted individually: passing a generator expression to
        # assertTrue would always succeed because a generator object is
        # truthy regardless of what it yields.
        for gp in call_variants_output.genotype_probabilities:
            self.assertTrue(
                0 <= gp <= 1,
                'genotype probability {} is outside [0, 1]'.format(gp))
def test_call_end2end(self, model, shard_inputs, include_debug_info):
    # End-to-end check of call_variants on the golden calling examples:
    # runs inference into a temporary file, then verifies the number of
    # outputs, the debug_info contents, and that each output corresponds to
    # exactly one input example.
    FLAGS.include_debug_info = include_debug_info
    examples = list(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

    if shard_inputs:
        # Create a sharded version of our golden examples.
        source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
        io_utils.write_tfrecords(examples, source_path)
    else:
        source_path = testdata.GOLDEN_CALLING_EXAMPLES

    batch_size = 4
    if model.name == 'random_guess':
        # For the random guess model we can run everything.
        max_batches = None
    else:
        # For all other models we only run a single batch for inference.
        max_batches = 1

    outfile = test_utils.test_tmpfile('call_variants.tfrecord')
    call_variants.call_variants(
        examples_filename=source_path,
        checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
        model=model,
        output_file=outfile,
        batch_size=batch_size,
        max_batches=max_batches)

    call_variants_outputs = list(
        io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

    # Check that we have the right number of output protos.
    self.assertEqual(
        len(call_variants_outputs),
        batch_size * max_batches if max_batches else len(examples))

    # Check that our CallVariantsOutput (CVO) have the following critical
    # properties:
    # - we have one CVO for each example we processed.
    # - the variant in the CVO is exactly what was in the example.
    # - the alt_allele_indices of the CVO match those of its corresponding
    #   example.
    # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
    # We can only do this test when processing all of the variants
    # (max_batches is None), since we processed all of the examples with that
    # model.
    if max_batches is None:
        self.assertItemsEqual(
            [cvo.variant for cvo in call_variants_outputs],
            [tf_utils.example_variant(ex) for ex in examples])

    # Check the CVO debug_info: not filled if include_debug_info is False;
    # else, filled by logic based on CVO.
    if not include_debug_info:
        for cvo in call_variants_outputs:
            self.assertEqual(cvo.debug_info,
                             deepvariant_pb2.CallVariantsOutput.DebugInfo())
    else:
        for cvo in call_variants_outputs:
            self.assertEqual(cvo.debug_info.has_insertion,
                             variant_utils.has_insertion(cvo.variant))
            self.assertEqual(cvo.debug_info.has_deletion,
                             variant_utils.has_deletion(cvo.variant))
            self.assertEqual(cvo.debug_info.is_snp,
                             variant_utils.is_snp(cvo.variant))
            self.assertEqual(cvo.debug_info.predicted_label,
                             np.argmax(cvo.genotype_probabilities))

    def example_matches_call_variants_output(example, call_variants_output):
        return (tf_utils.example_variant(example) ==
                call_variants_output.variant and
                tf_utils.example_alt_alleles_indices(example) ==
                call_variants_output.alt_allele_indices.indices)

    for call_variants_output in call_variants_outputs:
        # Find all matching examples.
        matches = [
            ex for ex in examples
            if example_matches_call_variants_output(ex, call_variants_output)
        ]
        # We should have exactly one match.
        self.assertEqual(len(matches), 1)
        example = matches[0]
        # Check that we've faithfully copied in the alt alleles (though
        # currently as implemented we find our example using this information
        # so it cannot fail). Included here in case that changes in the
        # future.
        self.assertEqual(
            list(tf_utils.example_alt_alleles_indices(example)),
            list(call_variants_output.alt_allele_indices.indices))
        # We should have exactly three genotype probabilities (assuming our
        # ploidy == 2).
        self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
        # These are probabilities so they should be between 0 and 1. Each
        # value is asserted individually: passing a generator expression to
        # assertTrue would always succeed because a generator object is
        # truthy regardless of what it yields.
        for gp in call_variants_output.genotype_probabilities:
            self.assertTrue(
                0 <= gp <= 1,
                'genotype probability {} is outside [0, 1]'.format(gp))