def examples_to_variants(examples_path, max_records=None):
  """Generates one labeled Variant per range from DeepVariant tf.Examples.

  Every example at examples_path (a single path or a sharded spec) is read and
  its example_variant field extracted. The variants are ordered by
  variant_utils.variant_range_tuple, and variants sharing the same range tuple
  are collapsed so that only the first one of each group is emitted. Several
  examples can share a range when they represent different alt_alleles.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Upper bound on the number of records to read;
      None reads all of the records.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if the representative Variant of a group has no genotypes.
  """
  ordered = [
      tf_utils.example_variant(record)
      for record in io_utils.read_tfrecords(
          examples_path, max_records=max_records)
  ]
  ordered.sort(key=variant_utils.variant_range_tuple)

  for _, same_range in itertools.groupby(
      ordered, key=variant_utils.variant_range_tuple):
    # Only the first variant of each group is inspected and emitted.
    representative = next(same_range)
    call = variant_utils.only_call(representative)
    if variantcall_utils.has_genotypes(call):
      yield representative
      continue
    raise ValueError((
        'Variant {} does not have any genotypes. This tool only works with '
        'variants that have been labeled.').format(
            variant_utils.variant_key(representative)))
def examples_to_variants(examples_path, max_records=None):
  """Yields a single labeled Variant for each distinct variant range.

  The tf.Examples written by DeepVariant are loaded from examples_path (which
  may be a sharded spec), their example_variant fields are sorted by
  variant_utils.variant_range_tuple, and the first variant of every run of
  equal range tuples is chosen as the representative when multiple versions
  exist for different alt_alleles.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Maximum number of records to read, or None to
      read all of them.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if a chosen representative Variant doesn't have genotypes.
  """
  range_key = variant_utils.variant_range_tuple
  records = io_utils.read_tfrecords(examples_path, max_records=max_records)
  by_position = sorted(map(tf_utils.example_variant, records), key=range_key)

  for _, duplicates in itertools.groupby(by_position, range_key):
    chosen = next(duplicates)
    labeled = variantcall_utils.has_genotypes(variant_utils.only_call(chosen))
    if not labeled:
      message = (
          'Variant {} does not have any genotypes. This tool only works with '
          'variants that have been labeled.').format(
              variant_utils.variant_key(chosen))
      raise ValueError(message)
    yield chosen
def test_vcf_caller_end2end_outputs(self):
  """Checks that golden training examples convert back to the proposed VCF."""
  # The proposed VCF (input) must contain the same variants as the VCF output
  # converted from the output of make_examples.
  from_examples = list(
      labeled_examples_to_vcf.examples_to_variants(
          testdata.GOLDEN_VCF_CALLER_TRAINING_EXAMPLES))

  # First pass over the VCF: the keys (like chr20:10099832:A->G) must agree.
  with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as proposed_vcf_reader:
    actual_keys = [variant_utils.variant_key(v) for v in from_examples]
    expected_keys = []
    for proposed in proposed_vcf_reader.iterate():
      expected_keys.append(variant_utils.variant_key(proposed))
    self.assertEqual(actual_keys, expected_keys)

  # Second pass: genotypes must agree once unphase_all_genotypes has been
  # applied to the VCF records.
  with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as proposed_vcf_reader:
    actual_alleles = [
        variant_utils.genotype_as_alleles(v) for v in from_examples
    ]
    expected_alleles = []
    for proposed in proposed_vcf_reader.iterate():
      unphased = variant_utils.unphase_all_genotypes(proposed)
      expected_alleles.append(variant_utils.genotype_as_alleles(unphased))
    self.assertEqual(actual_alleles, expected_alleles)
def main(argv):
  """Converts the examples in FLAGS.examples to a VCF at FLAGS.output_vcf.

  The VCF header is built from the contigs of the FLAGS.ref header and a single
  sample name. That name is FLAGS.sample_name when set; otherwise it is
  obtained from peek_sample_name. Each variant's first call is renamed to the
  sample name before it is written.

  Args:
    argv: command-line arguments; unused.
  """
  del argv  # Unused.

  contigs = fasta.RefFastaReader(FLAGS.ref).header.contigs

  # A negative FLAGS.max_records means "no limit".
  if FLAGS.max_records >= 0:
    record_limit = FLAGS.max_records
  else:
    record_limit = None
  variants_iter = examples_to_variants(FLAGS.examples, max_records=record_limit)

  if FLAGS.sample_name:
    sample_name = FLAGS.sample_name
  else:
    # peek_sample_name also returns the iterator to keep consuming from.
    sample_name, variants_iter = peek_sample_name(variants_iter)

  header = dv_vcf_constants.deepvariant_header(
      contigs=contigs, sample_names=[sample_name])

  with vcf.VcfWriter(FLAGS.output_vcf, header=header) as vcf_out:
    for record in variants_iter:
      record.calls[0].call_set_name = sample_name
      logging.log_every_n(
          logging.INFO,
          'Converted %s',
          FLAGS.log_every,
          variant_utils.variant_key(record))
      vcf_out.write(record)
def test_variant_key(self, variant, expected_key, sort_alleles=True):
  """Checks that variant_utils.variant_key(variant) equals expected_key.

  Args:
    variant: the variant passed to variant_utils.variant_key.
    expected_key: the key that call is expected to produce.
    sort_alleles: forwarded to variant_key as its sort_alleles keyword.
  """
  actual_key = variant_utils.variant_key(variant, sort_alleles=sort_alleles)
  self.assertEqual(actual_key, expected_key)
def test_variant_key(self, variant, expected_key, sort_alleles=True):
  """Asserts the key computed for `variant` matches `expected_key`.

  The sort_alleles argument is passed straight through to
  variant_utils.variant_key.
  """
  observed = variant_utils.variant_key(variant, sort_alleles=sort_alleles)
  wanted = expected_key
  self.assertEqual(observed, wanted)
def print_variants(name, variants):
  """Logs a header line for a variant collection, then one line per variant.

  Args:
    name: label written in the header line.
    variants: sized collection of variants (len() is taken). Each one is logged
      with its variant_utils.variant_key and the first element returned by
      _variant_genotypes([variant]).
  """
  count = len(variants)
  logging.info('variants: %s [%d]', name, count)
  for variant in variants:
    key = variant_utils.variant_key(variant)
    genotype = _variant_genotypes([variant])[0]
    logging.info(' %s gt=%s', key, genotype)
def _log_variants(name, variants):
  """Write basic information about variants to logging.info.

  A header line with `name` and the number of variants is logged first,
  followed by one line per variant holding its variant_utils.variant_key and
  the first element returned by _variant_genotypes for that variant alone.
  """
  total = len(variants)
  logging.info('variants: %s [%d]', name, total)
  for entry in variants:
    entry_key = variant_utils.variant_key(entry)
    entry_genotype = _variant_genotypes([entry])[0]
    logging.info(' %s gt=%s', entry_key, entry_genotype)
def print_variants(name, variants):
  """Writes a summary of `variants` to logging.info under the label `name`.

  The first log line carries the label and len(variants). Each following line
  carries one variant's variant_utils.variant_key and the first element of
  _variant_genotypes([variant]).
  """
  logging.info('variants: %s [%d]', name, len(variants))
  for item in variants:
    # Key first, then genotype, matching the order of the format fields.
    fields = (variant_utils.variant_key(item), _variant_genotypes([item])[0])
    logging.info(' %s gt=%s', *fields)