def test_variant_position_and_range(self):
  """Exercises variant_position, variant_range and variant_range_tuple.

  Two variants starting at position 10 on chromosome '1' are used: one whose
  reference allele is a single base and one whose reference allele is four
  bases long.
  """
  single_base = test_utils.make_variant(
      chrom='1', alleles=['A', 'C'], start=10)
  four_base = test_utils.make_variant(
      chrom='1', alleles=['AGCT', 'C'], start=10)
  start_only = ranges.make_range('1', 10, 11)
  full_span = ranges.make_range('1', 10, 14)

  # Both variants are expected to report the same one-base position.
  for variant in (single_base, four_base):
    self.assertEqual(start_only, variant_utils.variant_position(variant))

  # The range is expected to span the whole reference allele.
  self.assertEqual(start_only, variant_utils.variant_range(single_base))
  self.assertEqual(full_span, variant_utils.variant_range(four_base))

  # The tuple form is expected to carry the same coordinates as the range.
  self.assertEqual(('1', 10, 11),
                   variant_utils.variant_range_tuple(single_base))
  self.assertEqual(('1', 10, 14),
                   variant_utils.variant_range_tuple(four_base))
def _range_tuples(variants): return [variant_utils.variant_range_tuple(v) for v in variants]
def _example_sort_key(example):
  """Returns the range tuple of the variant stored in `example`."""
  variant = tf_utils.example_variant(example)
  return variant_utils.variant_range_tuple(variant)
def test_make_examples_end2end(self, mode, num_shards, labeler_algorithm=None):
  """Runs make_examples over a chr20 region and validates everything it writes.

  The test configures the global FLAGS, invokes the make_examples runner once
  per task, and then checks the candidates, the examples (against golden
  files) and, in calling mode, the NIST concordance and the gVCF output.

  Args:
    mode: Either 'calling' or 'training'; any other value fails the initial
      assertIn check.
    num_shards: Shard count passed to _sharded for every output path. The
      runner is invoked for each task id in range(max(num_shards, 1)).
    labeler_algorithm: When not None, assigned to FLAGS.labeler_algorithm.
  """
  # Show full diffs on assertion failures.
  self.maxDiff = None
  self.assertIn(mode, {'calling', 'training'})
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  # The gVCF output is only configured in calling mode; training mode instead
  # gets the truth variants and confident regions.
  if mode == 'calling':
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  # One runner invocation per task; max(..., 1) ensures a single run when
  # num_shards is 0. `options` from the last iteration is reused below.
  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    make_examples.make_examples_runner(options)

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = sorted(
      io_utils.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  self.verify_variants(
      [call.variant for call in candidates], region, options, is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if mode == 'calling':
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  self.assertDeepVariantExamplesEqual(
      examples, list(io_utils.read_tfrecords(golden_file)))

  if mode == 'calling':
    # Compare the example variants with the truth variants in the region.
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = variant_utils.sorted_variants(
        io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
    gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                num_shards)
    expected_gvcfs = list(
        io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
    # NOTE(review): assertItemsEqual is the Python 2 unittest name; Python 3's
    # unittest calls it assertCountEqual. Confirm the base class provides it.
    self.assertItemsEqual(gvcfs, expected_gvcfs)
def test_make_examples_end2end(self,
                               mode,
                               num_shards,
                               test_condition=TestConditions.USE_BAM,
                               labeler_algorithm=None,
                               use_fast_pass_aligner=True):
  """Runs make_examples over a chr20 region and validates everything it writes.

  The test configures the global FLAGS, invokes the make_examples runner once
  per task, checks the written run_info, and then validates the candidates,
  the examples (against golden files) and, in calling mode, the NIST
  concordance and the gVCF output. For unsharded training runs that do not
  use the positional labeler, labeling metrics are compared with a golden
  run_info file.

  Args:
    mode: Either 'calling' or 'training'; any other value fails the initial
      assertIn check.
    num_shards: Shard count passed to _sharded for every output path. The
      runner is invoked for each task id in range(max(num_shards, 1)).
    test_condition: A TestConditions value selecting the reads input: a single
      BAM, a CRAM, or two BAM halves joined with a comma.
    labeler_algorithm: When not None, assigned to FLAGS.labeler_algorithm.
    use_fast_pass_aligner: Assigned to FLAGS.use_fast_pass_aligner.
  """
  self.assertIn(mode, {'calling', 'training'})
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  # NOTE(review): there is no else branch, so an unrecognised test_condition
  # leaves FLAGS.reads unassigned by this test.
  if test_condition == TestConditions.USE_BAM:
    FLAGS.reads = testdata.CHR20_BAM
  elif test_condition == TestConditions.USE_CRAM:
    FLAGS.reads = testdata.CHR20_CRAM
  elif test_condition == TestConditions.USE_MULTI_BAMS:
    FLAGS.reads = ','.join(
        [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  # The gVCF output is only configured in calling mode; training mode instead
  # gets the truth variants and confident regions.
  if mode == 'calling':
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  # One runner invocation per task; max(..., 1) ensures a single run when
  # num_shards is 0. `options` and `run_info` from the last iteration are
  # reused below.
  # NOTE(review): the run_info checks are assumed to belong inside this loop
  # (once per task) -- confirm against the intended layout.
  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    # We need to overwrite bam_fname for USE_CRAM test since Golden Set
    # generated from BAM file. BAM filename is stored in candidates. If we
    # don't overwrite default_options variants won't match and test fail.
    options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
    make_examples_core.make_examples_runner(options)

    # Check that our run_info proto contains the basic fields we'd expect:
    # (a) our options are written to the run_info.options field.
    run_info = make_examples_core.read_make_examples_run_info(
        options.run_info_filename)
    self.assertEqual(run_info.options, options)
    # (b) run_info.resource_metrics is present and contains our hostname.
    self.assertTrue(run_info.HasField('resource_metrics'))
    self.assertEqual(run_info.resource_metrics.host_name, platform.node())

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = sorted(
      tfrecord.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  self.verify_variants([call.variant for call in candidates],
                       region,
                       options,
                       is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if mode == 'calling':
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  self.assertDeepVariantExamplesEqual(
      examples, list(tfrecord.read_tfrecords(golden_file)))

  if mode == 'calling':
    # Compare the example variants with the truth variants in the region.
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = variant_utils.sorted_variants(
        tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
    gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                num_shards)
    expected_gvcfs = list(
        tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
    # Despite the name, assertCountEqual checks that all elements match.
    self.assertCountEqual(gvcfs, expected_gvcfs)

  if (mode == 'training' and num_shards == 0 and
      labeler_algorithm != 'positional_labeler'):
    # The positional labeler doesn't track metrics, so don't try to read them
    # in when that's the mode.
    self.assertEqual(
        make_examples_core.read_make_examples_run_info(
            testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
        run_info.labeling_metrics)