def test_make_examples_training_end2end_with_alt_aligned_pileup(
    self, alt_align, expected_shape):
  """Runs training-mode make_examples with an alt-aligned pileup setting.

  Args:
    alt_align: Value assigned to FLAGS.alt_aligned_pileup. Golden data only
      exists for 'rows' and 'diff_channels'.
    expected_shape: Expected 'image/shape' of the first generated example.

  Raises:
    ValueError: If there is no golden file for `alt_align`.
  """
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  # Inputs.
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  # Outputs.
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  # Run configuration.
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.gvcf_gq_binsize = 5
  # The pileup layout is the only input that varies between parameterizations.
  FLAGS.alt_aligned_pileup = alt_align
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Pick the golden file that corresponds to the requested pileup layout.
  if alt_align not in ('rows', 'diff_channels'):
    raise ValueError("Golden data doesn't exist for this alt_align option: "
                     '{}'.format(alt_align))
  if alt_align == 'rows':
    golden_name = testdata.ALT_ALIGNED_ROWS_EXAMPLES
  else:
    golden_name = testdata.ALT_ALIGNED_DIFF_CHANNELS_EXAMPLES
  golden_file = _sharded(golden_name)

  # The generated examples must be valid, labeled, and match the golden set.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=True)
  golden_examples = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)

  # The image shape depends on the pileup layout, so it is supplied by the
  # test parameters.
  first_shape = decode_example(examples[0])['image/shape']
  self.assertEqual(first_shape, expected_shape)
def test_make_examples_runtime_by_region(self):
  """Checks the per-region runtime profile written by make_examples.

  Runs calling mode for a single task of a sharded job with
  FLAGS.runtime_by_region set, then verifies the header and rows of the
  tab-separated profile written for that task.
  """
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.mode = 'calling'
  num_shards = 4
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  # Use same number of shards for profiling files as examples.
  output_prefix = test_utils.test_tmpfile('runtime_profile')
  FLAGS.runtime_by_region = output_prefix + '@{}'.format(num_shards)
  FLAGS.task = 2

  # Run make_examples with those FLAGS.
  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # A sharded spec ending in @4 resolves to -00002-of-00004 for task 2. Both
  # fields are derived from the values configured above (zero-padded to five
  # digits) so the expectation stays right if they are ever changed.
  expected_output_path = '{}-{:05d}-of-{:05d}'.format(
      output_prefix, FLAGS.task, num_shards)
  expected_columns = [
      'region', 'get reads', 'find candidates', 'make pileup images',
      'write outputs', 'num reads', 'num candidates', 'num examples'
  ]

  with gfile.Open(expected_output_path, 'r') as fin:
    header = fin.readline()
    column_names = header.strip().split('\t')
    self.assertEqual(expected_columns, column_names)
    non_header_lines = fin.readlines()

  self.assertLen(non_header_lines, 3)
  one_row = non_header_lines[0].strip().split('\t')
  self.assertLen(one_row, len(column_names))

  # Look the counters up by column name instead of by position.
  row_by_column = dict(zip(column_names, one_row))
  self.assertGreater(int(row_by_column['num reads']), 0, msg='num reads > 0')
  self.assertGreater(
      int(row_by_column['num candidates']), 0, msg='num candidates > 0')
  self.assertGreater(
      int(row_by_column['num examples']), 0, msg='num examples > 0')
def test_make_examples_end2end_vcf_candidate_importer(self, mode):
  """End-to-end run of make_examples using the vcf_candidate_importer caller.

  Args:
    mode: Value for FLAGS.mode. 'calling' imports candidates from
      FLAGS.proposed_variants; any other value uses FLAGS.truth_variants and
      the training golden file.
  """
  FLAGS.variant_caller = 'vcf_candidate_importer'
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM

  candidates_name = 'vcf_candidate_importer.{}.tfrecord'.format(mode)
  FLAGS.candidates = test_utils.test_tmpfile(_sharded(candidates_name))
  examples_name = 'vcf_candidate_importer.examples.{}.tfrecord'.format(mode)
  FLAGS.examples = test_utils.test_tmpfile(_sharded(examples_name))
  FLAGS.mode = mode

  if mode != 'calling':
    golden_file = _sharded(
        testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES)
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  else:
    golden_file = _sharded(
        testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES)
    FLAGS.proposed_variants = testdata.VCF_CANDIDATE_IMPORTER_VARIANTS
    # These two flags mirror how the calling testdata was generated.
    FLAGS.regions = 'chr20:59,777,000-60,000,000'
    FLAGS.realign_reads = False

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # The examples must be valid (labels are only checked in training mode) and
  # match the golden set.
  check_labels = (mode == 'training')
  examples = self.verify_examples(
      FLAGS.examples, None, options, verify_labels=check_labels)
  golden_examples = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)

  first_shape = decode_example(examples[0])['image/shape']
  self.assertEqual(first_shape,
                   [100, 221, dv_constants.PILEUP_NUM_CHANNELS])
def _get_examples(use_confident_regions=False):
  """Generates training examples constrained by a one-interval BED file.

  The BED file is supplied through either FLAGS.confident_regions or
  FLAGS.regions; the other flag is set to the empty string. Both routes
  should constrain the set of candidates generated and therefore yield the
  same examples.

  Args:
    use_confident_regions: If truthy, pass the BED file as
      FLAGS.confident_regions; otherwise pass it as FLAGS.regions.

  Returns:
    The examples returned by self.verify_examples for this run.
  """
  bed_path = test_utils.test_tmpfile('vcf_candidate_importer.bed')
  bed_record = '\t'.join(['chr20', '10000000', '10001000']) + '\n'
  with gfile.Open(bed_path, 'w') as bed_file:
    bed_file.write(bed_record)

  # Exactly one of the two flags carries the BED path.
  FLAGS.confident_regions = bed_path if use_confident_regions else ''
  FLAGS.regions = '' if use_confident_regions else bed_path

  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('vcf_candidate_importer.tfrecord'))
  FLAGS.mode = 'training'
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.variant_caller = 'vcf_candidate_importer'

  run_options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(run_options)

  # Validate the variants in the examples; labels are not checked here.
  return self.verify_examples(
      FLAGS.examples, None, run_options, verify_labels=False)
def _get_examples(downsample_fraction=None):
  """Runs make_examples with the current FLAGS and returns verified examples.

  Args:
    downsample_fraction: If not None, assigned to FLAGS.downsample_fraction
      before the run; if None, that flag is left untouched.

  Returns:
    The examples returned by self.verify_examples (labels are not checked).
  """
  if downsample_fraction is not None:
    FLAGS.downsample_fraction = downsample_fraction

  run_options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(run_options)

  # `self` and `region` come from the enclosing test's scope.
  return self.verify_examples(
      FLAGS.examples, region, run_options, verify_labels=False)
def test_make_examples_with_allele_frequency(self, mode):
  """Checks that use_allele_frequency adds a populated extra pileup channel.

  Args:
    mode: 'one vcf' passes a single population VCF; 'two vcfs' passes two
      VCF paths joined by a space.

  Raises:
    ValueError: If `mode` is neither of the supported values.
  """
  FLAGS.mode = 'calling'
  FLAGS.ref = testdata.GRCH38_FASTA
  FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
  num_shards = 1
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  region = ranges.parse_literal('chr20:61001-62000')
  FLAGS.use_allele_frequency = True
  FLAGS.regions = [ranges.to_literal(region)]

  if mode == 'one vcf':
    population_vcfs = testdata.AF_VCF_CHR20_AND_21
  elif mode == 'two vcfs':
    population_vcfs = ' '.join([testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])
  else:
    raise ValueError('Invalid mode for parameterized test.')
  FLAGS.population_vcfs = population_vcfs

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # The variants in the examples must all be valid.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=False)

  # Pileup images should have one extra channel.
  self.assertEqual([100, 221, dv_constants.PILEUP_NUM_CHANNELS + 1],
                   decode_example(examples[0])['image/shape'])

  # The added channel must be non-empty for these loci. Each value records
  # whether the locus has been seen among the generated examples.
  population_matched_loci = {
      'chr20:61539_A': False,
      'chr20:61634_G': False,
      'chr20:61644_G': False
  }
  for example in examples:
    locus_id = vis.locus_id_from_variant(vis.variant_from_example(example))
    if locus_id not in population_matched_loci:
      continue
    channels = vis.channels_from_example(example)
    self.assertGreater(
        np.sum(channels[dv_constants.PILEUP_NUM_CHANNELS]),
        0,
        msg='There should be '
        'something in the %s-th channel for variant '
        '%s' % (dv_constants.PILEUP_NUM_CHANNELS + 1, locus_id))
    population_matched_loci[locus_id] = True

  self.assertTrue(
      all(population_matched_loci.values()),
      msg='Check that all '
      '3 sample loci appeared in the examples.')

  # Check against the golden file (same for both modes).
  golden_file = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
  examples_from_golden = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples_from_golden, examples)
def test_make_examples_end2end_failed_on_cram(self):
  """Expects a ValueError when CRAM input is read with use_ref_for_cram off."""
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  FLAGS.use_ref_for_cram = False
  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_CRAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('failed.vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('failed.examples.tfrecord'))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.gvcf_gq_binsize = 5

  options = make_examples.default_options(add_flags=True)

  # Only the runner call is expected to raise.
  expected_error = 'Failed to parse BAM/CRAM file.'
  with six.assertRaisesRegex(self, ValueError, expected_error):
    make_examples_core.make_examples_runner(options)
def main(argv=()):
  """Validates the command line, builds options, and runs make_examples.

  Args:
    argv: Command-line arguments. More than one element is reported through
      errors.log_and_raise as an errors.CommandLineError, because
      make_examples does not accept positional arguments.
  """
  with errors.clean_commandline_error_exit():
    if len(argv) > 1:
      message = (
          'Command line parsing failure: make_examples does not accept '
          'positional arguments but some are present on the command line: '
          '"{}".'.format(str(argv)))
      errors.log_and_raise(message, errors.CommandLineError)
    del argv  # Unused.

    proto_utils.uses_fast_cpp_protos_or_die()

    logging_level.set_from_flag()
    hts_level = hts_verbose.htsLogLevel[FLAGS.hts_logging_level]
    hts_verbose.set(hts_level)

    # Set up options; may do I/O.
    run_options = default_options(add_flags=True, flags_obj=FLAGS)
    check_options_are_valid(run_options)

    # Run!
    make_examples_core.make_examples_runner(run_options)
def test_make_examples_with_variant_selection(self,
                                              select_types,
                                              expected_count,
                                              keep_legacy_behavior=False):
  """Checks how many candidates are written for a variant-type selection.

  Args:
    select_types: Value for FLAGS.select_variant_types, or None to leave the
      flag untouched.
    expected_count: Expected number of records in the candidates output.
    keep_legacy_behavior: Value for
      FLAGS.keep_legacy_allele_counter_behavior.
  """
  if select_types is not None:
    FLAGS.select_variant_types = select_types

  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.keep_legacy_allele_counter_behavior = keep_legacy_behavior

  run_options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(run_options)

  # Count the candidate records that were written.
  written_candidates = list(tfrecord.read_tfrecords(FLAGS.candidates))
  self.assertLen(written_candidates, expected_count)
def test_make_examples_end2end_failed_on_mismatched_multi_bam(self):
  """Expects a ValueError when the input BAMs do not both cover the region."""
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  # Two comma-separated BAM inputs.
  bam_inputs = [testdata.CHR20_BAM, testdata.NOCHR20_BAM]
  FLAGS.reads = ','.join(bam_inputs)
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('mismatched_multi_bam.vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('mismatched_multi_bam.examples.tfrecord'))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.gvcf_gq_binsize = 5

  options = make_examples.default_options(add_flags=True)

  # The pattern below shows what the expected error message looks like.
  expected_error = ('NOT_FOUND: Unknown reference_name '
                    'reference_name: "chr20" start: 9999999 end: 10000999')
  with six.assertRaisesRegex(self, ValueError, expected_error):
    make_examples_core.make_examples_runner(options)
def test_make_examples_training_end2end_with_customized_classes_labeler(self):
  """Runs training mode with the customized_classes_labeler and golden-checks.

  The labeler is configured with the class list 'ref,class1,class2' and the
  INFO field name 'type', using a truth VCF that carries type annotations.
  """
  # Labeler configuration.
  FLAGS.labeler_algorithm = 'customized_classes_labeler'
  FLAGS.customized_classes_labeler_classes_list = 'ref,class1,class2'
  FLAGS.customized_classes_labeler_info_field_name = 'type'

  # Inputs and outputs.
  region = ranges.parse_literal('chr20:10,000,000-10,004,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.gvcf_gq_binsize = 5
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF_WITH_TYPES
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  run_options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(run_options)

  golden_file = _sharded(testdata.CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES)

  # The examples must be valid, labeled, and match the golden set.
  examples = self.verify_examples(
      FLAGS.examples, region, run_options, verify_labels=True)
  golden_examples = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)
def test_make_examples_end2end(self,
                               mode,
                               num_shards,
                               test_condition=TestConditions.USE_BAM,
                               labeler_algorithm=None,
                               use_fast_pass_aligner=True):
  """End-to-end golden test of make_examples across modes, shards and inputs.

  Runs make_examples once per task, checks the run_info written by each run,
  then validates candidates and examples against the golden files. In calling
  mode the gVCF output and concordance with the truth VCF are checked too.

  Args:
    mode: 'calling' or 'training'; assigned to FLAGS.mode.
    num_shards: Number of output shards. One run is performed per task in
      range(max(num_shards, 1)).
    test_condition: A TestConditions value selecting the reads input (single
      BAM, CRAM, or two comma-joined BAMs).
    labeler_algorithm: If not None, assigned to FLAGS.labeler_algorithm.
    use_fast_pass_aligner: Assigned to FLAGS.use_fast_pass_aligner.
  """
  self.assertIn(mode, {'calling', 'training'})
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  # Select the reads input for the requested test condition.
  if test_condition == TestConditions.USE_BAM:
    FLAGS.reads = testdata.CHR20_BAM
  elif test_condition == TestConditions.USE_CRAM:
    FLAGS.reads = testdata.CHR20_CRAM
  elif test_condition == TestConditions.USE_MULTI_BAMS:
    FLAGS.reads = ','.join(
        [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  # Calling mode writes a gVCF; training mode needs truth data instead.
  if mode == 'calling':
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  # Run once per task. `options` and `run_info` from the last iteration are
  # reused by the checks after the loop.
  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    # We need to overwrite bam_fname for USE_CRAM test since Golden Set
    # generated from BAM file. BAM filename is stored in candidates. If we
    # don't overwrite default_options variants won't match and test fail.
    options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
    make_examples_core.make_examples_runner(options)

    # Check that our run_info proto contains the basic fields we'd expect:
    # (a) our options are written to the run_info.options field.
    run_info = make_examples_core.read_make_examples_run_info(
        options.run_info_filename)
    self.assertEqual(run_info.options, options)
    # (b) run_info.resource_metrics is present and contains our hostname.
    self.assertTrue(run_info.HasField('resource_metrics'))
    self.assertEqual(run_info.resource_metrics.host_name, platform.node())

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = sorted(
      tfrecord.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  self.verify_variants([call.variant for call in candidates],
                       region,
                       options,
                       is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if mode == 'calling':
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  self.assertDeepVariantExamplesEqual(
      examples, list(tfrecord.read_tfrecords(golden_file)))

  if mode == 'calling':
    # Compare the called variants with the truth VCF over the test region.
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = variant_utils.sorted_variants(
        tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
    gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                num_shards)
    expected_gvcfs = list(
        tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
    # Despite the name, assertCountEqual checks that all elements match.
    self.assertCountEqual(gvcfs, expected_gvcfs)

  if (mode == 'training' and num_shards == 0 and
      labeler_algorithm != 'positional_labeler'):
    # The positional labeler doesn't track metrics, so don't try to read them
    # in when that's the mode.
    self.assertEqual(
        make_examples_core.read_make_examples_run_info(
            testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
        run_info.labeling_metrics)