def testGetNoneShapeFromEmptyExamplesPath(self, file_name_to_write,
                                          tfrecord_path_to_match):
  output_file = test_utils.test_tmpfile(file_name_to_write)
  io_utils.write_tfrecords([], output_file)
  self.assertIsNone(
      tf_utils.get_shape_from_examples_path(
          test_utils.test_tmpfile(tfrecord_path_to_match)))

def test_make_examples_runtime_by_region(self):
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.mode = 'calling'
  num_shards = 4
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  # Use same number of shards for profiling files as examples.
  output_prefix = test_utils.test_tmpfile('runtime_profile')
  FLAGS.runtime_by_region = output_prefix + '@{}'.format(num_shards)
  FLAGS.task = 2
  # Run make_examples with those FLAGS.
  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Sharded output ending in @4 becomes -00002-of-00004 for task 2.
  expected_output_path = output_prefix + '-0000{}-of-00004'.format(FLAGS.task)
  expected_columns = [
      'region', 'get reads', 'find candidates', 'make pileup images',
      'write outputs', 'num reads', 'num candidates', 'num examples'
  ]

  with gfile.Open(expected_output_path, 'r') as fin:
    header = fin.readline()
    column_names = header.strip().split('\t')
    self.assertEqual(expected_columns, column_names)
    non_header_lines = fin.readlines()
    self.assertLen(non_header_lines, 3)
    one_row = non_header_lines[0].strip().split('\t')
    self.assertEqual(len(one_row), len(column_names))
    self.assertGreater(int(one_row[5]), 0, msg='num reads > 0')
    self.assertGreater(int(one_row[6]), 0, msg='num candidates > 0')
    self.assertGreater(int(one_row[7]), 0, msg='num examples > 0')

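# NOTE: A minimal sketch of the sharded-filename convention the test above
# relies on: a spec like 'prefix@4' for task 2 resolves to
# 'prefix-00002-of-00004'. `_example_sharded_filename` is an illustrative
# helper, not an API from this codebase.
def _example_sharded_filename(spec, task_id):
  """Expands 'prefix@N' into the per-task name 'prefix-TTTTT-of-NNNNN'."""
  prefix, num_shards = spec.rsplit('@', 1)
  return '%s-%05d-of-%05d' % (prefix, task_id, int(num_shards))

# For example: _example_sharded_filename('runtime_profile@4', 2) yields
# 'runtime_profile-00002-of-00004', matching expected_output_path above.
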
def test_conversion_to_tfrecord_and_back(self, original_input_file):
  """Test conversion from a native file format to tfrecord.gz, then back."""
  input_path = test_utils.genomics_core_testdata(original_input_file)
  tfrecord_output_path = test_utils.test_tmpfile(original_input_file +
                                                 ".tfrecord.gz")
  native_output_path = test_utils.test_tmpfile(original_input_file)

  # Test conversion from native format to tfrecord.
  self._convert(input_path, tfrecord_output_path)

  # redacted
  if native_output_path.endswith(".sam"):
    raise unittest.SkipTest("SAM writing not yet supported")

  # Test conversion from tfrecord format back to native format. Ensure that
  # conversions where we would need a header, but don't have one from the
  # input, trigger an error message.
  if any(
      native_output_path.endswith(ext) for ext in FORMATS_REQUIRING_HEADER):
    with self.assertRaisesRegexp(
        converter.ConversionError,
        "Input file does not have a header, which is needed to construct "
        "output file"):
      self._convert(tfrecord_output_path, native_output_path)
  else:
    self._convert(tfrecord_output_path, native_output_path)

def test_prepare_inputs(self, filename_to_write, file_string_input):
  source_path = test_utils.test_tmpfile(filename_to_write)
  io_utils.write_tfrecords(self.examples, source_path)
  # file_string_input could be a comma-separated list. Add the prefix to all
  # of them, and join it back to a string.
  file_string_input = ','.join(
      [test_utils.test_tmpfile(f) for f in file_string_input.split(',')])

  with self.test_session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    ds = call_variants.prepare_inputs(file_string_input)
    _, variants, _ = data_providers.get_infer_batches(
        ds, model=self.model, batch_size=1)

    seen_variants = []
    try:
      while True:
        seen_variants.extend(sess.run(variants))
    except tf.errors.OutOfRangeError:
      pass

    self.assertItemsEqual(self.variants,
                          variant_utils.decode_variants(seen_variants))

def test_make_examples_training_end2end_with_alt_aligned_pileup(
    self, alt_align, expected_shape):
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.gvcf_gq_binsize = 5
  FLAGS.alt_aligned_pileup = alt_align  # This is the only input change.
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
  options = make_examples.default_options(add_flags=True)

  # Run make_examples with the flags above.
  make_examples_core.make_examples_runner(options)

  # Check the output for shape and against the golden file.
  if alt_align == 'rows':
    golden_file = _sharded(testdata.ALT_ALIGNED_ROWS_EXAMPLES)
  elif alt_align == 'diff_channels':
    golden_file = _sharded(testdata.ALT_ALIGNED_DIFF_CHANNELS_EXAMPLES)
  else:
    raise ValueError("Golden data doesn't exist for this alt_align option: "
                     '{}'.format(alt_align))

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=True)
  self.assertDeepVariantExamplesEqual(
      examples, list(tfrecord.read_tfrecords(golden_file)))
  # Pileup image should have 3 rows of height 100, so resulting height is 300.
  self.assertEqual(decode_example(examples[0])['image/shape'], expected_shape)

def _get_examples(use_confident_regions=False):
  # Either the 'confident_regions' or the 'regions' flag can be used to
  # constrain the set of candidates generated; both should produce the same
  # examples.
  bed_path = test_utils.test_tmpfile('vcf_candidate_importer.bed')
  with gfile.Open(bed_path, 'w') as fout:
    fout.write('\t'.join(['chr20', '10000000', '10001000']) + '\n')

  if use_confident_regions:
    FLAGS.confident_regions = bed_path
    FLAGS.regions = ''
  else:
    FLAGS.confident_regions = ''
    FLAGS.regions = bed_path

  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('vcf_candidate_importer.tfrecord'))
  FLAGS.mode = 'training'
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.variant_caller = 'vcf_candidate_importer'

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, None, options, verify_labels=False)
  return examples

def test_make_examples_end2end_vcf_candidate_importer(self, mode):
  FLAGS.variant_caller = 'vcf_candidate_importer'
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vcf_candidate_importer.{}.tfrecord'.format(mode)))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('vcf_candidate_importer.examples.{}.tfrecord'.format(mode)))
  FLAGS.mode = mode

  if mode == 'calling':
    golden_file = _sharded(
        testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES)
    FLAGS.proposed_variants = testdata.VCF_CANDIDATE_IMPORTER_VARIANTS
    # Adding the following flags to match how the testdata was created.
    FLAGS.regions = 'chr20:59,777,000-60,000,000'
    FLAGS.realign_reads = False
  else:
    golden_file = _sharded(
        testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES)
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, None, options, verify_labels=mode == 'training')
  self.assertDeepVariantExamplesEqual(
      examples, list(tfrecord.read_tfrecords(golden_file)))
  self.assertEqual(
      decode_example(examples[0])['image/shape'],
      [100, 221, dv_constants.PILEUP_NUM_CHANNELS])

def testGetShapeFromExamplesPath(self, file_name_to_write,
                                 tfrecord_path_to_match):
  example = example_pb2.Example()
  valid_shape = [1, 2, 3]
  example.features.feature['image/shape'].int64_list.value.extend(valid_shape)
  output_file = test_utils.test_tmpfile(file_name_to_write)
  io_utils.write_tfrecords([example], output_file)
  tf_utils.get_shape_from_examples_path(
      test_utils.test_tmpfile(tfrecord_path_to_match))

def testGlobListShardedFilePatterns(self, specs, expected_files):
  # First, create all expected_files so Glob will work later.
  expected_full_files = [
      test_utils.test_tmpfile(f, '') for f in expected_files
  ]
  # Create the full spec names. This one doesn't create the files.
  full_specs = ','.join(
      [test_utils.test_tmpfile(spec) for spec in specs.split(',')])
  self.assertEqual(
      sorted(set(expected_full_files)),
      io.glob_list_sharded_file_patterns(full_specs))

def test_call_end2end_empty_first_shard(self):
  # Get only up to 10 examples.
  examples = list(
      io_utils.read_tfrecords(
          testdata.GOLDEN_CALLING_EXAMPLES, max_records=10))
  empty_first_file = test_utils.test_tmpfile('empty_1st_shard-00000-of-00002')
  io_utils.write_tfrecords([], empty_first_file)
  second_file = test_utils.test_tmpfile('empty_1st_shard-00001-of-00002')
  io_utils.write_tfrecords(examples, second_file)
  self.assertCallVariantsEmitsNRecordsForRandomGuess(
      test_utils.test_tmpfile('empty_1st_shard@2'), len(examples))

def test_get_shape_from_examples_path(self, file_name_to_write,
                                      tfrecord_path_to_match):
  example = example_pb2.Example()
  valid_shape = [1, 2, 3]
  example.features.feature['image/shape'].int64_list.value.extend(valid_shape)
  output_file = test_utils.test_tmpfile(file_name_to_write)
  io_utils.write_tfrecords([example], output_file)
  ds = data_providers.DeepVariantDataSet(
      name='test_shape',
      source=test_utils.test_tmpfile(tfrecord_path_to_match),
      num_examples=1)
  self.assertEqual(valid_shape, ds.tensor_shape)

def test_reading_empty_input_should_raise_error(self):
  empty_shard_one = test_utils.test_tmpfile(
      'no_records.tfrecord-00000-of-00002')
  empty_shard_two = test_utils.test_tmpfile(
      'no_records.tfrecord-00001-of-00002')
  io_utils.write_tfrecords([], empty_shard_one)
  io_utils.write_tfrecords([], empty_shard_two)
  FLAGS.infile = test_utils.test_tmpfile('no_records.tfrecord@2')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.outfile = test_utils.test_tmpfile('no_records.vcf')
  with self.assertRaisesRegexp(ValueError, 'Cannot find any records in'):
    postprocess_variants.main(['postprocess_variants.py'])

def test_get_shape_from_examples_path(self, file_name_to_write,
                                      tfrecord_path_to_match):
  example = example_pb2.Example()
  valid_shape = [1, 2, 3]
  example.features.feature['image/shape'].int64_list.value.extend(valid_shape)
  output_file = test_utils.test_tmpfile(file_name_to_write)
  tfrecord.write_tfrecords([example], output_file)
  ds = data_providers.DeepVariantInput(
      mode=tf.estimator.ModeKeys.PREDICT,
      name='test_shape',
      input_file_spec=test_utils.test_tmpfile(tfrecord_path_to_match),
      num_examples=1)
  self.assertEqual(valid_shape, ds.tensor_shape)

def test_reading_sharded_input_with_empty_shards_does_not_crash(self):
  valid_variants = io_utils.read_tfrecords(
      testdata.GOLDEN_POSTPROCESS_INPUT,
      proto=deepvariant_pb2.CallVariantsOutput)
  empty_shard_one = test_utils.test_tmpfile(
      'reading_empty_shard.tfrecord-00000-of-00002')
  non_empty_shard_two = test_utils.test_tmpfile(
      'reading_empty_shard.tfrecord-00001-of-00002')
  io_utils.write_tfrecords([], empty_shard_one)
  io_utils.write_tfrecords(valid_variants, non_empty_shard_two)
  FLAGS.infile = test_utils.test_tmpfile('reading_empty_shard.tfrecord@2')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.outfile = test_utils.test_tmpfile('calls_reading_empty_shard.vcf')
  postprocess_variants.main(['postprocess_variants.py'])

def _test_dataset_config(filename, **kwargs):
  """Creates a DeepVariantDatasetConfig(**kwargs) and writes it to filename."""
  dataset_config_pbtext_filename = test_utils.test_tmpfile(filename)
  dataset_config = deepvariant_pb2.DeepVariantDatasetConfig(**kwargs)
  data_providers.write_dataset_config_to_pbtxt(
      dataset_config, dataset_config_pbtext_filename)
  return dataset_config_pbtext_filename

def test_call_end2end(self, compressed_inputs):
  FLAGS.infile = make_golden_dataset(compressed_inputs)
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.outfile = test_utils.test_tmpfile('calls.vcf')
  FLAGS.nonvariant_site_tfrecord_path = (
      testdata.GOLDEN_POSTPROCESS_GVCF_INPUT)
  FLAGS.gvcf_outfile = test_utils.test_tmpfile('gvcf_calls.vcf')
  postprocess_variants.main(['postprocess_variants.py'])

  self.assertEqual(
      tf.gfile.FastGFile(FLAGS.outfile).readlines(),
      tf.gfile.FastGFile(testdata.GOLDEN_POSTPROCESS_OUTPUT).readlines())
  self.assertEqual(
      tf.gfile.FastGFile(FLAGS.gvcf_outfile).readlines(),
      tf.gfile.FastGFile(testdata.GOLDEN_POSTPROCESS_GVCF_OUTPUT).readlines())

def setUp(self):
  super(TabixTest, self).setUp()
  self.input_file = test_utils.genomics_core_testdata('test_samples.vcf.gz')
  self.output_file = test_utils.test_tmpfile('test_samples.vcf.gz')
  shutil.copyfile(self.input_file, self.output_file)
  self.tbx_index_file = self.output_file + '.tbi'
  self.csi_index_file = self.output_file + '.csi'

def test_roundtrip(self,
                   expected_infos,
                   expected_fmt,
                   expected_fmt1,
                   expected_fmt2,
                   reader_excluded_info=None,
                   reader_excluded_format=None,
                   writer_excluded_info=None,
                   writer_excluded_format=None):
  expected_records = [
      record.format(info=info, fmt=expected_fmt, efmts1=e1, efmts2=e2)
      for record, info, e1, e2 in zip(self.record_format_strings,
                                      expected_infos, expected_fmt1,
                                      expected_fmt2)
  ]
  expected = self.header + ''.join(expected_records)
  with vcf.VcfReader(
      test_utils.genomics_core_testdata('test_py_roundtrip.vcf'),
      use_index=False,
      excluded_info_fields=reader_excluded_info,
      excluded_format_fields=reader_excluded_format) as reader:
    records = list(reader.iterate())

  output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
  with vcf.VcfWriter(
      output_path,
      header=reader.header,
      excluded_info_fields=writer_excluded_info,
      excluded_format_fields=writer_excluded_format) as writer:
    for record in records:
      writer.write(record)

  with open(output_path) as f:
    actual = f.read()
  self.assertEqual(actual, expected)

def test_prepare_inputs(self, filename, expand_to_file_pattern):
  source_path = test_utils.test_tmpfile(filename)
  io_utils.write_tfrecords(self.examples, source_path)
  if expand_to_file_pattern:
    # Transform foo@3 to foo-?????-of-00003.
    source_path = io_utils.NormalizeToShardedFilePattern(source_path)

  with self.test_session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    ds = call_variants.prepare_inputs(source_path)
    _, variants, _ = data_providers.get_infer_batches(
        ds, model=self.model, batch_size=1)

    seen_variants = []
    try:
      while True:
        seen_variants.extend(sess.run(variants))
    except tf.errors.OutOfRangeError:
      pass

    self.assertItemsEqual(self.variants,
                          variant_utils.decode_variants(seen_variants))

def test_round_trip_vcf(self, test_datum_name):
  # Round-trip variants through writing and reading:
  # 1. Read variants v1 from VcfReader;
  # 2. Write v1 to vcf using our VcfWriter;
  # 3. Read back in using VcfReader -- v2;
  # 4. Compare v1 and v2.
  in_file = test_utils.genomics_core_testdata(test_datum_name)
  out_file = test_utils.test_tmpfile('output_' + test_datum_name)

  v1_reader = vcf.VcfReader(in_file, use_index=False)
  v1_records = list(v1_reader.iterate())
  self.assertTrue(v1_records, 'Reader failed to find records')

  header = copy.deepcopy(v1_reader.header)
  writer_options = variants_pb2.VcfWriterOptions()
  with vcf_writer.VcfWriter.to_file(out_file, header,
                                    writer_options) as writer:
    for record in v1_records:
      writer.write(record)

  v2_reader = vcf.VcfReader(out_file, use_index=False)
  v2_records = list(v2_reader.iterate())
  self.assertEqual(v1_records, v2_records,
                   'Round-tripped variants not as expected')

def testModelShapes(self):
  # Builds a graph.
  v0 = tf.Variable([[1, 2, 3], [4, 5, 6]], dtype=tf.float32, name='v0')
  v1 = tf.Variable([[[1], [2]], [[3], [4]], [[5], [6]]],
                   dtype=tf.float32,
                   name='v1')
  init_all_op = tf.initialize_all_variables()
  save = tf.train.Saver({'v0': v0, 'v1': v1})
  save_path = test_utils.test_tmpfile('ckpt_for_debug_string')
  with tf.Session() as sess:
    sess.run(init_all_op)
    # Saves a checkpoint.
    save.save(sess, save_path)

    # Model shapes without any variable requests gives you all variables.
    self.assertEqual({
        'v0': (2, 3),
        'v1': (3, 2, 1)
    }, tf_utils.model_shapes(save_path))
    # Asking for v0 gives you only v0's shape.
    self.assertEqual({'v0': (2, 3)},
                     tf_utils.model_shapes(save_path, ['v0']))
    # Asking for v1 gives you only v1's shape.
    self.assertEqual({'v1': (3, 2, 1)},
                     tf_utils.model_shapes(save_path, ['v1']))
    # Verifies model_shapes() fails for non-existent tensors.
    with self.assertRaisesRegexp(KeyError, 'v3'):
      tf_utils.model_shapes(save_path, ['v3'])

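# NOTE: A plausible sketch of what tf_utils.model_shapes could do under the
# hood, based only on the public tf.train.NewCheckpointReader API from TF 1.x;
# the real helper may be implemented differently, and the `requested`
# filtering shown here is an assumption matching the behavior tested above.
def _example_model_shapes(checkpoint_path, requested=None):
  """Returns {variable_name: shape_tuple} for variables in a checkpoint."""
  reader = tf.train.NewCheckpointReader(checkpoint_path)
  shape_map = reader.get_variable_to_shape_map()
  names = requested if requested is not None else shape_map.keys()
  # A missing name raises KeyError, consistent with the 'v3' case above.
  return {name: tuple(shape_map[name]) for name in names}
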
def test_call_variants_runs_on_gpus(self, model):
  call_variants.call_variants(
      examples_filename=testdata.GOLDEN_CALLING_EXAMPLES,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=model,
      execution_hardware='accelerator',
      output_file=test_utils.test_tmpfile('zzz.tfrecord'))

def testModelNumClasses(self):
  # Builds a graph.
  class_variable_name = 'class_variable_name'
  v0 = tf.Variable([[1, 2, 3]], dtype=tf.int32, name='class_variable_name')
  v1 = tf.Variable([[[1], [2]], [[3], [4]], [[5], [6]]],
                   dtype=tf.float32,
                   name='v1')
  init_all_op = tf.initialize_all_variables()
  save = tf.train.Saver({class_variable_name: v0, 'v1': v1})
  save_path = test_utils.test_tmpfile('ckpt_for_debug_classes')
  with tf.Session() as sess:
    sess.run(init_all_op)
    # Saves a checkpoint.
    save.save(sess, save_path)

    # If you pass in the correct class_variable_name, you'll find the number
    # of classes.
    self.assertEqual(
        3, tf_utils.model_num_classes(save_path, class_variable_name))
    # If the class variable name doesn't exist in the checkpoint, return None.
    self.assertEqual(
        None, tf_utils.model_num_classes(save_path, 'non-existent-var'))
    # If the checkpoint doesn't exist, return None.
    self.assertEqual(
        None, tf_utils.model_num_classes(None, class_variable_name))

def setUp(self):
  self.out_fname = test_utils.test_tmpfile('output.vcf')
  self.header = variants_pb2.VcfHeader(
      contigs=[
          reference_pb2.ContigInfo(name='Chr1', n_bases=50, pos_in_fasta=0),
          reference_pb2.ContigInfo(name='Chr2', n_bases=25, pos_in_fasta=1),
      ],
      sample_names=['Fido', 'Spot'],
      formats=[
          variants_pb2.VcfFormatInfo(
              id='GT', number='1', type='String', description='Genotype'),
          variants_pb2.VcfFormatInfo(
              id='GQ',
              number='1',
              type='Float',
              description='Genotype Quality')
      ],
  )
  self.options = variants_pb2.VcfWriterOptions()
  self.writer = vcf_writer.VcfWriter.to_file(self.out_fname, self.header,
                                             self.options)
  self.variant = test_utils.make_variant(
      chrom='Chr1',
      start=10,
      alleles=['A', 'C'],
  )
  self.variant.calls.extend([
      variants_pb2.VariantCall(genotype=[0, 0], call_set_name='Fido'),
      variants_pb2.VariantCall(genotype=[0, 1], call_set_name='Spot'),
  ])

def test_call_variants_with_empty_input(self):
  source_path = test_utils.test_tmpfile('empty.tfrecord')
  io_utils.write_tfrecords([], source_path)
  # Make sure that prepare_inputs doesn't crash on empty input.
  call_variants.prepare_inputs(
      source_path, modeling.get_model('random_guess'), batch_size=1)

def test_round_trip_vcf(self, test_datum_name):
  # Round-trip variants through writing and reading:
  # 1. Read variants v1 from VcfReader;
  # 2. Write v1 to vcf using our VcfWriter;
  # 3. Read back in using VcfReader -- v2;
  # 4. Compare v1 and v2.
  in_file = test_utils.genomics_core_testdata(test_datum_name)
  out_file = test_utils.test_tmpfile('output_' + test_datum_name)

  v1_reader = vcf.VcfReader(in_file)
  v1_records = list(v1_reader.iterate())
  self.assertTrue(v1_records, 'Reader failed to find records')

  header = copy.deepcopy(v1_reader.header)
  writer_options = variants_pb2.VcfWriterOptions()
  with vcf_writer.VcfWriter.to_file(out_file, header,
                                    writer_options) as writer:
    for record in v1_records:
      writer.write(record)

  v2_reader = vcf.VcfReader(out_file)
  v2_records = list(v2_reader.iterate())
  self.assertEqual(v1_records, v2_records,
                   'Round-tripped variants not as expected')

def test_roundtrip(self,
                   expected_infos,
                   expected_fmt,
                   expected_fmt1,
                   expected_fmt2,
                   reader_excluded_info=None,
                   reader_excluded_format=None,
                   writer_excluded_info=None,
                   writer_excluded_format=None):
  expected_records = [
      record.format(info=info, fmt=expected_fmt, efmts1=e1, efmts2=e2)
      for record, info, e1, e2 in zip(self.record_format_strings,
                                      expected_infos, expected_fmt1,
                                      expected_fmt2)
  ]
  expected = self.header + ''.join(expected_records)
  with vcf.VcfReader(
      test_utils.genomics_core_testdata('test_py_roundtrip.vcf'),
      excluded_info_fields=reader_excluded_info,
      excluded_format_fields=reader_excluded_format) as reader:
    records = list(reader.iterate())

  output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
  with vcf.VcfWriter(
      output_path,
      header=reader.header,
      excluded_info_fields=writer_excluded_info,
      excluded_format_fields=writer_excluded_format) as writer:
    for record in records:
      writer.write(record)

  with open(output_path) as f:
    actual = f.read()
  self.assertEqual(actual, expected)

def setUp(self):
  writer_options = fastq_pb2.FastqWriterOptions()
  out_fname = test_utils.test_tmpfile('output.fastq')
  self.writer = fastq_writer.FastqWriter.to_file(out_fname, writer_options)
  self.expected_fastq_content = [
      '@NODESC:header\n',
      'GATTACA\n',
      '+\n',
      'BB>B@FA\n',
      '@M01321:49:000000000-A6HWP:1:1101:17009:2216 1:N:0:1\n',
      'CGTTAGCGCAGGGGGCATCTTCACACTGGTGACAGGTAACCGCCGTAGTAAAGGTTCCGCCTTTCACT\n',
      '+\n',
      'AAAAABF@BBBDGGGG?FFGFGHBFBFBFABBBHGGGFHHCEFGGGGG?FGFFHEDG3EFGGGHEGHG\n',
      '@FASTQ contains multiple spaces in description\n',
      'CGGCTGGTCAGGCTGACATCGCCGCCGGCCTGCAGCGAGCCGCTGC\n',
      '+\n',
      'FAFAF;F/9;.:/;999B/9A.DFFF;-->.AAB/FC;9-@-=;=.\n',
      '@FASTQ_with_trailing_space\n',
      'CGG\n',
      '+\n',
      'FAD\n',
  ]
  self.record = fastq_pb2.FastqRecord(
      id='ID', description='desc', sequence='ACGTAC', quality='ABCDEF')

def test_realigner_diagnostics(self, enabled, emit_reads):
  # Make sure that by default we aren't emitting any diagnostic outputs.
  dx_dir = test_utils.test_tmpfile('dx_enabled{}_emitreads_{}'.format(
      enabled, emit_reads))
  region_str = 'chr20:10046178-10046188'
  region = ranges.parse_literal(region_str)
  assembled_region_str = 'chr20:10046096-10046267'
  reads, header = _get_reads_and_header(region)
  self.config = realigner.realigner_config(FLAGS)
  self.config.diagnostics.enabled = enabled
  self.config.diagnostics.output_root = dx_dir
  self.config.diagnostics.emit_realigned_reads = emit_reads
  self.reads_realigner = realigner.Realigner(self.config, self.ref_reader,
                                             header)
  _, _ = self.reads_realigner.realign_reads(reads, region)
  self.reads_realigner.diagnostic_logger.close()  # Force close all resources.

  if not enabled:
    # Make sure our diagnostic output isn't emitted.
    self.assertFalse(tf.io.gfile.exists(dx_dir))
  else:
    # Our root directory exists.
    self.assertTrue(tf.io.gfile.isdir(dx_dir))

    # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
    metrics_file = os.path.join(
        dx_dir, self.reads_realigner.diagnostic_logger.metrics_filename)
    self.assertTrue(tf.io.gfile.exists(metrics_file))
    with tf.io.gfile.GFile(metrics_file) as fin:
      rows = list(csv.DictReader(fin))
      self.assertLen(rows, 1)
      self.assertEqual(
          set(rows[0].keys()), {'window', 'k', 'n_haplotypes', 'time'})
      self.assertEqual(rows[0]['window'], assembled_region_str)
      self.assertEqual(int(rows[0]['k']), 25)
      self.assertEqual(int(rows[0]['n_haplotypes']), 2)
      # Check that our runtime is reasonable (greater than 0, less than 10 s).
      self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

    # As does the subdirectory for this region.
    region_subdir = os.path.join(dx_dir, assembled_region_str)
    self.assertTrue(tf.io.gfile.isdir(region_subdir))

    # We always have a graph.dot.
    self.assertTrue(
        tf.io.gfile.exists(
            os.path.join(
                region_subdir,
                self.reads_realigner.diagnostic_logger.graph_filename)))

    reads_file = os.path.join(
        dx_dir, region_str,
        self.reads_realigner.diagnostic_logger.realigned_reads_filename)
    # If emit_reads=False then the file should not exist, and vice versa.
    self.assertEqual(emit_reads, tf.io.gfile.exists(reads_file))

def _run():
  call_variants.call_variants(
      examples_filename=testdata.GOLDEN_CALLING_EXAMPLES,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=self.model,
      execution_hardware=hardware_env,
      max_batches=1,
      batch_size=1,
      output_file=test_utils.test_tmpfile('zzz.tfrecord'))

def test_call_variants_with_invalid_format(self, model, bad_format):
  # Read one good record from a valid file.
  example = next(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))
  # Overwrite the image/format field to be an invalid value
  # (anything but 'raw').
  example.features.feature['image/format'].bytes_list.value[0] = bad_format
  source_path = test_utils.test_tmpfile('make_examples_output.tfrecord')
  io_utils.write_tfrecords([example], source_path)
  outfile = test_utils.test_tmpfile('call_variants_invalid_format.tfrecord')

  with self.assertRaises(ValueError):
    call_variants.call_variants(
        examples_filename=source_path,
        checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
        model=model,
        output_file=outfile,
        batch_size=1,
        max_batches=1)

def test_roundtrip_writer(self, filename):
  output_path = test_utils.test_tmpfile(filename)
  original_reader = sam.SamReader(test_utils.genomics_core_testdata(filename))
  original_records = list(original_reader.iterate())
  with sam.SamWriter(output_path, header=original_reader.header) as writer:
    for record in original_records:
      writer.write(record)

  with sam.SamReader(output_path) as new_reader:
    self.assertEqual(original_records, list(new_reader.iterate()))

def test_make_examples_with_allele_frequency(self, mode):
  FLAGS.mode = 'calling'
  FLAGS.ref = testdata.GRCH38_FASTA
  FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
  num_shards = 1
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  region = ranges.parse_literal('chr20:61001-62000')
  FLAGS.use_allele_frequency = True
  FLAGS.regions = [ranges.to_literal(region)]
  if mode == 'one vcf':
    FLAGS.population_vcfs = testdata.AF_VCF_CHR20_AND_21
  elif mode == 'two vcfs':
    FLAGS.population_vcfs = ' '.join(
        [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])
  else:
    raise ValueError('Invalid mode for parameterized test.')
  options = make_examples.default_options(add_flags=True)

  # Run make_examples with the flags above.
  make_examples_core.make_examples_runner(options)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=False)

  # Pileup images should have one extra channel.
  self.assertEqual([100, 221, dv_constants.PILEUP_NUM_CHANNELS + 1],
                   decode_example(examples[0])['image/shape'])

  # Test there is something in the added channel. Values capture whether
  # each locus has been seen in the observed examples.
  population_matched_loci = {
      'chr20:61539_A': False,
      'chr20:61634_G': False,
      'chr20:61644_G': False
  }
  for example in examples:
    locus_id = vis.locus_id_from_variant(vis.variant_from_example(example))
    if locus_id in population_matched_loci:
      channels = vis.channels_from_example(example)
      self.assertGreater(
          np.sum(channels[dv_constants.PILEUP_NUM_CHANNELS]),
          0,
          msg='There should be something in the %s-th channel for variant %s'
          % (dv_constants.PILEUP_NUM_CHANNELS + 1, locus_id))
      population_matched_loci[locus_id] = True
  self.assertTrue(
      all(population_matched_loci.values()),
      msg='Check that all 3 sample loci appeared in the examples.')

  # Check against the golden file (same for both modes).
  golden_file = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
  examples_from_golden = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples_from_golden, examples)

def test_call_end2end_with_empty_shards(self):
  # Get only up to 10 examples.
  examples = list(
      io_utils.read_tfrecords(
          testdata.GOLDEN_CALLING_EXAMPLES, max_records=10))
  # Write to 15 shards, which means there will be multiple empty shards.
  source_path = test_utils.test_tmpfile('sharded@{}'.format(15))
  io_utils.write_tfrecords(examples, source_path)
  self.assertCallVariantsEmitsNRecordsForRandomGuess(source_path,
                                                     len(examples))

def test_roundtrip_writer(self, filename):
  output_path = test_utils.test_tmpfile(filename)
  with fastq.FastqWriter(output_path) as writer:
    for record in self.records:
      writer.write(record)

  with fastq.FastqReader(output_path) as reader:
    v2_records = list(reader.iterate())

  self.assertEqual(self.records, v2_records)

def test_catches_bad_flags(self):
  # Set all of the requested flag values.
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
  FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  # This is the bad flag.
  FLAGS.confident_regions = ''

  with mock.patch.object(logging, 'error') as mock_logging, \
      mock.patch.object(sys, 'exit') as mock_exit:
    make_examples.main(['make_examples.py'])
  mock_logging.assert_called_once_with(
      'confident_regions is required when in training mode.')
  mock_exit.assert_called_once_with(errno.ENOENT)

def make_golden_dataset(compressed_inputs=False):
  if compressed_inputs:
    source_path = test_utils.test_tmpfile('make_golden_dataset.tfrecord.gz')
    io_utils.write_tfrecords(
        io_utils.read_tfrecords(testdata.GOLDEN_TRAINING_EXAMPLES),
        source_path)
  else:
    source_path = testdata.GOLDEN_TRAINING_EXAMPLES
  return data_providers.DeepVariantDataSet(
      name='labeled_golden',
      source=source_path,
      num_examples=testdata.N_GOLDEN_TRAINING_EXAMPLES)

def test_call_variants_with_no_shape(self, model):
  # Read one good record from a valid file.
  example = next(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))
  # Remove image/shape.
  del example.features.feature['image/shape']
  source_path = test_utils.test_tmpfile('make_examples_out_noshape.tfrecord')
  io_utils.write_tfrecords([example], source_path)
  with self.assertRaisesRegexp(
      ValueError,
      'Invalid image/shape: we expect to find an image/shape '
      'field with length 3.'):
    call_variants.prepare_inputs(source_path, model, batch_size=1)

def make_golden_dataset(compressed_inputs=False):
  if compressed_inputs:
    source_path = test_utils.test_tmpfile(
        'golden.postprocess_single_site_input.tfrecord.gz')
    io_utils.write_tfrecords(
        io_utils.read_tfrecords(
            testdata.GOLDEN_POSTPROCESS_INPUT,
            proto=deepvariant_pb2.CallVariantsOutput), source_path)
  else:
    source_path = testdata.GOLDEN_POSTPROCESS_INPUT
  return source_path

def _parse_read_with_aux_tags(self, tag_string):
  # Minimal header line to create a valid SAM file.
  header_lines = '@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\n'
  # A single stock read we'll add our AUX fields to.
  read = 'read_name\t0\tchr1\t1\t0\t3M\t*\t0\t0\tCCC\tAAA\t' + tag_string
  path = test_utils.test_tmpfile('aux_tags.bam')
  with gfile.FastGFile(path, 'w') as fout:
    fout.write(header_lines)
    fout.write(read + '\n')
  with sam.SamReader(path, use_index=False, parse_aux_fields=True) as reader:
    return list(reader.iterate())

def test_catches_bad_argv(self):
  # Define valid flags to ensure the raise occurs due to argv issues.
  FLAGS.infile = make_golden_dataset(False)
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.outfile = test_utils.test_tmpfile('nonempty_outfile.vcf')
  with mock.patch.object(logging, 'error') as mock_logging, \
      mock.patch.object(sys, 'exit') as mock_exit:
    postprocess_variants.main(['postprocess_variants.py', 'extra_arg'])
  mock_logging.assert_called_once_with(
      'Command line parsing failure: postprocess_variants does not accept '
      'positional arguments but some are present on the command line: '
      '"[\'postprocess_variants.py\', \'extra_arg\']".')
  mock_exit.assert_called_once_with(errno.ENOENT)

def test_call_variants_non_accelerated_execution_runs(self,
                                                      execution_hardware):
  # This doesn't mock out the list_devices call so it's worth keeping
  # despite being very similar to the parameterized test below.
  outfile = test_utils.test_tmpfile('call_variants_cpu_only.tfrecord')
  call_variants.call_variants(
      examples_filename=testdata.GOLDEN_CALLING_EXAMPLES,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=self.model,
      execution_hardware=execution_hardware,
      max_batches=1,
      batch_size=1,
      output_file=outfile)

def assertCallVariantsEmitsNRecordsForRandomGuess(self, filename,
                                                  num_examples):
  outfile = test_utils.test_tmpfile('call_variants.tfrecord')
  model = modeling.get_model('random_guess')
  call_variants.call_variants(
      examples_filename=filename,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=model,
      output_file=outfile,
      batch_size=4,
      max_batches=None)
  call_variants_outputs = list(
      io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))
  # Check that we have the right number of output protos.
  self.assertEqual(len(call_variants_outputs), num_examples)

def test_make_read_writer_tfrecords(self):
  outfile = test_utils.test_tmpfile('test.tfrecord')
  writer = sam.SamWriter(outfile, header=self.header)

  # Test that the writer is a context manager and that we can write reads to
  # it.
  with writer:
    writer.write(self.read1)
    writer.write(self.read2)

  # Our output should have exactly the two reads we wrote in it.
  self.assertEqual([self.read1, self.read2],
                   list(io_utils.read_tfrecords(outfile,
                                                proto=reads_pb2.Read)))

def _run_tiny_training(self, model_name, dataset):
  with mock.patch(
      'deepvariant.data_providers.get_dataset') as mock_get_dataset:
    mock_get_dataset.return_value = dataset
    FLAGS.train_dir = test_utils.test_tmpfile(model_name)
    FLAGS.batch_size = 2
    FLAGS.model_name = model_name
    FLAGS.save_interval_secs = 0
    FLAGS.number_of_steps = 1
    FLAGS.dataset_config_pbtxt = '/path/to/mock.pbtxt'
    FLAGS.start_from_checkpoint = ''
    model_train.parse_and_run()
    # We have a checkpoint after training.
    mock_get_dataset.assert_called_once_with(FLAGS.dataset_config_pbtxt)
    self.assertIsNotNone(tf.train.latest_checkpoint(FLAGS.train_dir))

def test_reading_sharded_dataset(self, compressed_inputs):
  golden_dataset = make_golden_dataset(compressed_inputs)
  n_shards = 3
  sharded_path = test_utils.test_tmpfile('sharded@{}'.format(n_shards))
  io_utils.write_tfrecords(
      io_utils.read_tfrecords(golden_dataset.source), sharded_path)

  config_file = _test_dataset_config(
      'test_sharded.pbtxt',
      name='sharded_test',
      tfrecord_path=sharded_path,
      num_examples=golden_dataset.num_examples)

  self.assertDataSetExamplesMatchExpected(
      data_providers.get_dataset(config_file).get_slim_dataset(),
      golden_dataset)

def write_variant_to_tempfile(self, variant):
  output_path = test_utils.test_tmpfile('test.vcf')
  header = variants_pb2.VcfHeader(
      contigs=[reference_pb2.ContigInfo(name='20')],
      sample_names=[call.call_set_name for call in variant.calls],
      formats=[
          variants_pb2.VcfFormatInfo(
              id='DP', number='1', type='Integer', description='Read depth'),
          variants_pb2.VcfFormatInfo(
              id='AD',
              number='R',
              type='Integer',
              description='Read depth for each allele')
      ])
  writer = vcf.VcfWriter(output_path, header=header)
  with writer:
    writer.write(variant)
  return output_path

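# NOTE: An illustrative round-trip check for the helper above. The test name
# and the specific variant are hypothetical; it assumes vcf.VcfReader can
# iterate records from the file written by write_variant_to_tempfile, as it
# does elsewhere in these tests.
def test_written_variant_roundtrips(self):
  variant = test_utils.make_variant(chrom='20', start=10, alleles=['A', 'T'])
  path = self.write_variant_to_tempfile(variant)
  with vcf.VcfReader(path) as reader:
    self.assertLen(list(reader.iterate()), 1)
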
def test_prepare_inputs(self, filename, expand_to_file_pattern):
  source_path = test_utils.test_tmpfile(filename)
  io_utils.write_tfrecords(self.examples, source_path)
  if expand_to_file_pattern:
    # Transform foo@3 to foo-?????-of-00003.
    source_path = io_utils.NormalizeToShardedFilePattern(source_path)

  with self.test_session() as sess:
    _, variants, _ = call_variants.prepare_inputs(
        source_path, self.model, batch_size=1)
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())

    seen_variants = []
    try:
      while True:
        seen_variants.extend(sess.run(variants))
    except tf.errors.OutOfRangeError:
      pass

    self.assertItemsEqual(self.variants,
                          variant_utils.decode_variants(seen_variants))

def test_call_end2end_zero_record_file(self):
  zero_record_file = test_utils.test_tmpfile('zero_record_file')
  io_utils.write_tfrecords([], zero_record_file)
  self.assertCallVariantsEmitsNRecordsForRandomGuess(zero_record_file, 0)

def test_call_end2end(self, model, shard_inputs, include_debug_info):
  FLAGS.include_debug_info = include_debug_info
  examples = list(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

  if shard_inputs:
    # Create a sharded version of our golden examples.
    source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
    io_utils.write_tfrecords(examples, source_path)
  else:
    source_path = testdata.GOLDEN_CALLING_EXAMPLES

  batch_size = 4
  if model.name == 'random_guess':
    # For the random guess model we can run everything.
    max_batches = None
  else:
    # For all other models we only run a single batch for inference.
    max_batches = 1

  outfile = test_utils.test_tmpfile('call_variants.tfrecord')
  call_variants.call_variants(
      examples_filename=source_path,
      checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
      model=model,
      output_file=outfile,
      batch_size=batch_size,
      max_batches=max_batches)

  call_variants_outputs = list(
      io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

  # Check that we have the right number of output protos.
  self.assertEqual(
      len(call_variants_outputs),
      batch_size * max_batches if max_batches else len(examples))

  # Check that our CallVariantsOutput (CVO) have the following critical
  # properties:
  # - we have one CVO for each example we processed.
  # - the variant in the CVO is exactly what was in the example.
  # - the alt_allele_indices of the CVO match those of its corresponding
  #   example.
  # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
  # We can only do this test when processing all of the variants (max_batches
  # is None), since we processed all of the examples with that model.
  if max_batches is None:
    self.assertItemsEqual([cvo.variant for cvo in call_variants_outputs],
                          [tf_utils.example_variant(ex) for ex in examples])

  # Check the CVO debug_info: not filled if include_debug_info is False;
  # else, filled by logic based on CVO.
  if not include_debug_info:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info,
                       deepvariant_pb2.CallVariantsOutput.DebugInfo())
  else:
    for cvo in call_variants_outputs:
      self.assertEqual(cvo.debug_info.has_insertion,
                       variant_utils.has_insertion(cvo.variant))
      self.assertEqual(cvo.debug_info.has_deletion,
                       variant_utils.has_deletion(cvo.variant))
      self.assertEqual(cvo.debug_info.is_snp,
                       variant_utils.is_snp(cvo.variant))
      self.assertEqual(cvo.debug_info.predicted_label,
                       np.argmax(cvo.genotype_probabilities))

  def example_matches_call_variants_output(example, call_variants_output):
    return (tf_utils.example_variant(example) ==
            call_variants_output.variant and
            tf_utils.example_alt_alleles_indices(example) ==
            call_variants_output.alt_allele_indices.indices)

  for call_variants_output in call_variants_outputs:
    # Find all matching examples.
    matches = [
        ex for ex in examples
        if example_matches_call_variants_output(ex, call_variants_output)
    ]
    # We should have exactly one match.
    self.assertEqual(len(matches), 1)
    example = matches[0]
    # Check that we've faithfully copied in the alt alleles (though currently
    # as implemented we find our example using this information so it cannot
    # fail). Included here in case that changes in the future.
    self.assertEqual(
        list(tf_utils.example_alt_alleles_indices(example)),
        list(call_variants_output.alt_allele_indices.indices))
    # We should have exactly three genotype probabilities (assuming our
    # ploidy == 2).
    self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
    # These are probabilities so they should be between 0 and 1.
    self.assertTrue(
        all(0 <= gp <= 1
            for gp in call_variants_output.genotype_probabilities))

def test_make_examples_end2end(self, mode, num_shards,
                               labeler_algorithm=None):
  self.maxDiff = None
  self.assertIn(mode, {'calling', 'training'})
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  if mode == 'calling':
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    make_examples.make_examples_runner(options)

  # Test that our candidates are reasonable, calling specific helper
  # functions to check lots of properties of the output.
  candidates = sorted(
      io_utils.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  self.verify_variants(
      [call.variant for call in candidates], region, options, is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the output order to be
  # deterministic for both training and calling modes because we fix the
  # random seed.
  if mode == 'calling':
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  self.assertDeepVariantExamplesEqual(
      examples, list(io_utils.read_tfrecords(golden_file)))

  if mode == 'calling':
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = variant_utils.sorted_variants(
        io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
    gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                num_shards)
    expected_gvcfs = list(
        io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
    self.assertItemsEqual(gvcfs, expected_gvcfs)

def write_test_protos(self, filename):
  protos = [reference_pb2.ContigInfo(name=str(i)) for i in range(10)]
  path = test_utils.test_tmpfile(filename)
  io.write_tfrecords(protos, path)
  return protos, path

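# NOTE: An illustrative (hypothetical) use of the helper above, assuming the
# same `io` module exposes a matching read_tfrecords(path, proto=...) that
# deserializes records into the given proto type; the test name is made up.
def test_write_test_protos_roundtrip(self):
  protos, path = self.write_test_protos('contigs.tfrecord')
  read_back = list(io.read_tfrecords(path, proto=reference_pb2.ContigInfo))
  self.assertEqual(protos, read_back)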