def test_make_tfrecord_options_with_bad_inputs(self, filenames):
    """Checks that make_tfrecord_options rejects the given filenames.

    The call must raise a ValueError whose message matches the
    "Incorrect value" text built from the comma-joined filenames.

    Args:
      filenames: sequence of str file names passed to
        io.make_tfrecord_options.
    """
    joined_names = ','.join(filenames)
    # The message is used as a regular expression by assertRaisesRegexp.
    expected_message = (
        'Incorrect value: {}. Filenames need to be all of the same type: '
        'either all with .gz or all without .gz'.format(joined_names))
    with self.assertRaisesRegexp(ValueError, expected_message):
        io.make_tfrecord_options(filenames)
def compression_type_of_files(files):
    """Returns the compression type name for a list of TFRecord files.

    Args:
      files: file names handed to io_utils.make_tfrecord_options.

    Returns:
      The string 'GZIP' when the derived reader options use GZIP
      compression, otherwise None.
    """
    options = io_utils.make_tfrecord_options(files)
    uses_gzip = (
        options.compression_type == tf.python_io.TFRecordCompressionType.GZIP)
    return 'GZIP' if uses_gzip else None
def assertDataSetExamplesMatchExpected(self, dataset, expected_dataset):
    """Asserts that a slim provider yields expected_dataset's loci in order.

    Reads expected_dataset.num_examples items through an unshuffled slim
    DatasetDataProvider, collects the 'locus' value of each, and compares
    them with the 'locus' feature of the examples returned by
    io_utils.read_tfrecords for the same source. Also checks the dataset's
    tensor_shape against the golden value.

    Args:
      dataset: not read by this helper.
        NOTE(review): possibly intended to be the dataset under test --
        confirm against callers.
      expected_dataset: dataset object providing get_slim_dataset(), source,
        num_examples and tensor_shape.
    """
    fetch_keys = ['image', 'label', 'locus']
    seen = []
    with tf.Session() as sess:
        slim_dataset = expected_dataset.get_slim_dataset()
        reader_options = io_utils.make_tfrecord_options(
            expected_dataset.source)
        provider = slim.dataset_data_provider.DatasetDataProvider(
            slim_dataset,
            shuffle=False,
            reader_kwargs={'options': reader_options})
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        image, label, locus = provider.get(fetch_keys)
        # All three tensors are fetched together; only the locus is kept.
        for _ in range(expected_dataset.num_examples):
            fetched = sess.run([image, label, locus])
            seen.append(fetched[2])
        coord.request_stop()
        coord.join(threads)

    expected_loci = []
    for example in io_utils.read_tfrecords(expected_dataset.source):
        locus_feature = example.features.feature['locus']
        expected_loci.append(locus_feature.bytes_list.value[0])

    self.assertEqual(len(expected_loci), expected_dataset.num_examples)
    self.assertEqual(expected_loci, seen)
    # Note that this expected shape comes from the golden dataset. If the data
    # is remade in the future, the values might need to be modified accordingly.
    golden_shape = [100, 221, pileup_image.DEFAULT_NUM_CHANNEL]
    self.assertEqual(golden_shape, expected_dataset.tensor_shape)
def assertDataSetExamplesMatchExpected(self, dataset, expected_dataset):
    """Asserts that a slim provider yields expected_dataset's loci in order.

    Reads expected_dataset.num_examples items through an unshuffled slim
    DatasetDataProvider, collects the 'locus' value of each, and compares
    them with the 'locus' feature of the examples returned by
    io_utils.read_tfrecords for the same source. Also checks the dataset's
    tensor_shape against the golden value.

    Args:
      dataset: not read by this helper.
        NOTE(review): possibly intended to be the dataset under test --
        confirm against callers.
      expected_dataset: dataset object providing get_slim_dataset(), source,
        num_examples and tensor_shape.
    """
    fetch_keys = ['image', 'label', 'locus']
    seen = []
    with tf.Session() as sess:
        slim_dataset = expected_dataset.get_slim_dataset()
        reader_options = io_utils.make_tfrecord_options(
            expected_dataset.source)
        provider = slim.dataset_data_provider.DatasetDataProvider(
            slim_dataset,
            shuffle=False,
            reader_kwargs={'options': reader_options})
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        image, label, locus = provider.get(fetch_keys)
        # All three tensors are fetched together; only the locus is kept.
        for _ in range(expected_dataset.num_examples):
            fetched = sess.run([image, label, locus])
            seen.append(fetched[2])
        coord.request_stop()
        coord.join(threads)

    expected_loci = []
    for example in io_utils.read_tfrecords(expected_dataset.source):
        locus_feature = example.features.feature['locus']
        expected_loci.append(locus_feature.bytes_list.value[0])

    self.assertEqual(len(expected_loci), expected_dataset.num_examples)
    self.assertEqual(expected_loci, seen)
    # Note that this expected shape comes from the golden dataset. If the data
    # is remade in the future, the values might need to be modified accordingly.
    golden_shape = [100, 221, pileup_image.DEFAULT_NUM_CHANNEL]
    self.assertEqual(golden_shape, expected_dataset.tensor_shape)
def prepare_inputs(source_path, model, batch_size, num_readers=None):
    """Prepares image, encoded_variant and alt-allele-indices ops.

    Builds a tf.data pipeline over the TFRecord files matching source_path.
    Each serialized tf.Example is parsed into its 'image/encoded',
    'variant/encoded' and 'alt_allele_indices/encoded' string features. The
    image bytes are decoded with tf.decode_raw as uint8, reshaped to the
    shape reported by tf_utils.get_shape_from_examples_path and passed
    through model.preprocess_image. The dataset is not repeated, so every
    example in source_path is read once.

    Args:
      source_path: Path to TFRecord file(s) containing deepvariant tf.Example
        protos. The path is normalized with
        io_utils.NormalizeToShardedFilePattern and globbed, and the matching
        files must either all end in .gz or none of them (see
        io_utils.make_tfrecord_options).
      model: A DeepVariantModel whose preprocess_image function will be used
        on image.
      batch_size: int > 0. Size of batches to use during inference.
      num_readers: int > 0 or None. Number of parallel calls used when
        mapping the parse and preprocess functions over the dataset. If
        falsy (None or 0), uses FLAGS.num_readers instead.

    Returns:
      A tuple of (image, encoded_variant, encoded_alt_allele_indices) TF ops,
      each produced by the batched dataset iterator. encoded_variant is a
      tf.string tensor containing serialized Variant protos describing the
      variant calls associated with the images. encoded_alt_allele_indices is
      a tf.string tensor containing serialized
      CallVariantsOutput.AltAlleleIndices protos with the alternate allele
      indices used as "alt" when constructing the images.
    """
    if not num_readers:
        num_readers = FLAGS.num_readers

    tensor_shape = tf_utils.get_shape_from_examples_path(source_path)

    def _parse_single_example(serialized_example):
        """Parses serialized example into a dictionary of features."""
        return tf.parse_single_example(
            serialized_example,
            features={
                'image/encoded': tf.FixedLenFeature([], tf.string),
                'variant/encoded': tf.FixedLenFeature([], tf.string),
                # deepvariant_pb2.CallVariantsOutput.AltAlleleIndices
                'alt_allele_indices/encoded':
                    tf.FixedLenFeature([], tf.string),
            })

    with tf.name_scope('input'):

        def _preprocess_image(features):
            """Decodes, reshapes and applies model-specific preprocessing."""
            image = features['image/encoded']
            # Bypassing the reshaping and preprocessing if there is no
            # tensor_shape. Currently that could happen when the input file
            # is empty.
            if tensor_shape:
                image = tf.reshape(
                    tf.decode_raw(image, tf.uint8), tensor_shape)
                image = model.preprocess_image(image)
            features['image/encoded'] = image
            return features

        files = tf.gfile.Glob(
            io_utils.NormalizeToShardedFilePattern(source_path))
        reader_options = io_utils.make_tfrecord_options(files)
        if reader_options.compression_type == (
                tf.python_io.TFRecordCompressionType.GZIP):
            compression_type = 'GZIP'
        else:
            compression_type = None

        dataset = tf.data.TFRecordDataset(
            files, compression_type=compression_type)
        # Use the resolved num_readers so that an explicit argument is
        # honoured; previously FLAGS.num_readers was always used here and
        # the parameter had no effect.
        dataset = dataset.map(
            _parse_single_example, num_parallel_calls=num_readers)
        dataset = dataset.map(
            _preprocess_image, num_parallel_calls=num_readers)
        dataset = dataset.prefetch(10 * batch_size)
        dataset = dataset.batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()
        return (features['image/encoded'], features['variant/encoded'],
                features['alt_allele_indices/encoded'])
def make_batches(dataset, model, batch_size, mode):
    """Provides batches of pileup images from this dataset.

    Builds a slim DatasetDataProvider for dataset, pulls the "image",
    "label" and "variant" items from it, runs each image through
    model.preprocess_image() and batches the three tensors. TRAIN mode uses
    a shuffling batch queue; EVAL mode reads with a single reader, without
    shuffling, into a plain batch queue.

    Args:
      dataset: a slim DataSet we want to turn into batches. Must provide data
        items "image", "label", and "variant", plus data_sources and
        num_samples.
      model: a DeepVariantModel to use for preprocessing each image before
        batching.
      batch_size: the number of images in each batch.
      mode: str; one of TRAIN or EVAL.

    Returns:
      The result of tf.train.shuffle_batch (TRAIN) or tf.train.batch (EVAL)
      over [image, label, variant]: batched images, their labels in the same
      order, and the matching encoded variants.

    Raises:
      ValueError: if mode is not one of TRAIN or EVAL.
    """
    if mode not in {'TRAIN', 'EVAL'}:
        raise ValueError(
            'mode is {} but must be one of TRAIN or EVAL.'.format(mode))

    is_training = mode == 'TRAIN'

    # Mode-specific provider settings; the reader options are shared.
    if is_training:
        provider_kwargs = {
            'common_queue_capacity': 2 * batch_size,
            'common_queue_min': batch_size,
        }
    else:
        provider_kwargs = {'num_readers': 1, 'shuffle': False}
    provider_kwargs['reader_kwargs'] = {
        'options': io_utils.make_tfrecord_options(dataset.data_sources)
    }
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset, **provider_kwargs)

    # Load the data.
    image, label, variant = data_provider.get(['image', 'label', 'variant'])
    image = model.preprocess_image(image)
    tensors = [image, label, variant]

    if not is_training:
        return tf.train.batch(tensors, batch_size=batch_size, num_threads=1)
    return tf.train.shuffle_batch(
        tensors,
        batch_size=batch_size,
        num_threads=4,
        capacity=5000,
        min_after_dequeue=min(1000, dataset.num_samples))
def prepare_inputs(source_path, model, batch_size, num_readers=None):
    """Prepares image, encoded_variant and alt-allele-indices ops.

    Builds a tf.data pipeline over the TFRecord files matching source_path.
    Each serialized tf.Example is parsed into its 'image/encoded',
    'variant/encoded' and 'alt_allele_indices/encoded' string features. The
    image bytes are decoded with tf.decode_raw as uint8, reshaped to the
    shape reported by tf_utils.get_shape_from_examples_path and passed
    through model.preprocess_image. The dataset is not repeated, so every
    example in source_path is read once.

    Args:
      source_path: Path to TFRecord file(s) containing deepvariant tf.Example
        protos. The path is normalized with
        io_utils.NormalizeToShardedFilePattern and globbed, and the matching
        files must either all end in .gz or none of them (see
        io_utils.make_tfrecord_options).
      model: A DeepVariantModel whose preprocess_image function will be used
        on image.
      batch_size: int > 0. Size of batches to use during inference.
      num_readers: int > 0 or None. Number of parallel calls used when
        mapping the parse and preprocess functions over the dataset. If
        falsy (None or 0), uses FLAGS.num_readers instead.

    Returns:
      A tuple of (image, encoded_variant, encoded_alt_allele_indices) TF ops,
      each produced by the batched dataset iterator. encoded_variant is a
      tf.string tensor containing serialized Variant protos describing the
      variant calls associated with the images. encoded_alt_allele_indices is
      a tf.string tensor containing serialized
      CallVariantsOutput.AltAlleleIndices protos with the alternate allele
      indices used as "alt" when constructing the images.
    """
    if not num_readers:
        num_readers = FLAGS.num_readers

    tensor_shape = tf_utils.get_shape_from_examples_path(source_path)

    def _parse_single_example(serialized_example):
        """Parses serialized example into a dictionary of features."""
        return tf.parse_single_example(
            serialized_example,
            features={
                'image/encoded': tf.FixedLenFeature([], tf.string),
                'variant/encoded': tf.FixedLenFeature([], tf.string),
                # deepvariant_pb2.CallVariantsOutput.AltAlleleIndices
                'alt_allele_indices/encoded':
                    tf.FixedLenFeature([], tf.string),
            })

    with tf.name_scope('input'):

        def _preprocess_image(features):
            """Decodes, reshapes and applies model-specific preprocessing."""
            image = features['image/encoded']
            # Bypassing the reshaping and preprocessing if there is no
            # tensor_shape. Currently that could happen when the input file
            # is empty.
            if tensor_shape:
                image = tf.reshape(
                    tf.decode_raw(image, tf.uint8), tensor_shape)
                image = model.preprocess_image(image)
            features['image/encoded'] = image
            return features

        files = tf.gfile.Glob(
            io_utils.NormalizeToShardedFilePattern(source_path))
        reader_options = io_utils.make_tfrecord_options(files)
        if reader_options.compression_type == (
                tf.python_io.TFRecordCompressionType.GZIP):
            compression_type = 'GZIP'
        else:
            compression_type = None

        dataset = tf.data.TFRecordDataset(
            files, compression_type=compression_type)
        # Use the resolved num_readers so that an explicit argument is
        # honoured; previously FLAGS.num_readers was always used here and
        # the parameter had no effect.
        dataset = dataset.map(
            _parse_single_example, num_parallel_calls=num_readers)
        dataset = dataset.map(
            _preprocess_image, num_parallel_calls=num_readers)
        dataset = dataset.prefetch(10 * batch_size)
        dataset = dataset.batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()
        return (features['image/encoded'], features['variant/encoded'],
                features['alt_allele_indices/encoded'])
def make_batches(dataset, model, batch_size, mode):
    """Provides batches of pileup images from this dataset.

    Builds a slim DatasetDataProvider for dataset, pulls the "image",
    "label" and "variant" items from it, runs each image through
    model.preprocess_image() and batches the three tensors. TRAIN mode uses
    a shuffling batch queue; EVAL mode reads with a single reader, without
    shuffling, into a plain batch queue.

    Args:
      dataset: a slim DataSet we want to turn into batches. Must provide data
        items "image", "label", and "variant", plus data_sources and
        num_samples.
      model: a DeepVariantModel to use for preprocessing each image before
        batching.
      batch_size: the number of images in each batch.
      mode: str; one of TRAIN or EVAL.

    Returns:
      The result of tf.train.shuffle_batch (TRAIN) or tf.train.batch (EVAL)
      over [image, label, variant]: batched images, their labels in the same
      order, and the matching encoded variants.

    Raises:
      ValueError: if mode is not one of TRAIN or EVAL.
    """
    if mode not in {'TRAIN', 'EVAL'}:
        raise ValueError(
            'mode is {} but must be one of TRAIN or EVAL.'.format(mode))

    is_training = mode == 'TRAIN'

    # Mode-specific provider settings; the reader options are shared.
    if is_training:
        provider_kwargs = {
            'common_queue_capacity': 2 * batch_size,
            'common_queue_min': batch_size,
        }
    else:
        provider_kwargs = {'num_readers': 1, 'shuffle': False}
    provider_kwargs['reader_kwargs'] = {
        'options': io_utils.make_tfrecord_options(dataset.data_sources)
    }
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset, **provider_kwargs)

    # Load the data.
    image, label, variant = data_provider.get(['image', 'label', 'variant'])
    image = model.preprocess_image(image)
    tensors = [image, label, variant]

    if not is_training:
        return tf.train.batch(tensors, batch_size=batch_size, num_threads=1)
    return tf.train.shuffle_batch(
        tensors,
        batch_size=batch_size,
        num_threads=4,
        capacity=5000,
        min_after_dequeue=min(1000, dataset.num_samples))
def test_make_tfrecord_options_with_bad_inputs(self, filenames):
    """Checks that make_tfrecord_options rejects the given filenames.

    The call must raise a ValueError whose message matches the
    "Incorrect value" text built from the comma-joined filenames.

    Args:
      filenames: sequence of str file names passed to
        io.make_tfrecord_options.
    """
    joined_names = ','.join(filenames)
    # The message is used as a regular expression by assertRaisesRegexp.
    expected_message = (
        'Incorrect value: {}. Filenames need to be all of the same type: '
        'either all with .gz or all without .gz'.format(joined_names))
    with self.assertRaisesRegexp(ValueError, expected_message):
        io.make_tfrecord_options(filenames)
def test_make_tfrecord_options(self, filenames, expected_compression_type):
    """Checks the compression type derived from a list of filenames.

    Args:
      filenames: file names passed to io.make_tfrecord_options.
      expected_compression_type: the compression type string that
        python_io.TFRecordOptions.get_compression_type_string should report
        for the resulting options.
    """
    options = io.make_tfrecord_options(filenames)
    actual_compression_type = (
        python_io.TFRecordOptions.get_compression_type_string(options))
    self.assertEqual(actual_compression_type, expected_compression_type)