def testPrefetchToDeviceWithReInit(self):
    """Prefetch range(10) to /cpu:1 and check re-initialization restarts it."""
    source_ds = dataset_ops.Dataset.range(10)
    to_cpu1 = prefetching_ops.prefetch_to_device("/cpu:1")
    prefetched_ds = source_ds.apply(to_cpu1)

    # NOTE(mrry): This device block creates the "host" dataset and iterator
    # on /cpu:0, and ensures that the prefetching is across devices. In
    # typical use this would not be necessary, because the GPU device would
    # not support any of the dataset-related ops.
    with ops.device("/cpu:0"):
        it = prefetched_ds.make_initializable_iterator()

    # The prefetched dataset and its iterator must report the same element
    # structure (types, shapes, classes) as the host dataset.
    for attr in ("output_types", "output_shapes", "output_classes"):
        host_value = getattr(source_ds, attr)
        self.assertEqual(host_value, getattr(prefetched_ds, attr))
        self.assertEqual(host_value, getattr(it, attr))

    element = it.get_next()
    self.assertEqual(dtypes.int64, element.dtype)
    self.assertEqual([], element.shape)

    # The session is configured with two CPU devices; the prefetch target
    # above is the second one.
    two_cpu_config = config_pb2.ConfigProto()
    two_cpu_config.device_count["CPU"] = 2
    with self.test_session(config=two_cpu_config) as sess:
        # First pass: stop after five of the ten elements.
        sess.run(it.initializer)
        for want in range(5):
            self.assertEqual(want, sess.run(element))
        # Second pass: after re-initializing, all ten elements are produced
        # again starting from 0, followed by end-of-sequence.
        sess.run(it.initializer)
        for want in range(10):
            self.assertEqual(want, sess.run(element))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(element)
def testPrefetchSparseTensorsToDevice(self):
    """Prefetch a dataset of sparse tensors to /cpu:1 and read all ten back."""

    def make_tensor(i):
        # A 2x2 sparse tensor with a single entry at [0, 0] holding `i`.
        return sparse_tensor.SparseTensorValue(
            indices=[[0, 0]], values=(i*[1]), dense_shape=[2, 2])

    source_ds = dataset_ops.Dataset.range(10).map(make_tensor)
    to_cpu1 = prefetching_ops.prefetch_to_device("/cpu:1")
    prefetched_ds = source_ds.apply(to_cpu1)

    # NOTE(mrry): This device block creates the "host" dataset and iterator
    # on /cpu:0, and ensures that the prefetching is across devices. In
    # typical use this would not be necessary, because the GPU device would
    # not support any of the dataset-related ops.
    with ops.device("/cpu:0"):
        it = prefetched_ds.make_one_shot_iterator()

    # The prefetched dataset and its iterator must report the same element
    # structure (types, shapes, classes) as the host dataset.
    for attr in ("output_types", "output_shapes", "output_classes"):
        host_value = getattr(source_ds, attr)
        self.assertEqual(host_value, getattr(prefetched_ds, attr))
        self.assertEqual(host_value, getattr(it, attr))

    element = it.get_next()
    self.assertEqual(dtypes.int64, element.dtype)

    # The session is configured with two CPU devices; the prefetch target
    # above is the second one.
    two_cpu_config = config_pb2.ConfigProto()
    two_cpu_config.device_count["CPU"] = 2
    with self.test_session(config=two_cpu_config) as sess:
        for want in range(10):
            fetched = sess.run(element)
            self.assertAllEqual([want], fetched.values)
            self.assertAllEqual([[0, 0]], fetched.indices)
            self.assertAllEqual([2, 2], fetched.dense_shape)
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(element)
def testPrefetchDictToDevice(self):
    """Prefetch dict-structured elements {"a": x} to /cpu:1 and read them."""
    source_ds = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
    to_cpu1 = prefetching_ops.prefetch_to_device("/cpu:1")
    prefetched_ds = source_ds.apply(to_cpu1)

    # NOTE(mrry): This device block creates the "host" dataset and iterator
    # on /cpu:0, and ensures that the prefetching is across devices. In
    # typical use this would not be necessary, because the GPU device would
    # not support any of the dataset-related ops.
    with ops.device("/cpu:0"):
        it = prefetched_ds.make_one_shot_iterator()

    # The prefetched dataset and its iterator must report the same element
    # structure (types, shapes, classes) as the host dataset.
    for attr in ("output_types", "output_shapes", "output_classes"):
        host_value = getattr(source_ds, attr)
        self.assertEqual(host_value, getattr(prefetched_ds, attr))
        self.assertEqual(host_value, getattr(it, attr))

    element = it.get_next()
    component = element["a"]
    self.assertEqual(dtypes.int64, component.dtype)
    self.assertEqual([], component.shape)

    # The session is configured with two CPU devices; the prefetch target
    # above is the second one.
    two_cpu_config = config_pb2.ConfigProto(device_count={"CPU": 2})
    with self.test_session(config=two_cpu_config) as sess:
        for want in range(10):
            self.assertEqual({"a": want}, sess.run(element))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(element)
def testPrefetchToSameDevice(self):
    """Prefetch range(10) to a fully qualified CPU:0 target and read it back."""
    source_ds = dataset_ops.Dataset.range(10)
    to_cpu0 = prefetching_ops.prefetch_to_device(
        "/job:localhost/replica:0/task:0/device:CPU:0")
    prefetched_ds = source_ds.apply(to_cpu0)

    # The iterator is created under /cpu:0 while the prefetch target above
    # also names CPU:0, so (as the test name says) source and destination
    # are meant to be the same device rather than two different ones.
    with ops.device("/cpu:0"):
        it = prefetched_ds.make_one_shot_iterator()

    # The prefetched dataset and its iterator must report the same element
    # structure (types, shapes, classes) as the host dataset.
    for attr in ("output_types", "output_shapes", "output_classes"):
        host_value = getattr(source_ds, attr)
        self.assertEqual(host_value, getattr(prefetched_ds, attr))
        self.assertEqual(host_value, getattr(it, attr))

    element = it.get_next()
    self.assertEqual(dtypes.int64, element.dtype)
    self.assertEqual([], element.shape)

    with self.cached_session() as sess:
        for want in range(10):
            self.assertEqual(want, sess.run(element))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(element)
def testPrefetchDictToDevice(self):
    """Check that dict elements {"a": x} survive prefetching to /cpu:1."""
    host_ds = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
    device_ds = host_ds.apply(prefetching_ops.prefetch_to_device("/cpu:1"))

    # NOTE(mrry): This device block creates the "host" dataset and iterator
    # on /cpu:0, and ensures that the prefetching is across devices. In
    # typical use this would not be necessary, because the GPU device would
    # not support any of the dataset-related ops.
    with ops.device("/cpu:0"):
        one_shot = device_ds.make_one_shot_iterator()

    # Element structure (types, shapes, classes) must be unchanged on both
    # the prefetched dataset and the iterator built from it.
    for prop in ("output_types", "output_shapes", "output_classes"):
        reference = getattr(host_ds, prop)
        self.assertEqual(reference, getattr(device_ds, prop))
        self.assertEqual(reference, getattr(one_shot, prop))

    next_item = one_shot.get_next()
    self.assertEqual(dtypes.int64, next_item["a"].dtype)
    self.assertEqual([], next_item["a"].shape)

    # Run with two CPU devices; the prefetch target is the second one.
    session_config = config_pb2.ConfigProto(device_count={"CPU": 2})
    with self.test_session(config=session_config) as sess:
        for value in range(10):
            self.assertEqual({"a": value}, sess.run(next_item))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(next_item)
def testTensorsExplicitPrefetchToDevice(self):
    """Iterate a GPU-prefetched dataset directly; datasets.Iterator rejects it."""
    gpu_name = test.gpu_device_name()
    prefetched = Dataset.from_tensor_slices([0., 1.]).apply(
        prefetching_ops.prefetch_to_device(gpu_name))

    # Constructing datasets.Iterator over a dataset that already ends in
    # prefetch_to_device is expected to raise a TypeError naming the
    # transformation.
    with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'):
        datasets.Iterator(prefetched)

    # Plain Python iteration over the dataset must still work; each element
    # is doubled on the GPU device and compared with the expected value.
    for index, value in enumerate(prefetched):
        with ops.device(gpu_name):
            doubled = math_ops.add(value, value)
            self.assertEqual(2.0 * index, doubled.numpy())
def testTensorsExplicitPrefetchToDevice(self):
    """Check direct iteration over a dataset prefetched to the GPU device."""
    target_device = test.gpu_device_name()
    base = Dataset.from_tensor_slices([0., 1.])
    on_device = base.apply(prefetching_ops.prefetch_to_device(target_device))

    # Wrapping the already-prefetched dataset in datasets.Iterator is
    # expected to fail with a TypeError that mentions prefetch_to_device.
    with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'):
        datasets.Iterator(on_device)

    # Iterating the dataset itself yields the two slices in order; x + x is
    # computed under the GPU device scope and checked against i + i.
    for position, item in enumerate(on_device):
        with ops.device(target_device):
            total = math_ops.add(item, item)
            self.assertEqual(2.0 * position, total.numpy())
def testPrefetchToDeviceGpu(self):
    """Prefetch range(10) to /gpu:0 and read every element back in order."""
    gpu_present = test_util.is_gpu_available()
    if not gpu_present:
        self.skipTest("No GPU available")

    to_gpu = prefetching_ops.prefetch_to_device("/gpu:0")
    prefetched_ds = dataset_ops.Dataset.range(10).apply(to_gpu)
    element = prefetched_ds.make_one_shot_iterator().get_next()

    with self.cached_session() as sess:
        for want in range(10):
            self.assertEqual(want, sess.run(element))
        # An eleventh fetch must signal end-of-sequence.
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(element)
def testPrefetchToDeviceGpuWithReInit(self):
    """Prefetch range(10) to the GPU and check re-initialization restarts it."""
    gpu_present = test_util.is_gpu_available()
    if not gpu_present:
        self.skipTest("No GPU available")

    source_ds = dataset_ops.Dataset.range(10)
    to_gpu = prefetching_ops.prefetch_to_device(test_util.gpu_device_name())
    prefetched_ds = source_ds.apply(to_gpu)

    it = prefetched_ds.make_initializable_iterator()
    element = it.get_next()

    with self.test_session() as sess:
        # First pass: stop after five of the ten elements.
        sess.run(it.initializer)
        for want in range(5):
            self.assertEqual(want, sess.run(element))
        # Second pass: after re-initializing, all ten elements are produced
        # again starting from 0, followed by end-of-sequence.
        sess.run(it.initializer)
        for want in range(10):
            self.assertEqual(want, sess.run(element))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(element)
def wrap_as_tfdataset(x_train, y_train, data_augmentation, batch_size,
                      gpu_local_rank=None, prefetch_to_device=False,
                      comm=DummyComm()):
    '''Wrap training data in a shuffled, repeating, batched TF Dataset.

    The pipeline is: slice (x_train, y_train) -> shuffle and repeat ->
    optional augmentation via ``aug_fn`` -> batch -> prefetch.

    Args:
        x_train: Input samples; sliced together with ``y_train`` by
            ``tf.data.Dataset.from_tensor_slices``.
        y_train: Labels matching ``x_train``.
        data_augmentation: If truthy, ``aug_fn`` is applied to the images
            through ``map_and_batch``; otherwise the data is only batched.
        batch_size: Batch size used for batching.
        gpu_local_rank: Only used in the log message printed when
            prefetching to the device.
        prefetch_to_device: If truthy (and ``TFVER >= '1.8.0'``), prefetch
            batches to ``'/gpu:0'``; otherwise use a host-side prefetch.
        comm: Communicator passed to the print helpers; ``comm.rank()`` is
            reported in the prefetch log message. Note that the default
            instance is created once, when this function is defined.

    Returns:
        The assembled dataset.
    '''
    # ref: https://www.tensorflow.org/versions/master/performance/datasets_performance @IgnorePep8
    # NOTE(review): if TFVER is a plain version string, these comparisons
    # are lexicographic (e.g. '1.10.0' sorts below '1.8.0') -- confirm how
    # TFVER is defined.
    buffer_size = tf.contrib.data.AUTOTUNE if TFVER >= '1.8.0' else 1000
    shuffle_buffer = 1000

    # Create the dataset and shuffle/repeat it indefinitely.
    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    if TFVER >= '1.5.0':
        dataset = dataset.apply(
            tf.contrib.data.shuffle_and_repeat(
                shuffle_buffer))  # , seed=1234 + hvdrank))
    else:
        dataset = dataset.shuffle(shuffle_buffer)
        dataset = dataset.repeat()

    if data_augmentation:
        print_rank0('USING IMAGE AUGMENTATION IN DATASET PIPELINE.', comm)

        def proc_dataset(images, labels):
            '''Aug/proc function for map_and_batch.'''
            images = aug_fn(images)
            # The per_image_standardization could be part of the model
            # layers (incorporate via Lambda layer), but preprocessing
            # via dataset pipeline seems to be faster.
            # images = tf.image.per_image_standardization(images)
            # NOTE: If using per_image_standardization then during
            # eval/inference the images have to be standardized as well.
            # Code for that would be:
            # xtest_dset = tf.data.Dataset.from_tensor_slices(x_test)
            # xtest_dset = xtest_dset.map(tf.image.per_image_standardization)
            # test_samples = x_test.shape[0]
            # xtest_dset = xtest_dset.batch(test_samples)
            # xtest_gen = xtest_dset.make_one_shot_iterator().get_next()
            # x_test = KB.get_session().run([xtest_gen])
            return images, labels

        dataset = dataset.apply(
            tf.contrib.data.map_and_batch(
                map_func=proc_dataset,
                batch_size=batch_size))  # ,num_parallel_batches=4))
    else:
        dataset = dataset.batch(batch_size)

    if TFVER >= '1.8.0' and prefetch_to_device:
        print_in_order(
            'RANK {} PREFETCHING TO GPU: {}'.format(
                comm.rank(), gpu_local_rank),
            comm)
        # Note: In horovod once the visible device list is set you prefetch
        #     to device starting from 0 even when that device is not
        #     physically 0 device.
        # gdev = '/gpu:{}'.format(gpu_local_rank)  # incorrect per Note ^.
        gdev = '/gpu:0'
        # Prefetch to GPU doesn't seem to help much
        dataset = dataset.apply(prefetching_ops.prefetch_to_device(gdev))
        # Don't know what buffer_size to use. Some value is automatically set.
        # , buffer_size=10000
        # Hangs if using AUTOTUNE???
        # , buffer_size=tf.contrib.data.AUTOTUNE
        # failed to query event: CUDA_ERROR_DEINITIALIZED
    else:
        # Dataset transformations return a new dataset instead of modifying
        # the receiver, so the result has to be assigned back; calling
        # prefetch() and dropping the result leaves the pipeline unchanged.
        dataset = dataset.prefetch(buffer_size=buffer_size)

    return dataset