def _next_internal(self):
  """Fetches the next element from the underlying iterator resource.

  Outside eager execution, when autograph is disabled, every call increments
  `self._get_next_call_count` and fails once the count passes
  `GET_NEXT_CALL_ERROR_THRESHOLD`.

  Returns:
    The output of `iterator_get_next`, rebuilt according to
    `self._element_spec`.

  Raises:
    ValueError: with `GET_NEXT_CALL_ERROR_MESSAGE`, when the call count
      exceeds the threshold outside eager mode with autograph disabled.
  """
  status = autograph_ctx.control_status_ctx().status
  autograph_off = status == autograph_ctx.Status.DISABLED
  if not context.executing_eagerly() and autograph_off:
    self._get_next_call_count += 1
    if self._get_next_call_count > GET_NEXT_CALL_ERROR_THRESHOLD:
      raise ValueError(GET_NEXT_CALL_ERROR_MESSAGE)

  if context.executing_eagerly():
    # TODO(b/77291417): This runs in sync mode as iterators use an error
    # status to communicate that there is no more data to iterate over.
    with context.execution_mode(context.SYNC):
      flat_tensors = gen_dataset_ops.iterator_get_next(
          self._iterator_resource,
          output_types=self._flat_output_types,
          output_shapes=self._flat_output_shapes)
      try:
        # Fast path for the case `self._structure` is not a nested structure.
        return self._element_spec._from_compatible_tensor_list(flat_tensors)  # pylint: disable=protected-access
      except AttributeError:
        return structure.from_compatible_tensor_list(
            self._element_spec, flat_tensors)

  # Graph-building path.
  # TODO(b/169442955): Investigate the need for this colocation constraint.
  with ops.colocate_with(self._iterator_resource):
    flat_tensors = gen_dataset_ops.iterator_get_next(
        self._iterator_resource,
        output_types=self._flat_output_types,
        output_shapes=self._flat_output_shapes)
  return structure.from_compatible_tensor_list(
      self._element_spec, flat_tensors)
def testExecuteBasicAsync(self):
  """Runs a `Mul` op through `execute` in ASYNC mode and checks 3 * 5 == 15."""
  with context.execution_mode(context.ASYNC):
    three = constant_op.constant(3)
    five = constant_op.constant(5)
    mul_attrs = ('T', three.dtype.as_datatype_enum)
    outputs = execute(
        b'Mul', num_outputs=1, inputs=[three, five], attrs=mul_attrs)
    product = outputs[0]
    self.assertAllEqual(15, product)
  # Error: Invalid arguments
  # TODO(b/149995282): When an exception is thrown in ASYNC mode, it seems
  # there are things left over that cause mutex corruption when
  # _reset_context() is called before the next test is executed.
  #
  # context.set_execution_mode(context.ASYNC)
  # with self.assertRaises(errors.InvalidArgumentError):
  #   execute(
  #       b'MatMul',
  #       num_outputs=1,
  #       inputs=[three, five],
  #       attrs=('transpose_a', False, 'transpose_b', False, 'T',
  #              three.dtype.as_datatype_enum))
  #   context.context().executor.wait()
  # context.context().executor.clear_error()
  context.context().execution_mode = context.SYNC
def _next_internal(self):
  """Produces the next element as a nested structure of `tf.Tensor`s.

  Returns:
    The output of `iterator_get_next`, issued under `self._device`, rebuilt
    according to `self._element_spec`.
  """
  if context.executing_eagerly():
    # This runs in sync mode as iterators use an error status to communicate
    # that there is no more data to iterate over.
    # TODO(b/77291417): Fix
    with context.execution_mode(context.SYNC):
      # TODO(ashankar): Consider removing this ops.device() contextmanager
      # and instead mimic ops placement in graphs: Operations on resource
      # handles execute on the same device as where the resource is placed.
      with ops.device(self._device):
        flat_tensors = gen_dataset_ops.iterator_get_next(
            self._iterator_resource,
            output_types=self._flat_output_types,
            output_shapes=self._flat_output_shapes)
      try:
        # Fast path for the case `self._structure` is not a nested structure.
        return self._element_spec._from_compatible_tensor_list(flat_tensors)  # pylint: disable=protected-access
      except AttributeError:
        return structure.from_compatible_tensor_list(
            self._element_spec, flat_tensors)

  # Graph-building path.
  with ops.device(self._device):
    flat_tensors = gen_dataset_ops.iterator_get_next(
        self._iterator_resource,
        output_types=self._flat_output_types,
        output_shapes=self._flat_output_shapes)
  return structure.from_compatible_tensor_list(
      self._element_spec, flat_tensors)
def _benchmark_eager_apply(self, label, device_and_format, defun=False,
                           execution_mode=None):
  """Times ResNet50 forward passes (`training=False`) and reports the result.

  Args:
    label: Label forwarded to `self._report`.
    device_and_format: `(device, data_format)` pair.
    defun: If true, `model.call` is wrapped in `tf.function`.
    execution_mode: Passed to `context.execution_mode`; when truthy,
      `context.async_wait()` is called after each batch of forward passes.
  """
  batch_size = 64
  num_burn = 5
  num_iters = 30

  with context.execution_mode(execution_mode):
    device, data_format = device_and_format
    model = resnet50.ResNet50(data_format)
    if defun:
      model.call = tf.function(model.call)

    def forward_passes(count, batch):
      """Runs `count` forward passes, then drains pending async work."""
      for _ in xrange(count):
        model(batch, training=False).cpu()
      if execution_mode:
        context.async_wait()

    with tf.device(device):
      images, _ = resnet50_test_util.random_batch(batch_size, data_format)
      # Burn-in iterations are excluded from the timed section.
      forward_passes(num_burn, images)
      gc.collect()
      start = time.time()
      forward_passes(num_iters, images)
      self._report(label, start, num_iters, device, batch_size, data_format)
def testDatasetEagerIteration(self, execution_mode):
  """Checks that iterating `Dataset.range(10)` eagerly yields 0, 1, 2, ..."""
  with context.eager_mode():
    with context.execution_mode(execution_mode):
      dataset = dataset_ops.Dataset.range(10)
      # The position of each element must equal its value.
      for expected, element in enumerate(dataset):
        self.assertEqual(expected, element.numpy())
def decorator(self, *args, **kwargs):
  """Invokes the wrapped `f` once under SYNC execution mode."""
  # TODO(b/117110239): Re-enable.
  # with context.execution_mode(context.ASYNC):
  #   f(self, *args, **kwargs)
  sync_scope = context.execution_mode(context.SYNC)
  with sync_scope:
    f(self, *args, **kwargs)
def _next_internal(self):
  """Fetches the next element from the underlying iterator resource.

  Returns:
    The output of `iterator_get_next`, rebuilt according to
    `self._element_spec`.
  """
  if context.executing_eagerly():
    # TODO(b/77291417): This runs in sync mode as iterators use an error
    # status to communicate that there is no more data to iterate over.
    with context.execution_mode(context.SYNC):
      flat_tensors = gen_dataset_ops.iterator_get_next(
          self._iterator_resource,
          output_types=self._flat_output_types,
          output_shapes=self._flat_output_shapes)
      try:
        # Fast path for the case `self._structure` is not a nested structure.
        return self._element_spec._from_compatible_tensor_list(flat_tensors)  # pylint: disable=protected-access
      except AttributeError:
        return structure.from_compatible_tensor_list(
            self._element_spec, flat_tensors)

  # Graph-building path.
  # TODO(b/169442955): Investigate the need for this colocation constraint.
  with ops.colocate_with(self._iterator_resource):
    flat_tensors = gen_dataset_ops.iterator_get_next(
        self._iterator_resource,
        output_types=self._flat_output_types,
        output_shapes=self._flat_output_shapes)
  return structure.from_compatible_tensor_list(
      self._element_spec, flat_tensors)
def _next_internal(self):
  """Produces the next element as a nested structure of `tf.Tensor`s.

  Returns:
    The flat tensors from the iterator, issued under `self._device`, rebuilt
    by `self._structure`.
  """
  if context.executing_eagerly():
    # This runs in sync mode as iterators use an error status to communicate
    # that there is no more data to iterate over.
    # TODO(b/77291417): Fix
    with context.execution_mode(context.SYNC):
      # TODO(ashankar): Consider removing this ops.device() contextmanager
      # and instead mimic ops placement in graphs: Operations on resource
      # handles execute on the same device as where the resource is placed.
      with ops.device(self._device):
        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
        # because in eager mode this code will run synchronously on the
        # calling thread. Therefore we do not need to make a defensive context
        # switch to a background thread, and can achieve a small constant
        # performance boost by invoking the iterator synchronously.
        flat_tensors = gen_dataset_ops.iterator_get_next_sync(
            self._iterator_resource,
            output_types=self._flat_output_types,
            output_shapes=self._flat_output_shapes)
      return self._structure._from_compatible_tensor_list(flat_tensors)  # pylint: disable=protected-access

  # Graph-building path.
  with ops.device(self._device):
    flat_tensors = gen_dataset_ops.iterator_get_next(
        self._iterator_resource,
        output_types=self._flat_output_types,
        output_shapes=self._flat_output_shapes)
  return self._structure._from_compatible_tensor_list(flat_tensors)  # pylint: disable=protected-access
def testEagerIteratorAsync(self):
  """Checks eager iteration over `Dataset.range(10)` under ASYNC mode."""
  with context.eager_mode():
    with context.execution_mode(context.ASYNC):
      dataset = dataset_ops.Dataset.range(10)
      # The position of each element must equal its value.
      for expected, element in enumerate(dataset):
        self.assertEqual(expected, element.numpy())
def _next_internal(self):
  """Produces the next element as a nested structure of `tf.Tensor`s.

  Returns:
    The flat tensors from the iterator, issued under `self._device`, rebuilt
    according to `self._element_spec`.
  """
  if context.executing_eagerly():
    # This runs in sync mode as iterators use an error status to communicate
    # that there is no more data to iterate over.
    # TODO(b/77291417): Fix
    with context.execution_mode(context.SYNC):
      # TODO(ashankar): Consider removing this ops.device() contextmanager
      # and instead mimic ops placement in graphs: Operations on resource
      # handles execute on the same device as where the resource is placed.
      with ops.device(self._device):
        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
        # because in eager mode this code will run synchronously on the
        # calling thread. Therefore we do not need to make a defensive context
        # switch to a background thread, and can achieve a small constant
        # performance boost by invoking the iterator synchronously.
        flat_tensors = gen_dataset_ops.iterator_get_next_sync(
            self._iterator_resource,
            output_types=self._flat_output_types,
            output_shapes=self._flat_output_shapes)
      try:
        # Fast path for the case `self._structure` is not a nested structure.
        return self._element_spec._from_compatible_tensor_list(flat_tensors)  # pylint: disable=protected-access
      except AttributeError:
        return structure.from_compatible_tensor_list(
            self._element_spec, flat_tensors)

  # Graph-building path.
  with ops.device(self._device):
    flat_tensors = gen_dataset_ops.iterator_get_next(
        self._iterator_resource,
        output_types=self._flat_output_types,
        output_shapes=self._flat_output_shapes)
  return structure.from_compatible_tensor_list(
      self._element_spec, flat_tensors)
def _next_internal(self):
  """Delegates to the parent `_next_internal` under SYNC execution mode.

  Returns:
    Whatever the parent implementation returns for the next element.
  """
  parent = super(Iterator, self)
  # This runs in sync mode as iterators use an error status to communicate
  # that there is no more data to iterate over.
  # TODO(b/77291417): Fix
  with context.execution_mode(context.SYNC):
    next_element = parent._next_internal()  # pylint: disable=protected-access
    return next_element
def benchmarkAddScalars(self):
  """Benchmarks building a scalar float32 `add` op from two placeholders.

  The placeholders are created once; the timed function calls
  `gen_math_ops.add` on them and is run for 1000 iterations through
  `self._run_and_report`.
  """
  # `context.execution_mode()` only selects the eager executor mode; it does
  # not enter graph construction, which placeholders require. Enter graph
  # mode explicitly instead of passing `GRAPH_MODE` to `execution_mode()`.
  with context.graph_mode():
    x = array_ops.placeholder(shape=[], dtype=dtypes.float32, name="x")
    y = array_ops.placeholder(shape=[], dtype=dtypes.float32, name="y")

    def bench():
      return gen_math_ops.add(x, y)

    self._run_and_report(bench, 1000)
def _apply(self, defun=False, execution_mode=None):
  """Runs one ResNet50 forward pass on a batch of 2 and checks its shape.

  Args:
    defun: If true, `model.call` is wrapped in `tf.function`.
    execution_mode: Passed to `context.execution_mode`.
  """
  expected_shape = (2, 1000)
  device, data_format = resnet50_test_util.device_and_data_format()
  model = resnet50.ResNet50(data_format)
  if defun:
    model.call = tf.function(model.call)

  with tf.device(device):
    with context.execution_mode(execution_mode):
      images, _ = resnet50_test_util.random_batch(2, data_format)
      output = model(images, training=False)
      # Drain pending async work before leaving the execution-mode scope.
      context.async_wait()
  self.assertEqual(expected_shape, output.shape)
def benchmarkMatMul(self):
  """Benchmarks building a float32 `mat_mul` op from two placeholders.

  The placeholders have shapes [784, 1000] and [1000, 1000]; the timed
  function calls `gen_math_ops.mat_mul` on them and is run for 1000
  iterations through `self._run_and_report`.
  """
  # `context.execution_mode()` only selects the eager executor mode; it does
  # not enter graph construction, which placeholders require. Enter graph
  # mode explicitly instead of passing `GRAPH_MODE` to `execution_mode()`.
  with context.graph_mode():
    x = array_ops.placeholder(
        shape=[784, 1000], dtype=dtypes.float32, name="x")
    y = array_ops.placeholder(
        shape=[1000, 1000], dtype=dtypes.float32, name="y")

    def bench():
      return gen_math_ops.mat_mul(x, y)

    self._run_and_report(bench, 1000)
def _next_internal(self):
  """Produces the next element as a nested structure of `tf.Tensor`s.

  Returns:
    The flat tensors read from `self._buffering_resource`, rebuilt by
    `self._element_structure`.
  """
  # This runs in sync mode as iterators use an error status to communicate
  # that there is no more data to iterate over.
  # TODO(b/77291417): Fix
  sync_scope = context.execution_mode(context.SYNC)
  with sync_scope:
    with ops.device(self._device):
      get_next = ged_ops.experimental_function_buffering_resource_get_next
      flat_tensors = get_next(
          function_buffer_resource=self._buffering_resource,
          output_types=self._flat_output_types)
    return self._element_structure._from_tensor_list(flat_tensors)
def _test_train(self, execution_mode=None):
  """Runs one SGD step of the custom MNIST model on CPU and prints CPU time.

  Args:
    execution_mode: Passed to `context.execution_mode`.
  """
  start = time.process_time()
  model = mnist.custom_model()
  with tf.device("CPU"):
    with context.execution_mode(execution_mode):
      optimizer = tf.keras.optimizers.SGD(0.1)
      images, labels = random_batch(1000)
      grads = compute_gradients(model, images, labels)
      apply_gradients(model, optimizer, grads)
      # Drain pending async work so it is included in the measured time.
      context.async_wait()
  elapsed = time.process_time() - start
  print("time: ", elapsed)
def _test_train(self, execution_mode=None):
  """Runs ten SGD steps of ResNet50 on batches of 32 and prints CPU time.

  Args:
    execution_mode: Passed to `context.execution_mode`.
  """
  num_steps = 10
  start = time.process_time()
  device, data_format = device_and_data_format()
  model = resnet50.ResNet50(data_format)
  for _ in range(num_steps):
    with tf.device(device):
      with context.execution_mode(execution_mode):
        # A fresh optimizer and batch are created on every step.
        optimizer = tf.keras.optimizers.SGD(0.1)
        images, labels = random_batch(32, data_format)
        grads = compute_gradients(model, images, labels)
        apply_gradients(model, optimizer, grads)
        context.async_wait()
  elapsed = time.process_time() - start
  print("time: ", elapsed)
def _next_internal(self):
  """Produces the next element as a nested structure of `tf.Tensor`s.

  Returns:
    The flat tensors read from `self._buffering_resource`, packed like
    `self._output_types` and passed through
    `sparse.deserialize_sparse_tensors`.
  """
  # This runs in sync mode as iterators use an error status to communicate
  # that there is no more data to iterate over.
  # TODO(b/77291417): Fix
  sync_scope = context.execution_mode(context.SYNC)
  with sync_scope:
    with ops.device(self._device):
      get_next = ged_ops.experimental_function_buffering_resource_get_next
      flat_tensors = get_next(
          function_buffer_resource=self._buffering_resource,
          output_types=self._flat_output_types)
    packed = nest.pack_sequence_as(self._output_types, flat_tensors)
    return sparse.deserialize_sparse_tensors(
        packed, self._output_types, self._output_shapes,
        self._output_classes)
def _run_benchmark(func, num_iters, execution_mode=None):
  """Times `num_iters` calls of `func` after one warm-up call.

  Args:
    func: Zero-argument callable to benchmark.
    num_iters: Number of timed calls.
    execution_mode: Passed to `context.execution_mode`; when it equals
      `context.ASYNC`, the executor is drained after the warm-up call and
      again before the clock is stopped.

  Returns:
    Elapsed wall-clock seconds for the timed calls.
  """
  is_async = execution_mode == context.ASYNC

  def drain():
    """Waits for pending ops, but only in ASYNC mode."""
    if is_async:
      get_executor().wait()

  with context.execution_mode(execution_mode):
    # call func to warm up
    func()
    drain()
    start = time.time()
    for _ in range(num_iters):
      func()
    drain()
    elapsed = time.time() - start
    return elapsed
def testCopyBetweenDevicesAsync(self):
  """Copies a tensor between CPU and GPU in ASYNC mode.

  Also checks that copying to a GPU index past the available count raises
  `RuntimeError`.
  """
  # The body calls `x.gpu()` unconditionally, so skip rather than fail on
  # hosts that report no GPUs.
  if not context.context().num_gpus():
    self.skipTest('No GPUs found')
  with context.execution_mode(context.ASYNC):
    x = constant_op.constant([[1., 2.], [3., 4.]])
    x = x.cpu()
    x = x.gpu()
    x = x.gpu()
    x = x.cpu()
    context.async_wait()

    # Invalid device
    with self.assertRaises(RuntimeError):
      x.gpu(context.context().num_gpus() + 1)
      # The wait sits inside the assertion block together with the copy.
      context.async_wait()
    context.async_clear_error()
def testCopyBetweenDevicesAsync(self):
  """Copies a tensor between CPU and GPU in ASYNC mode.

  Also checks that copying to a GPU index past the available count raises
  `RuntimeError`.
  """
  # The body calls `x.gpu()` unconditionally, so skip rather than fail on
  # hosts that report no GPUs.
  if not context.context().num_gpus():
    self.skipTest('No GPUs found')
  with context.execution_mode(context.ASYNC):
    x = constant_op.constant([[1., 2.], [3., 4.]])
    x = x.cpu()
    x = x.gpu()
    x = x.gpu()
    x = x.cpu()
    context.context().executor.wait()

    # Invalid device
    with self.assertRaises(RuntimeError):
      x.gpu(context.context().num_gpus() + 1)
      # The wait sits inside the assertion block together with the copy.
      context.context().executor.wait()
    context.context().executor.clear_error()
def _next_internal(self):
  """Produces the next element as a nested structure of `tf.Tensor`s.

  Returns:
    The flat tensors read from `self._buffering_resource`, packed like
    `self._output_types` and passed through
    `sparse.deserialize_sparse_tensors`.
  """
  # This runs in sync mode as iterators use an error status to communicate
  # that there is no more data to iterate over.
  # TODO(b/77291417): Fix
  sync_scope = context.execution_mode(context.SYNC)
  with sync_scope:
    with ops.device(self._device):
      flat_tensors = gen_dataset_ops.function_buffering_resource_get_next(
          function_buffer_resource=self._buffering_resource,
          output_types=self._flat_output_types)
    packed = nest.pack_sequence_as(self._output_types, flat_tensors)
    return sparse.deserialize_sparse_tensors(
        packed, self._output_types, self._output_shapes,
        self._output_classes)
def run_benchmark(func, num_iters, execution_mode=None):
  """Times `num_iters` calls of `func` after one warm-up call.

  Args:
    func: Zero-argument callable to benchmark.
    num_iters: Number of timed calls.
    execution_mode: Passed to `context.execution_mode`; when it equals
      `context.ASYNC`, `ctx.async_wait()` is called after the warm-up call
      and again before the clock is stopped.

  Returns:
    Elapsed wall-clock seconds for the timed calls.
  """
  ctx = context.context()
  is_async = execution_mode == context.ASYNC

  def drain():
    """Waits for pending ops, but only in ASYNC mode."""
    if is_async:
      ctx.async_wait()

  with context.execution_mode(execution_mode):
    # call func to maybe warm up the GPU
    func()
    drain()
    start = time.time()
    for _ in xrange(num_iters):
      func()
    drain()
    elapsed = time.time() - start
    return elapsed
def testCopyBetweenDevicesAsync(self):
  """Copies a tensor between CPU and GPU in ASYNC mode.

  Skipped when no GPU is reported. Also checks that copying to a GPU index
  past the available count raises `RuntimeError`.
  """
  gpu_count = context.context().num_gpus()
  if not gpu_count:
    self.skipTest('No GPUs found')

  with context.execution_mode(context.ASYNC):
    tensor = constant_op.constant([[1., 2.], [3., 4.]])
    # Copy sequence: CPU -> GPU -> GPU -> CPU.
    tensor = tensor.cpu()
    tensor = tensor.gpu()
    tensor = tensor.gpu()
    tensor = tensor.cpu()
    context.async_wait()

    # Invalid device
    with self.assertRaises(RuntimeError):
      tensor.gpu(context.context().num_gpus() + 1)
      context.async_wait()
    context.async_clear_error()
def _call_for_each_replica(self, fn, args, kwargs):
  """Calls `fn` once per device in `self._devices` and regroups the results.

  When `_outside_run_graph()` is already set (nested call), `fn` is invoked a
  single time on unwrapped arguments and its results are re-wrapped.

  Args:
    fn: The function to run; asserted to be a `def_function.Function`.
    args: Positional arguments; the per-replica values are selected with
      `distribute_utils.select_replica`.
    kwargs: Keyword arguments, handled the same way as `args`.

  Returns:
    In the nested case, `fn`'s results mapped through `_wrap_tensors`;
    otherwise the per-replica results combined by `distribute_utils.regroup`.
  """
  # For now, `fn` must be an @tf.function.
  # TODO(josh11b): Relax this restriction? Main problem is if
  # (a) executing eagerly, (b) `fn` not @tf.function, and
  # (c) executed frequently.
  assert isinstance(fn, def_function.Function)

  if _outside_run_graph() is not None:
    # Nested case, should just use outer function's context for things like
    # the current replica index.
    # TODO(josh11b): Test this case!
    with MirroredFunctionReplicaContext(self._container_strategy()):
      results = fn(*nest.map_structure(_unwrap_tensors, args),
                   **nest.map_structure(_unwrap_tensors, kwargs))
      return nest.map_structure(_wrap_tensors, results)

  # Record the graph active before the per-replica calls; reset in `finally`.
  _replica_index.graph_outside_run = ops.get_default_graph()
  return_values = []

  try:
    with MirroredFunctionReplicaContext(self._container_strategy()):
      for index, device in enumerate(self._devices):
        # Publish the replica index before running `fn` for this device.
        _replica_index.current = index
        with ops.device(device):
          if context.executing_eagerly():
            # NOTE: These functions need to execute concurrently if they
            # use a collective op. This is a particular concern with eager
            # execution.
            with context.execution_mode(context.ASYNC):
              return_values.append(
                  fn(*distribute_utils.select_replica(index, args),
                     **distribute_utils.select_replica(index, kwargs)))
          else:
            return_values.append(
                fn(*distribute_utils.select_replica(index, args),
                   **distribute_utils.select_replica(index, kwargs)))
  finally:
    # Always clear the `_replica_index` state, even if `fn` raised.
    _replica_index.graph_outside_run = None
    _replica_index.current = None

  return distribute_utils.regroup(return_values)
def _run(self, func, num_iters, execution_mode=None):
  """Times `num_iters` calls of `func` and reports the benchmark.

  Args:
    func: Zero-argument callable to benchmark.
    num_iters: Number of timed calls.
    execution_mode: Passed to `context.execution_mode`; when it equals
      `context.ASYNC`, `ctx.async_wait()` is called after the warm-up call
      and again before the clock is stopped.
  """
  ctx = context.context()
  is_async = execution_mode == context.ASYNC

  def drain():
    """Waits for pending ops, but only in ASYNC mode."""
    if is_async:
      ctx.async_wait()

  with context.execution_mode(execution_mode):
    # call func to maybe warm up the GPU
    func()
    drain()
    start = time.time()
    for _ in xrange(num_iters):
      func()
    drain()
    end = time.time()
    # Mean time per call, in microseconds.
    mean_us = (end - start) * 1e6 / num_iters
    extras = {"examples_per_sec": num_iters / (end - start)}
    self.report_benchmark(iters=num_iters, wall_time=mean_us, extras=extras)
def _test_train(self, execution_mode=None):
  """Runs one ResNet50 training step and checks the summaries written.

  The step runs under a summary file writer pointed at a temporary log
  directory. The test then asserts that the model has 320 variables, that two
  events were written, and that the second event carries a 'loss' summary.

  Args:
    execution_mode: Passed to `context.execution_mode`.
  """
  import shutil  # Local import: only needed for temp-dir cleanup.

  device, data_format = device_and_data_format()
  model = resnet50.ResNet50(data_format)
  tf.compat.v2.summary.experimental.set_step(
      tf.train.get_or_create_global_step())
  logdir = tempfile.mkdtemp()
  # `mkdtemp()` never deletes what it creates; remove the log directory when
  # the test finishes so repeated runs do not accumulate temp directories.
  self.addCleanup(shutil.rmtree, logdir, ignore_errors=True)
  with tf.compat.v2.summary.create_file_writer(
      logdir, max_queue=0,
      name='t0').as_default(), tf.compat.v2.summary.record_if(True):
    with tf.device(device), context.execution_mode(execution_mode):
      optimizer = tf.train.GradientDescentOptimizer(0.1)
      images, labels = random_batch(2, data_format)
      apply_gradients(model, optimizer,
                      compute_gradients(model, images, labels))
      self.assertEqual(320, len(model.variables))
      # Drain pending async work before the event files are read back.
      context.async_wait()
  events = events_from_logdir(logdir)
  self.assertEqual(len(events), 2)
  self.assertEqual(events[1].summary.value[0].tag, 'loss')
def _benchmark_eager_train(self, label, make_iterator, device_and_format,
                           defun=False, execution_mode=None):
  """Times ResNet50 training steps for each batch size and reports them.

  Args:
    label: Label forwarded to `self._report`.
    make_iterator: Callable that turns an `(images, labels)` pair into an
      object whose `next()` yields `(images, labels)` batches.
    device_and_format: `(device, data_format)` pair.
    defun: If true, `model.call` and `apply_gradients` are wrapped in
      `tf.function`.
    execution_mode: Passed to `context.execution_mode`; when truthy,
      `context.async_wait()` is called after each group of steps.
  """
  num_burn = 3
  num_iters = 10

  with context.execution_mode(execution_mode):
    device, data_format = device_and_format
    for batch_size in self._train_batch_sizes():
      seed_batch = resnet50_test_util.random_batch(batch_size, data_format)
      (images, labels) = seed_batch
      model = resnet50.ResNet50(data_format)
      # TODO(b/161911585): tf_to_corert MLIR lowering pipeline should handle
      # case when momentum is not set.
      optimizer = tf.keras.optimizers.SGD(0.1, 0.1)
      if defun:
        model.call = tf.function(model.call)
        apply_grads = tf.function(apply_gradients)
      else:
        apply_grads = apply_gradients

      def train_steps(count, batches):
        """Runs `count` training steps, then syncs async work and device."""
        for _ in xrange(count):
          (step_images, step_labels) = batches.next()
          apply_grads(model, optimizer,
                      compute_gradients(model, step_images, step_labels))
        if execution_mode:
          context.async_wait()
        self._force_device_sync()

      with tf.device(device):
        iterator = make_iterator((images, labels))
        # Burn-in steps are excluded from the timed section.
        train_steps(num_burn, iterator)
        gc.collect()

        start = time.time()
        train_steps(num_iters, iterator)
        self._report(label, start, num_iters, device, batch_size, data_format)
def _next_internal(self):
  """Produces the next element as a nested structure of `tf.Tensor`s.

  Returns:
    The parent implementation's result when `self._buffer_resource_handle`
    is `None`; otherwise the flat tensors read from that buffer, packed like
    `self._output_types` and passed through
    `sparse.deserialize_sparse_tensors`.
  """
  # This runs in sync mode as iterators use an error status to communicate
  # that there is no more data to iterate over.
  # TODO (b/77291417): Fix id:669
  # https://github.com/imdone/tensorflow/issues/670
  with context.execution_mode(context.SYNC):
    if self._buffer_resource_handle is None:
      return super(Iterator, self)._next_internal()

    with ops.device(self._device):
      flat_tensors = prefetching_ops.function_buffering_resource_get_next(
          function_buffer_resource=self._buffer_resource_handle,
          output_types=self._flat_output_types)
    packed = nest.pack_sequence_as(self._output_types, flat_tensors)
    return sparse.deserialize_sparse_tensors(
        packed, self._output_types, self._output_shapes,
        self._output_classes)
def testExecuteBasicAsync(self):
  """Exercises `execute` in ASYNC mode for a valid and an invalid op.

  A `Mul` of 3 and 5 must produce 15. A `MatMul` on the same scalar inputs
  must raise `errors.InvalidArgumentError`.
  """
  with context.execution_mode(context.ASYNC):
    three = constant_op.constant(3)
    five = constant_op.constant(5)
    product = execute(b'Mul', num_outputs=1,
                      inputs=[three, five],
                      attrs=('T', three.dtype.as_datatype_enum))[0]
    self.assertAllEqual(15, product)
  # Error: Invalid arguments
  context.set_execution_mode(context.ASYNC)
  try:
    with self.assertRaises(errors.InvalidArgumentError):
      execute(b'MatMul', num_outputs=1,
              inputs=[three, five],
              attrs=('transpose_a', False, 'transpose_b', False,
                     'T', three.dtype.as_datatype_enum))
      context.async_wait()
    context.async_clear_error()
  finally:
    # Restore SYNC even when the assertion above fails, so the ASYNC mode
    # set by `set_execution_mode` does not carry over into later tests.
    context.context().execution_mode = context.SYNC
def _benchmark_eager_train(self, label, make_iterator, device_and_format,
                           defun=False, execution_mode=None):
  """Times ResNet50 training steps for each batch size and reports them.

  Args:
    label: Label forwarded to `self._report`.
    make_iterator: Callable that turns an `(images, labels)` pair into an
      object whose `next()` yields `(images, labels)` batches.
    device_and_format: `(device, data_format)` pair.
    defun: If true, `model.call` and `apply_gradients` are wrapped in
      `tf.function`.
    execution_mode: Passed to `context.execution_mode`; when truthy,
      `context.async_wait()` is called after each group of steps.
  """
  num_burn = 3
  num_iters = 10

  with context.execution_mode(execution_mode):
    device, data_format = device_and_format
    for batch_size in self._train_batch_sizes():
      seed_batch = resnet50_test_util.random_batch(batch_size, data_format)
      (images, labels) = seed_batch
      model = resnet50.ResNet50(data_format)
      optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.1)
      if defun:
        model.call = tf.function(model.call)
        apply_grads = tf.function(apply_gradients)
      else:
        apply_grads = apply_gradients

      def train_steps(count, batches):
        """Runs `count` training steps, then syncs async work and device."""
        for _ in xrange(count):
          (step_images, step_labels) = batches.next()
          apply_grads(model, optimizer,
                      compute_gradients(model, step_images, step_labels))
        if execution_mode:
          context.async_wait()
        self._force_device_sync()

      with tf.device(device):
        iterator = make_iterator((images, labels))
        # Burn-in steps are excluded from the timed section.
        train_steps(num_burn, iterator)
        gc.collect()

        start = time.time()
        train_steps(num_iters, iterator)
        self._report(label, start, num_iters, device, batch_size, data_format)
def testExecuteBasicAsync(self):
  """Exercises `execute` in ASYNC mode for a valid and an invalid op.

  A `Mul` of 3 and 5 must produce 15. A `MatMul` on the same scalar inputs
  must raise `errors.InvalidArgumentError`.
  """
  with context.execution_mode(context.ASYNC):
    three = constant_op.constant(3)
    five = constant_op.constant(5)
    product = execute(
        b'Mul',
        num_outputs=1,
        inputs=[three, five],
        attrs=('T', three.dtype.as_datatype_enum))[0]
    self.assertAllEqual(15, product)
  # Error: Invalid arguments
  context.set_execution_mode(context.ASYNC)
  try:
    with self.assertRaises(errors.InvalidArgumentError):
      execute(
          b'MatMul',
          num_outputs=1,
          inputs=[three, five],
          attrs=('transpose_a', False, 'transpose_b', False, 'T',
                 three.dtype.as_datatype_enum))
      context.async_wait()
    context.async_clear_error()
  finally:
    # Restore SYNC even when the assertion above fails, so the ASYNC mode
    # set by `set_execution_mode` does not carry over into later tests.
    context.set_execution_mode(context.SYNC)
def _next_internal(self):
  """Produces the next element as a nested structure of `tf.Tensor`s.

  Returns:
    The flat tensors read from `self._resource`, packed like
    `self._output_types` and passed through
    `sparse.deserialize_sparse_tensors`.
  """
  # This runs in sync mode as iterators use an error status to communicate
  # that there is no more data to iterate over.
  # TODO(b/77291417): Fix
  sync_scope = context.execution_mode(context.SYNC)
  with sync_scope:
    # TODO(ashankar): Consider removing this ops.device() contextmanager
    # and instead mimic ops placement in graphs: Operations on resource
    # handles execute on the same device as where the resource is placed.
    with ops.device(self._device):
      # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
      # because in eager mode this code will run synchronously on the calling
      # thread. Therefore we do not need to make a defensive context switch
      # to a background thread, and can achieve a small constant performance
      # boost by invoking the iterator synchronously.
      flat_tensors = gen_dataset_ops.iterator_get_next_sync(
          self._resource,
          output_types=self._flat_output_types,
          output_shapes=self._flat_output_shapes)
    packed = nest.pack_sequence_as(self._output_types, flat_tensors)
    return sparse.deserialize_sparse_tensors(
        packed, self._output_types, self._output_shapes,
        self._output_classes)
def decorator(self, *args, **kwargs):
  """Invokes the wrapped `f` twice: first in ASYNC mode, then in SYNC mode."""
  for mode in (context.ASYNC, context.SYNC):
    with context.execution_mode(mode):
      f(self, *args, **kwargs)