def testParameterServerMultiExecutors(self):
  context.update_server_def(server_def=self.server_def_s1_s2_s3_s4)

  with ops.device(self.device_t1):
    v1 = variables.Variable(initial_value=0.)
  with ops.device(self.device_t2):
    v2 = variables.Variable(initial_value=10.)

  @def_function.function
  def worker_fn():
    x1 = v1.read_value()
    x2 = v2.read_value()
    grad = (x1 + x2) * 0.1
    v1.assign_add(grad)
    v2.assign_sub(grad)
    return v1 + v2

  worker_fn.get_concrete_function()

  executor_t3 = executor.new_executor(enable_async=False)
  executor_t4 = executor.new_executor(enable_async=False)

  num_calls = 10
  self._coord = coordinator.Coordinator()

  def thread_fn(executor_obj, device, results):
    with self._coord.stop_on_exception():
      for i in range(num_calls):
        with context.executor_scope(executor_obj):
          with ops.device(device):
            results[i] = worker_fn()

  def update_server_def_fn():
    with self._coord.stop_on_exception():
      for _ in range(30):
        context.update_server_def(self.server_def_s1_s2_s3_s4)

  t3_results = [None] * num_calls
  t4_results = [None] * num_calls
  threads = []
  threads.append(
      threading.Thread(
          target=thread_fn, args=(executor_t3, self.device_t3, t3_results)))
  threads.append(
      threading.Thread(
          target=thread_fn, args=(executor_t4, self.device_t4, t4_results)))
  threads.append(threading.Thread(target=update_server_def_fn))
  for t in threads:
    t.start()
  self._coord.join(threads)

  # Cannot assert individual values since the results are non-deterministic.
  # By summing up the values we ensure that they are all reasonable and valid
  # numbers (not `None` or `NaN`).
  total = np.sum(t3_results + t4_results)
  self.assertGreater(total, 0)
def testTwoExecutors(self):
  # Run an op on the main executor that by default uses StreamingEnqueue to
  # schedule the op to run on the remote async executor. This op produces an
  # error, i.e., division by zero, but it will not be immediately caught due
  # to streaming enqueue.
  with ops.device('job:worker/replica:0/task:0/device:CPU:0'):
    a = constant_op.constant(3)
    b = constant_op.constant(0)
    math_ops.div(a, b)

  # Run another op using another executor that disables streaming enqueue,
  # which would run the op using the tf_compute thread pool in the remote
  # worker. Since the op is not run in the same remote async executor, it
  # will not carry back the error produced by the op above, even though this
  # op is executed synchronously.
  with context.executor_scope(
      executor.new_executor(
          enable_async=False, enable_streaming_enqueue=False)):
    with ops.device('job:worker/replica:0/task:0/device:CPU:0'):
      c = constant_op.constant(4)
      d = constant_op.constant(2)
      self.assertEqual(math_ops.div(c, d).numpy(), 2)

  # Sync on the context to force catching the error produced by the first op.
  with self.assertRaises(errors.InvalidArgumentError) as cm:
    context.async_wait()
  self.assertIn('division by zero', cm.exception.message)
def __call__(self, device, token, args):
  """Passes `args` to `self._func`, which is executed eagerly."""
  func_executor = executor.new_executor(context.is_async())
  try:
    with context.executor_scope(func_executor):
      with context.eager_mode(), backprop.GradientTape() as tape:
        # Only watch tensors with a floating dtype.
        for tensor in args:
          for t in nest.flatten(tensor):
            if t.dtype.is_floating:
              tape.watch(t)
        ret = self._func(*args)
        # Use tf.identity to copy the returned tensors to device if
        # necessary.
        with ops.device(device):
          if isinstance(ret, (tuple, list)):
            outputs = [
                array_ops.identity(self._convert(x, dtype=dtype))
                for (x, dtype) in zip(ret, self._out_dtypes)
            ]
          elif ret is None:
            outputs = None
          else:
            outputs = array_ops.identity(
                self._convert(ret, dtype=self._out_dtypes[0]))
      tape_cache[compat.as_bytes(token)] = (tape, args, outputs)
      return outputs
  finally:
    # Drain the executor even on the return path; a bare call placed after
    # `return` would never run.
    func_executor.wait()
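# The `_convert` helper used above is referenced but not shown in this
# snippet. Below is a minimal sketch of what such a converter might look
# like, assuming it only needs to coerce each returned value to the declared
# output dtype; the name `_convert_sketch` and the `None` handling are
# assumptions for illustration, not the library's actual implementation.
def _convert_sketch(self, value, dtype):
  if value is None:
    # Hypothetical: map a missing return to a zero scalar of the expected
    # dtype so callers always receive a tensor.
    return constant_op.constant(0.0, dtype=dtype)
  # Otherwise defer to the standard tensor conversion.
  return ops.convert_to_tensor(value, dtype=dtype)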
def __init__(self, worker_index, device_name, cluster):
  self.worker_index = worker_index
  self.device_name = device_name
  self.executor = executor.new_executor(enable_async=False)
  self.failure_handler = cluster.failure_handler
  self._cluster = cluster
  self._resource_remote_value_refs = []

  # Worker threads need to start after `Worker`'s initialization.
  threading.Thread(
      target=self._process_queue,
      name="WorkerClosureProcessingLoop-%d" % self.worker_index,
      daemon=True).start()
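# `_process_queue`, the thread target above, is not shown here. A minimal
# sketch of what such a loop could look like, assuming the cluster exposes a
# queue of pending closures and each closure can execute itself; the
# `closure_queue` and `execute` names are hypothetical, for illustration
# only.
def _process_queue_sketch(self):
  while True:
    closure = self._cluster.closure_queue.get()  # hypothetical queue API
    # Run each closure under this worker's own synchronous executor and
    # device, so closures bound to different workers never share an op
    # queue.
    with context.executor_scope(self.executor):
      with ops.device(self.device_name):
        closure.execute()  # hypothetical closure API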
def testCancelGetNextWithDevice(self, cls):
  ping = data_flow_ops.FIFOQueue(capacity=2, dtypes=dtypes.int64)
  pong = data_flow_ops.FIFOQueue(capacity=2, dtypes=dtypes.int64)

  @def_function.function
  def map_fn(v):
    ball = ping.dequeue()
    with ops.control_dependencies([pong.enqueue(ball)]):
      return v + ping.dequeue()

  dataset = dataset_ops.Dataset.range(10)
  dataset = dataset.map(map_fn)

  # We need to set prefetch_buffer_size=0 so that we can cancel the
  # MultiDeviceIteratorGetNextFromShardOp from eager. If
  # prefetch_buffer_size>0, that op runs in the background threads of the
  # prefetch and can only be cancelled by deleting the iterator.
  multi_device_iterator = cls(
      dataset, [self._devices[1], self._devices[2]], prefetch_buffer_size=0)

  @def_function.function
  def get_next_device1():
    return multi_device_iterator.get_next(self._devices[1])

  async_executor = executor.new_executor(enable_async=True)
  with context.executor_scope(async_executor):
    cancel_mgr = cancellation.CancellationManager()
    cancel_mgr.get_cancelable_function(
        get_next_device1.get_concrete_function())()
  # Make sure we cancel in the middle of get_next.
  ping.enqueue(0)
  pong.dequeue()
  cancel_mgr.start_cancel()
  with self.assertRaises(errors.CancelledError):
    async_executor.wait()
  # Note that fetching from the upstream iterator is not cancelled with the
  # cancellation of get_next.
  ping.enqueue(0)

  # Cancelling a get_next on one device shouldn't cancel the
  # multi_device_iterator and iterators on other devices.
  ping.enqueue(0)
  ping.enqueue(0)
  self.assertEqual(1,
                   multi_device_iterator.get_next(self._devices[2]).numpy())

  # FIXME(b/209534797): Workaround an asan error caused by this test.
  # Remove the dangling reference from tf.function to ensure queue objects
  # are not freed before they are flushed.
  import gc  # pylint: disable=g-import-not-at-top
  del get_next_device1
  gc.collect()
def testPyFunctionAsync(self):

  def simple_fn(v):
    one = constant_op.constant(1.)
    return v + one

  @def_function.function
  def test_fn(v):
    return script_ops.eager_py_func(simple_fn, [v], dtypes.float32)

  async_executor = executor.new_executor(enable_async=True)
  with context.executor_scope(async_executor):
    test_var = variables.Variable(2.)
    self.assertAllEqual(test_fn(test_var), 3.0)
  async_executor.wait()
def __init__(self,
             devices,
             group_size,
             collective_keys=None,
             communication=CollectiveCommunication.AUTO):
  """Initializes the object.

  Args:
    devices: a list of device strings to run collectives on.
    group_size: the global group size. For between-graph replicated training
      it's the total number of devices across all workers.
    collective_keys: an optional CollectiveKeys object.
    communication: indicates which collective communication to use.
  """
  if group_size % len(devices) > 0:
    raise ValueError(
        "group_size must be divisible by the number of devices.")

  self._devices = tuple(device_util.canonicalize(d) for d in devices)
  self._group_size = group_size
  self._collective_keys = (collective_keys or
                           cross_device_utils.CollectiveKeys())
  self._communication = communication
  # This lock guards all collective launches, i.e. calls to
  # cross_device_utils.build_collective_*.
  #
  # In a multi-threaded eager program we need to ensure different groups of
  # collectives don't interleave each other, otherwise there could be
  # deadlocks. E.g. if two user threads both are launching collectives:
  #   user-thread-0  device0  device1
  #   user-thread-1  device0  device1
  # In eager mode, we use one executor per device. Executors use single FIFO
  # queues, so the above launch sequences end up with the following queues:
  #   device-0  collective-0  collective-1
  #   device-1  collective-1  collective-0
  # This deadlocks since neither collective is able to finish.
  self._lock = threading.Lock()

  # Collective ops require all devices to participate and are blocking. In
  # eager, we need one async executor for each device to be able to launch
  # them altogether. Note that async doesn't imply concurrency. Within an
  # async executor operations are still executed sequentially. In graph or
  # function building, the executors are not used.
  self._executors = []
  for _ in range(len(devices)):
    self._executors.append(executor.new_executor(enable_async=True))

  super(CollectiveAllReduce, self).__init__()
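# A minimal sketch (not a method of the real class) of how the lock and the
# per-device async executors described above combine when launching one
# group of collectives. The method name and the group/instance keys are
# placeholders, and `collective_ops.all_reduce_v2` stands in for whatever
# the real launch helpers build.
def _launch_group_sketch(self, per_device_tensors):
  results = []
  with self._lock:  # serialize whole launch groups to avoid interleaving
    for device, tensor, exc in zip(self._devices, per_device_tensors,
                                   self._executors):
      # Each device's collective is enqueued on that device's own async
      # executor, so every device's op is in flight before any one of them
      # blocks waiting for the group.
      with context.executor_scope(exc):
        with ops.device(device):
          results.append(
              collective_ops.all_reduce_v2(
                  tensor, group_size=self._group_size, group_key=1,
                  instance_key=1))
  for exc in self._executors:
    exc.wait()  # surface errors and ensure the whole group completed
  return results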
def testRemoteFunctionCancellation(self):
  context._reset_context()
  logical_devices = []
  logical_devices.append(context.LogicalDeviceConfiguration())
  logical_devices.append(context.LogicalDeviceConfiguration())
  framework_config.set_logical_device_configuration(
      framework_config.list_physical_devices("CPU")[0], logical_devices)

  @function.Defun(dtypes.float32)
  def _remote_fn(v):
    # We run two collectives here to make sure we cancel in the middle of
    # the RemoteCall. The second one should never finish.
    anchor = collective_ops.all_reduce_v2(
        v, group_size=2, group_key=1, instance_key=1)
    with ops.control_dependencies([anchor]):
      return collective_ops.all_reduce_v2(
          v, group_size=2, group_key=1, instance_key=2)

  @eager_def_function.function
  def run():
    with ops.device("/cpu:0"):
      return functional_ops.remote_call(
          args=[constant_op.constant([1.])],
          Tout=[dtypes.float32],
          f=_remote_fn,
          target="/cpu:1")[0]

  async_executor = executor.new_executor(enable_async=True)
  cancel_mgr = cancellation.CancellationManager()
  with context.executor_scope(async_executor):
    # This should never finish.
    cancel_mgr.get_cancelable_function(run.get_concrete_function())()
  with ops.device("/cpu:0"):
    collective_ops.all_reduce_v2(
        [1.], group_size=2, group_key=1, instance_key=1)
  cancel_mgr.start_cancel()
  with self.assertRaises(errors.CancelledError):
    async_executor.wait()