def testRemoteDeviceInPartitionedCallOp(self):
  workers, _ = test_util.create_local_cluster(2, 0)

  worker0_device = "/job:worker/replica:0/task:0/cpu:0"
  worker1_device = "/job:worker/replica:0/task:1/cpu:0"

  @eager_def_function.function
  def f(a, b):
    return a + b

  with session.Session(workers[0].target) as sess:
    with ops.device(worker0_device):
      a = variable_scope.get_variable(
          "a", initializer=constant_op.constant(1.), use_resource=True)
    with ops.device(worker1_device):
      b = variable_scope.get_variable(
          "b", initializer=constant_op.constant(1.), use_resource=True)

    sess.run(variables.global_variables_initializer())

  config = config_pb2.ConfigProto()
  config.experimental.share_cluster_devices_in_session = True

  with session.Session(workers[0].target, config=config) as sess:
    res = sess.run(f(a, b))

  self.assertEqual(res, 2)
def testDistributedOOM(self):
  if not test.is_gpu_available():
    return
  ops.reset_default_graph()

  workers, _ = test_util.create_local_cluster(2, 0)

  with ops.device('/job:worker/replica:0/task:0/gpu:0'):
    a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
  with ops.device('/job:worker/replica:0/task:1/gpu:0'):
    b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
    c = a * b

  try:
    with session.Session(workers[1].target) as sess:
      sess.run(c, options=config_pb2.RunOptions(
          report_tensor_allocations_upon_oom=True))
  except Exception as e:  # pylint: disable=broad-except
    exception_str = '%s' % e
    # test_random2 is reported because it's allocated in worker 1.
    self.assertTrue('Current usage from device: '
                    '/job:worker/replica:0/task:1/device:GPU:0, '
                    'allocator: GPU_0_bfc' in exception_str)
    mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
                    exception_str)
    self.assertGreater(float(mat.group(1)), 0.0)
    # test_random1 is not reported because it's allocated in worker 0.
    mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
                    exception_str)
    self.assertTrue(mat is None)
def testImplicitDisposeParallelMapDataset(self):
  # Tests whether a parallel map dataset will be cleaned up correctly when
  # the pipeline does not run it until exhaustion.
  # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
  # RepeatDataset(None) -> PrefetchDataset(10000).
  worker, _ = test_util.create_local_cluster(1, 1)

  components = (np.arange(1000),
                np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
                np.array(37.0) * np.arange(1000))

  def _map_fn(x, y, z):
    return math_ops.square(x), math_ops.square(y), math_ops.square(z)

  dataset = (
      dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
      .repeat(None).prefetch(10000))
  iterator = dataset.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with session.Session(worker[0].target) as sess:
    self.evaluate(init_op)
    for _ in range(3):
      sess.run(get_next)
def testRemoteFunction(self):
  worker_config = config_pb2.ConfigProto()
  worker_config.device_count["CPU"] = 2
  worker, _ = test_util.create_local_cluster(
      1, 1, worker_config=worker_config)

  @function.Defun(dtypes.int32, dtypes.int32)
  def _remote_fn(a, b):
    return math_ops.multiply(a, b)

  with ops.device("/job:ps/task:0"):
    a = variables.Variable(2, dtype=dtypes.int32)
    b = variables.Variable(3, dtype=dtypes.int32)

  with ops.device("/job:worker/replica:0/task:0/cpu:0"):
    remote_op = functional_ops.remote_call(
        args=[a, b],
        Tout=[dtypes.int32],
        f=_remote_fn,
        target="/job:worker/replica:0/task:0/cpu:1")

  with session.Session(worker[0].target) as sess:
    self.evaluate(variables.global_variables_initializer())
    mul = self.evaluate(remote_op)
    self.assertEqual(mul, [6])
def testCaptureHashTableInSharedIterator(self):
  worker, _ = test_util.create_local_cluster(1, 1)

  # NOTE(mrry): We must use the V2 variants of `HashTable`
  # etc. because these produce a `tf.resource`-typed output that is
  # compatible with the in-graph function implementation.
  default_val = -1
  keys = constant_op.constant(["brain", "salad", "surgery"])
  values = constant_op.constant([0, 1, 2], dtypes.int64)
  table = lookup_ops.HashTable(
      lookup_ops.KeyValueTensorInitializer(keys, values),
      default_val,
      shared_name="shared_table")

  input_sentences = dataset_ops.Dataset.from_tensor_slices(
      ["brain brain tank salad surgery", "surgery brain"])

  iterator = (
      input_sentences.map(lambda x: string_ops.string_split([x]).values).map(
          table.lookup)
      .make_initializable_iterator(shared_name="shared_iterator"))
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with session.Session(worker[0].target) as sess:
    self.evaluate(table.initializer)
    self.evaluate(init_op)
    self.assertAllEqual([0, 0, -1, 1, 2], self.evaluate(get_next))

  with session.Session(worker[0].target) as sess:
    self.assertAllEqual([2, 0], self.evaluate(get_next))
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def testRemoteIteratorUsingRemoteCallOp(self):
  worker_config = config_pb2.ConfigProto()
  worker_config.device_count["CPU"] = 2
  worker, _ = test_util.create_local_cluster(
      1, 1, worker_config=worker_config)

  self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
                                 "/job:worker/replica:0/task:0/cpu:1",
                                 worker[0].target)
def setUpClass(cls):
  # We have to create a global in-process cluster because once an in-process
  # tensorflow server is created, there is no way to terminate it. Please see
  # multi_worker_test_base.py for more details.
  cls._workers, cls._ps = test_util.create_local_cluster(
      NUM_WORKERS, num_ps=NUM_PS)
  cls._cluster_spec = {
      WORKER: [_bytes_to_str(w.target) for w in cls._workers],
      PS: [_bytes_to_str(ps.target) for ps in cls._ps]
  }
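# A minimal sketch of how a spec dict like `cls._cluster_spec` above can be
# consumed, assuming WORKER == "worker" and PS == "ps" and that the
# "grpc://" scheme is stripped from each target first (a ClusterSpec
# conventionally holds bare host:port addresses). The helper name is
# illustrative, not part of the original code.
from tensorflow.python.training import server_lib


def _as_cluster_spec(spec_dict):
  # Strip the grpc:// scheme from every address, per job.
  stripped = {
      job: [addr.replace("grpc://", "") for addr in addrs]
      for job, addrs in spec_dict.items()
  }
  return server_lib.ClusterSpec(stripped)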
def _test_device_and_input_device_are_colocated(self, strategy):
  if context.executing_eagerly():
    self.skipTest(
        "cross-device tests are not supported with eager execution.")
  workers, _ = test_util.create_local_cluster(2, 0)
  inputs = strategy.make_input_fn_iterator(
      lambda _: dataset_ops.Dataset.range(5))
  comm_fn = lambda x: x + 1
  run_op = strategy.experimental_run(comm_fn, inputs)
  with session_lib.Session(target=workers[1].target) as sess:
    sess.run(inputs.initialize())
    sess.run(run_op)
def testSerializeListWithUnknownRank(self):
  worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
  with ops.Graph().as_default(), session.Session(target=worker.target):
    with ops.device("/job:worker"):
      t = constant_op.constant([[1.0], [2.0]])
      l = list_ops.tensor_list_from_tensor(t, element_shape=None)
    with ops.device("/job:ps"):
      l_ps = array_ops.identity(l)
      element_shape = list_ops.tensor_list_element_shape(
          l_ps, shape_type=dtypes.int32)
    with ops.device("/job:worker"):
      element_shape = array_ops.identity(element_shape)
      self.assertEqual(self.evaluate(element_shape), -1)
def testSerialize(self):
  worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
  with ops.Graph().as_default(), session.Session(target=worker.target):
    with ops.device("/job:worker"):
      t = constant_op.constant([[1.0], [2.0]])
      l = list_ops.tensor_list_from_tensor(t, element_shape=[1])
    with ops.device("/job:ps"):
      l_ps = array_ops.identity(l)
      l_ps, e = list_ops.tensor_list_pop_back(
          l_ps, element_dtype=dtypes.float32)
    with ops.device("/job:worker"):
      worker_e = array_ops.identity(e)
      self.assertAllEqual(self.evaluate(worker_e), [2.0])
def testRemoteIteratorUsingRemoteCallOp(self):
  worker_config = config_pb2.ConfigProto()
  worker_config.device_count["CPU"] = 2
  worker, _ = test_util.create_local_cluster(
      1, 1, worker_config=worker_config)

  with ops.device("/job:worker/replica:0/task:0/cpu:1"):
    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
    iterator_3 = dataset_3.make_one_shot_iterator()
    iterator_3_handle = iterator_3.string_handle()

  @function.Defun(dtypes.string)
  def _remote_fn(h):
    remote_iterator = dataset_ops.Iterator.from_string_handle(
        h, dataset_3.output_types, dataset_3.output_shapes)
    return remote_iterator.get_next()

  with ops.device("/job:worker/replica:0/task:0/cpu:0"):
    target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
    remote_op = functional_ops.remote_call(
        args=[iterator_3_handle],
        Tout=[dtypes.int32],
        f=_remote_fn,
        target=target_placeholder)

  with session.Session(worker[0].target) as sess:
    elem = sess.run(
        remote_op,
        feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"})
    self.assertEqual(elem, [1])

    # Fails when target is cpu:0 where the resource is not located.
    with self.assertRaises(errors.InvalidArgumentError):
      sess.run(
          remote_op,
          feed_dict={
              target_placeholder: "/job:worker/replica:0/task:0/cpu:0"
          })

    elem = sess.run(
        remote_op,
        feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"})
    self.assertEqual(elem, [2])

    elem = sess.run(
        remote_op,
        feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"})
    self.assertEqual(elem, [3])

    with self.assertRaises(errors.OutOfRangeError):
      sess.run(
          remote_op,
          feed_dict={
              target_placeholder: "/job:worker/replica:0/task:0/cpu:1"
          })
def testSerializeListWithInvalidTensors(self):
  worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
  with ops.Graph().as_default(), session.Session(target=worker.target):
    with ops.device("/job:worker"):
      l = list_ops.tensor_list_reserve(
          element_dtype=dtypes.float32, element_shape=[], num_elements=2)
      l = list_ops.tensor_list_set_item(l, 0, 1.)
    with ops.device("/job:ps"):
      l_ps = array_ops.identity(l)
      l_ps = list_ops.tensor_list_set_item(l_ps, 1, 2.)
      t = list_ops.tensor_list_stack(l_ps, element_dtype=dtypes.float32)
    with ops.device("/job:worker"):
      worker_t = array_ops.identity(t)
      self.assertAllEqual(self.evaluate(worker_t), [1.0, 2.0])
def setUpClass(cls):
  """Create a local cluster with 2 workers."""
  num_workers = 2
  # Leave some memory for the CUDA runtime.
  gpu_mem_frac = 0.7 / num_workers
  default_config = config_pb2.ConfigProto()
  default_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac

  # The local cluster takes some portion of the local GPUs, and there is no
  # way for the cluster to terminate unless using multiple processes.
  # Therefore, we have to create only one cluster throughout a test process.
  workers, _ = test_util.create_local_cluster(
      num_workers, num_ps=0, worker_config=default_config)
  cls._master_target = workers[0].target
def _test_device_and_input_device_are_colocated_with_function(self, strategy):
  if context.executing_eagerly():
    self.skipTest(
        "cross-device tests are not supported with eager execution.")
  workers, _ = test_util.create_local_cluster(2, 0)
  inputs = strategy.make_input_fn_iterator(
      lambda _: dataset_ops.Dataset.range(5))
  comm_fn = lambda x: x + 1
  experimental_run = def_function.function()(strategy.experimental_run)
  with ops.device("/job:worker/replica:0/task:1/device:CPU:0"):
    # The tf.function must be defined on the right device as well.
    run_op = experimental_run(comm_fn, inputs)
  with session_lib.Session(target=workers[1].target) as sess:
    sess.run(inputs.initialize())
    sess.run(run_op)
def testRemoteIteratorWithoutRemoteCallFail(self):
  worker_config = config_pb2.ConfigProto()
  worker_config.device_count["CPU"] = 2
  worker, _ = test_util.create_local_cluster(
      1, 1, worker_config=worker_config)

  with ops.device("/job:worker/replica:0/task:0/cpu:1"):
    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
    iterator_3 = dataset_3.make_one_shot_iterator()
    iterator_3_handle = iterator_3.string_handle()

  with ops.device("/job:worker/replica:0/task:0/cpu:0"):
    remote_it = dataset_ops.Iterator.from_string_handle(
        iterator_3_handle, dataset_3.output_types, dataset_3.output_shapes)
    get_next_op = remote_it.get_next()

  with session.Session(worker[0].target) as sess:
    with self.assertRaises(errors.InvalidArgumentError):
      sess.run(get_next_op)
def testSerializeListWithMaxNumElements(self):
  if context.num_gpus():
    # TODO(b/119151861): Enable on GPU.
    return
  worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
  with ops.Graph().as_default(), session.Session(target=worker.target):
    with ops.device("/job:worker"):
      l = list_ops.empty_tensor_list(
          element_shape=-1, element_dtype=dtypes.float32, max_num_elements=2)
      l = list_ops.tensor_list_push_back(l, 1.)
    with ops.device("/job:ps"):
      l_ps = array_ops.identity(l)
      l_ps = list_ops.tensor_list_push_back(l_ps, 2.)
    with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                 "Tried to push item into a full list"):
      with ops.device("/job:worker"):
        l_worker = array_ops.identity(l_ps)
        l_worker = list_ops.tensor_list_push_back(l_worker, 3.0)
        self.evaluate(l_worker)
def testRemoteFunctionCrossProcess(self):
  workers, _ = test_util.create_local_cluster(2, 1)

  @function.Defun(dtypes.float32, dtypes.float32)
  def _remote_fn(a, b):
    return math_ops.multiply(a, b)

  with ops.device("/job:ps/task:0"):
    a = variables.Variable(2, dtype=dtypes.float32)
    b = variables.Variable(3, dtype=dtypes.float32)

  with ops.device("/job:worker/replica:0/task:0/cpu:0"):
    remote_op = functional_ops.remote_call(
        args=[a, b],
        Tout=[dtypes.float32],
        f=_remote_fn,
        target="/job:worker/replica:0/task:1/cpu:0")[0] + 3.0

  with session.Session(workers[0].target) as sess:
    sess.run(variables.global_variables_initializer())
    mul = sess.run(remote_op)
    self.assertEqual(mul, 9)
def testRemoteFunctionCrossProcess(self):
  workers, _ = test_util.create_local_cluster(2, 1)

  @function.Defun(dtypes.float32, dtypes.float32)
  def _remote_fn(a, b):
    return math_ops.multiply(a, b)

  with ops.device("/job:ps/task:0"):
    a = variables.Variable(2, dtype=dtypes.float32)
    b = variables.Variable(3, dtype=dtypes.float32)

  with ops.device("/job:worker/replica:0/task:0/cpu:0"):
    remote_op = functional_ops.remote_call(
        args=[a, b],
        Tout=[dtypes.float32],
        f=_remote_fn,
        target="/job:worker/replica:0/task:1/cpu:0")[0] + 3.0

  with session.Session(workers[0].target) as sess:
    self.evaluate(variables.global_variables_initializer())
    mul = self.evaluate(remote_op)
    self.assertEqual(mul, 9)
def create_in_process_cluster(num_workers, num_ps):
  """Create an in-process cluster that consists of only standard servers."""
  # Leave some memory for the CUDA runtime.
  gpu_mem_frac = 0.7 / num_workers
  worker_config = config_pb2.ConfigProto()
  worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac

  ps_config = config_pb2.ConfigProto()
  ps_config.device_count['GPU'] = 0

  # Create in-process servers. Once an in-process tensorflow server is
  # created, there is no way to terminate it, so we create one cluster per
  # test process. We could have started the servers in separate processes
  # and killed those processes to terminate the servers, but we avoid
  # multiple processes because
  # 1) it is more difficult to manage these processes;
  # 2) there is global state in CUDA such that if we initialize CUDA in the
  #    parent process, the child process cannot initialize it again and thus
  #    cannot use GPUs (https://stackoverflow.com/questions/22950047).
  return test_util.create_local_cluster(
      num_workers,
      num_ps=num_ps,
      worker_config=worker_config,
      ps_config=ps_config)
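# A minimal usage sketch for the helper above, assuming graph mode and the
# same `session`/`ops`/`variables` imports used elsewhere in this section;
# the two-worker, one-ps topology is illustrative.
workers, ps_servers = create_in_process_cluster(num_workers=2, num_ps=1)
with session.Session(workers[0].target) as sess:
  with ops.device("/job:ps/task:0"):
    v = variables.Variable(1.0)
  sess.run(variables.global_variables_initializer())
  print(sess.run(v))  # 1.0, stored on the parameter server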
def testRemoteIteratorUsingRemoteCallOpCrossProcess(self):
  workers, _ = test_util.create_local_cluster(2, 1)

  self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
                                 "/job:worker/replica:0/task:1/cpu:0",
                                 workers[0].target)
def test3Workers1Backup(self):
  num_workers = 3
  replicas_to_aggregate = 2
  num_ps = 2
  workers, _ = create_local_cluster(num_workers=num_workers, num_ps=num_ps)

  # Creates and returns all the workers.
  sessions, graphs, train_ops = get_workers(num_workers,
                                            replicas_to_aggregate, workers)

  # Chief should have already initialized all the variables.
  var_0_g_1 = graphs[1].get_tensor_by_name("v0:0")
  var_1_g_1 = graphs[1].get_tensor_by_name("v1:0")
  local_step_1 = graphs[1].get_tensor_by_name("sync_rep_local_step:0")
  global_step = graphs[1].get_tensor_by_name("global_step:0")

  # The steps should also be initialized.
  self.assertAllEqual(0, sessions[1].run(global_step))
  self.assertAllEqual(0, sessions[1].run(local_step_1))

  # We have initial tokens in the queue so we can call this one by one. After
  # the token queue becomes empty, they should be called concurrently.
  # Here worker 0 and worker 2 finished first.
  sessions[0].run(train_ops[0])
  sessions[2].run(train_ops[2])

  # The global step should have been updated since we only need to collect 2
  # gradients. The variables should now have the new values after the average
  # of the gradients from worker 0/2 are applied.
  while sessions[1].run(global_step) != 1:
    time.sleep(0.01)

  self.assertAllEqual(1, sessions[1].run(global_step))
  self.assertAllClose(0 - (0.1 + 0.5) / 2 * 2.0, sessions[1].run(var_0_g_1))
  self.assertAllClose(1 - (0.9 + 1.3) / 2 * 2.0, sessions[1].run(var_1_g_1))

  # Worker 1 finished later and its gradients will now be dropped as it is
  # stale.
  sessions[1].run(train_ops[1])

  # As shown in the previous test, the local_step for all workers should be
  # still 0 so their next computation will also be dropped.
  sessions[0].run(train_ops[0])
  sessions[1].run(train_ops[1])
  sessions[2].run(train_ops[2])

  # Although the global step should still be 1 as explained above, the local
  # step should now be updated to 1. Just check worker 1 as an example.
  self.assertAllEqual(1, sessions[1].run(global_step))
  self.assertAllEqual(1, sessions[1].run(local_step_1))

  thread_0 = self.checkedThread(
      target=self._run, args=(train_ops[0], sessions[0]))
  thread_1 = self.checkedThread(
      target=self._run, args=(train_ops[1], sessions[1]))

  # Lets worker 0 execute first.
  # It will wait as we need 2 workers to finish this step and the global step
  # should be still 1.
  thread_0.start()
  self.assertAllEqual(1, sessions[1].run(global_step))

  # Starts worker 1.
  thread_1.start()
  thread_1.join()
  thread_0.join()

  # The global step should now be 2 and the gradients should have been
  # applied again.
  self.assertAllEqual(2, sessions[1].run(global_step))
  self.assertAllClose(-0.6 - (0.1 + 0.3) / 2 * 2.0,
                      sessions[1].run(var_0_g_1))
  self.assertAllClose(-1.2 - (0.9 + 1.1) / 2 * 2.0,
                      sessions[1].run(var_1_g_1))
def test2Workers(self):
  num_workers = 2
  replicas_to_aggregate = 2
  num_ps = 2
  workers, _ = create_local_cluster(num_workers=num_workers, num_ps=num_ps)

  # Creates and returns all the workers.
  sessions, graphs, train_ops = get_workers(num_workers,
                                            replicas_to_aggregate, workers)

  # Chief should have already initialized all the variables.
  var_0_g_0 = graphs[0].get_tensor_by_name("v0:0")
  var_1_g_0 = graphs[0].get_tensor_by_name("v1:0")
  local_step_0 = graphs[0].get_tensor_by_name("sync_rep_local_step:0")
  self.assertAllEqual(0.0, sessions[0].run(var_0_g_0))
  self.assertAllEqual(1.0, sessions[0].run(var_1_g_0))
  self.assertAllEqual(0, sessions[0].run(local_step_0))

  # Will just use session 1 to verify all the variables later.
  var_0_g_1 = graphs[1].get_tensor_by_name("v0:0")
  var_1_g_1 = graphs[1].get_tensor_by_name("v1:0")
  var_sparse_g_1 = graphs[1].get_tensor_by_name("v_sparse:0")
  local_step_1 = graphs[1].get_tensor_by_name("sync_rep_local_step:0")
  global_step = graphs[1].get_tensor_by_name("global_step:0")

  # The steps should also be initialized.
  self.assertAllEqual(0, sessions[1].run(global_step))
  self.assertAllEqual(0, sessions[1].run(local_step_1))
  self.assertAllClose([[3.0], [4.0]], sessions[1].run(var_sparse_g_1))

  # We have initial tokens in the queue so we can call this one by one. After
  # the first step, this will no longer work as there will be no more extra
  # tokens in the queue.
  sessions[0].run(train_ops[0])
  sessions[1].run(train_ops[1])

  # The global step should have been updated and the variables should now
  # have the new values after the average of the gradients are applied.
  while sessions[1].run(global_step) != 1:
    time.sleep(0.01)

  self.assertAllClose(0 - (0.1 + 0.3) / 2 * 2.0, sessions[1].run(var_0_g_1))
  self.assertAllClose(1 - (0.9 + 1.1) / 2 * 2.0, sessions[1].run(var_1_g_1))
  self.assertAllClose([[3.0], [4.0 - (0.1 + 0.3) / 2 * 2.0]],
                      sessions[1].run(var_sparse_g_1))

  # The local step for both workers should still be 0 because the initial
  # tokens in the token queue are 0s. This means that the following
  # computation of the gradients will be wasted as local_step is smaller than
  # the current global step. However, this only happens once when the system
  # just starts, and it is necessary to make the system robust for the case
  # when the chief gets restarted by errors/preemption/...
  self.assertAllEqual(0, sessions[0].run(local_step_0))
  self.assertAllEqual(0, sessions[1].run(local_step_1))

  sessions[0].run(train_ops[0])
  sessions[1].run(train_ops[1])

  # Although the global step should still be 1 as explained above, the local
  # step should now be updated to 1. The variables are still the same.
  self.assertAllEqual(1, sessions[1].run(global_step))
  self.assertAllEqual(1, sessions[0].run(local_step_0))
  self.assertAllEqual(1, sessions[1].run(local_step_1))
  self.assertAllClose(0 - (0.1 + 0.3) / 2 * 2.0, sessions[1].run(var_0_g_1))
  self.assertAllClose(1 - (0.9 + 1.1) / 2 * 2.0, sessions[1].run(var_1_g_1))

  # At this step, the token queue is empty. So the 2 workers need to work
  # together to proceed.
  threads = []
  threads.append(
      self.checkedThread(target=self._run, args=(train_ops[0], sessions[0])))
  threads.append(
      self.checkedThread(target=self._run, args=(train_ops[1], sessions[1])))

  # The two workers start to execute the train op.
  for thread in threads:
    thread.start()
  for thread in threads:
    thread.join()

  # The global step should now be 2 and the gradients should have been
  # applied twice.
  self.assertAllEqual(2, sessions[1].run(global_step))
  self.assertAllClose(0 - 2 * (0.1 + 0.3) / 2 * 2.0,
                      sessions[1].run(var_0_g_1))
  self.assertAllClose(1 - 2 * (0.9 + 1.1) / 2 * 2.0,
                      sessions[1].run(var_1_g_1))
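# A plain-Python sanity check of the arithmetic asserted above, assuming
# (as the `* 2.0` factors suggest) a learning rate of 2.0 and plain gradient
# descent on the replica-averaged gradients: var -= lr * mean(grads). The
# gradient values 0.1/0.3 and 0.9/1.1 come from the assertions themselves.
lr = 2.0
v0, v1 = 0.0, 1.0
for _ in range(2):  # two aggregated steps
  v0 -= lr * (0.1 + 0.3) / 2
  v1 -= lr * (0.9 + 1.1) / 2
assert abs(v0 - (0 - 2 * (0.1 + 0.3) / 2 * 2.0)) < 1e-6  # -0.8
assert abs(v1 - (1 - 2 * (0.9 + 1.1) / 2 * 2.0)) < 1e-6  # -3.0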
def setUp(self):
  context.context().soft_device_placement = True
  context.context().log_device_placement = True
  workers, _ = test_util.create_local_cluster(2, 0)
  remote.connect_to_remote_host(
      [workers[0].target, workers[1].target])
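# A minimal sketch of what the connected cluster above enables, assuming
# (as `remote.connect_to_remote_host` does by default) the hosts are exposed
# as tasks of a "worker" job; the op below is illustrative.
with ops.device("/job:worker/replica:0/task:1/device:CPU:0"):
  x = constant_op.constant(1.0) + constant_op.constant(2.0)
print(x.numpy())  # 3.0, computed eagerly on the remote worker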
def setUp(self):
  super(SingleWorkerTest, self).setUp()

  workers, _ = test_util.create_local_cluster(1, 0)
  remote.connect_to_remote_host(workers[0].target)
def setUp(self):
  super(MultiWorkersTest, self).setUp()

  workers, _ = test_util.create_local_cluster(3, 0)
  remote.connect_to_remote_host(
      [workers[0].target, workers[1].target, workers[2].target])
def setUp(self):
  super(SingleWorkerTestBaseEager, self).setUp()
  workers, _ = test_util.create_local_cluster(num_workers=1, num_ps=0)
  remote.connect_to_remote_host(workers[0].target)
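# A hypothetical follow-on for the eager setup above: once connected, a
# tf.function can be pinned to the remote worker. `def_function` here refers
# to the same module used elsewhere in this section
# (tensorflow.python.eager.def_function); the function name is illustrative.
@def_function.function
def remote_add(a, b):
  return a + b


with ops.device("/job:worker/replica:0/task:0/device:CPU:0"):
  result = remote_add(constant_op.constant(1.0), constant_op.constant(2.0))
print(result.numpy())  # 3.0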