def run_independent_workers(self,
                            worker_fn,
                            strategy_cls,
                            num_workers,
                            num_ps=None,
                            **kwargs):
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      num_workers=num_workers, num_ps=num_ps)
  self._barrier = dc._Barrier(num_workers + (num_ps or 0))  # pylint: disable=protected-access

  def _worker_fn(**kwargs):
    """Runs the worker function in a thread."""
    with test.mock.patch.object(dc, '_run_std_server',
                                self._make_mock_run_std_server()):
      strategy = get_strategy_object(strategy_cls)
      with strategy.scope():
        return worker_fn(**kwargs)

  threads = self.run_multiple_tasks_in_threads(_worker_fn, cluster_spec,
                                               **kwargs)
  strategy = get_strategy_object(strategy_cls)
  if strategy.extended.experimental_between_graph:
    threads_to_join = [
        ts for task_type, ts in threads.items() if task_type == 'ps'
    ]
  else:
    threads_to_join = [threads['worker'][0]]
  self.join_independent_workers(threads_to_join)
def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
  num_workers = 2
  num_epoch = 2
  cluster_spec = test_base.create_cluster_spec(
      num_workers=num_workers, num_ps=2)
  self._barrier = dc._Barrier(4)

  # The verification callback will be shared by multiple threads.
  verification_callback = MultiWorkerVerificationCallback(
      num_epoch=num_epoch, num_worker=num_workers)

  def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
    """Simulates an Independent Worker inside of a thread."""
    # TODO(rchao/yuefengz): The following is run by both worker and ps
    # threads. The distribute coordinator should run std server immediately
    # without configuring the session (or building the graph) on PS.
    with test.mock.patch.object(dc, '_run_std_server',
                                self._make_mock_run_std_server()):
      batch_size = 64
      steps = 2
      strategy = strategy_cls()
      verification_callback.is_between_graph = \
          strategy.extended.experimental_between_graph

      train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
      val_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
      with strategy.scope():
        model = _get_model((28, 28, 1))

        # TODO(b/123868066): Verify callback for model.evaluate().
        callbacks_for_fit = nest.flatten(
            kwargs.get('verification_callback', []))
        history = model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            validation_data=val_ds,
            validation_steps=steps,
            callbacks=callbacks_for_fit)
      self.assertIsInstance(history, keras.callbacks.History)

  threads = self.run_multiple_tasks_in_threads(
      _independent_worker_fn,
      cluster_spec,
      verification_callback=verification_callback)

  threads_to_join = []
  for task_type, ts in threads.items():
    # This test can finish once the worker threads complete, and thus
    # the ps threads don't need to be joined.
    if task_type == 'ps':
      continue
    threads_to_join.extend(ts)
  self.join_independent_workers(threads_to_join)
  verification_callback.verify(self)
def test_process_exists(self):

  def fn():
    time.sleep(100000)

  mpr = multi_process_runner.MultiProcessRunner(
      fn, multi_worker_test_base.create_cluster_spec(num_workers=1))
  mpr.start()
  self.assertTrue(mpr.process_exists('worker', 0))
  mpr.terminate('worker', 0)
  # Worker 0 should exit at some point, or else the test would time out.
  while mpr.process_exists('worker', 0):
    time.sleep(1)
def test_error_reporting_overrides_timeout_reporting(self):

  def fn():
    if self._worker_idx() == 1:
      time.sleep(10000)
    raise ValueError('Worker 0 errored')

  mpr = multi_process_runner.MultiProcessRunner(
      fn, multi_worker_test_base.create_cluster_spec(num_workers=2))
  mpr.start()

  with self.assertRaisesRegex(ValueError, 'Worker 0 errored'):
    mpr.join(timeout=20)
def test_template(self, strategy_cls, file_format):
  num_workers = 2
  num_epoch = 2

  cluster_spec = test_base.create_cluster_spec(
      num_workers=num_workers, test_obj=self)
  self._barrier = dc._Barrier(2)

  def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
    """Simulates an Independent Worker inside of a thread."""
    with test.mock.patch.object(dc, '_run_std_server',
                                self._make_mock_run_std_server()):
      strategy = get_strategy_object(strategy_cls)
      batch_size = 64
      steps = 2
      train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(
          batch_size, steps)

      with strategy.scope():
        model = multi_worker_testing_utils.get_mnist_model((28, 28, 1))

      custom_callable(
          model,
          self,
          train_ds,
          num_epoch,
          steps,
          strategy,
          saving_filepath=kwargs['saving_filepath'],
          barrier=kwargs['barrier'],
          threading_local=kwargs['threading_local'])

  # Pass saving_filepath from the parent thread to ensure every worker has the
  # same filepath to save.
  saving_filepath = os.path.join(self.get_temp_dir(),
                                 'checkpoint.' + file_format)
  barrier = dc._Barrier(2)
  threading_local = threading.local()
  threads = self.run_multiple_tasks_in_threads(
      _independent_worker_fn,
      cluster_spec,
      saving_filepath=saving_filepath,
      barrier=barrier,
      threading_local=threading_local)
  self.assertFalse(training_state.checkpoint_exists(saving_filepath))

  threads_to_join = []
  strategy = get_strategy_object(strategy_cls)
  if strategy.extended.experimental_between_graph:
    for ts in threads.values():
      threads_to_join.extend(ts)
  else:
    threads_to_join = [threads['worker'][0]]
  self.join_independent_workers(threads_to_join)
def testCheckHealthInvalidPeer(self):

  def worker_fn():
    enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
    context.context().check_collective_ops_peer_health(
        "localhost:12345", timeout_in_ms=1000)

  cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
  mpr = multi_process_runner.MultiProcessRunner(worker_fn, cluster_spec)
  mpr.start_single_process("worker", 0)
  with self.assertRaises(errors.InvalidArgumentError):
    mpr.join()
def test_auto_restart_and_timeout(self):

  def proc_func():
    time.sleep(1)
    raise ValueError

  mpr = multi_process_runner.MultiProcessRunner(
      proc_func,
      multi_worker_test_base.create_cluster_spec(num_workers=1),
      auto_restart=True)
  mpr.start()
  with self.assertRaises(multi_process_runner.SubprocessTimeoutError):
    mpr.join(timeout=10)
def testArbitraryJobName(self):
  cluster_def = multi_worker_test_base.create_cluster_spec(
      num_workers=1, num_ps=1, has_chief=True)
  cluster_def["some_arbitrary_name"] = [
      "localhost:%d" % multi_worker_test_base.pick_unused_port()
  ]
  cluster_resolver = SimpleClusterResolver(
      ClusterSpec(cluster_def), rpc_layer="grpc")
  with self.assertRaisesRegex(ValueError, "Disallowed task type found in"):
    parameter_server_strategy_v2.ParameterServerStrategyV2(cluster_resolver)
def test_timeout_none(self):

  def proc_func():
    time.sleep(250)
    raise ValueError('Worker 0 errored')

  mpr = multi_process_runner.MultiProcessRunner(
      proc_func, multi_worker_test_base.create_cluster_spec(num_workers=1))

  mpr.start()
  with self.assertRaisesRegex(ValueError, 'Worker 0 errored'):
    mpr.join(timeout=None)
def test_signal_doesnt_fire_after_process_exits(self):
  mpr = multi_process_runner.MultiProcessRunner(
      proc_func_that_does_nothing,
      multi_worker_test_base.create_cluster_spec(num_workers=1),
      max_run_time=10)
  mpr.start()
  mpr.join()
  with self.assertRaises(Queue.Empty):
    # If the signal was fired, another message would be added to the internal
    # queue, so verify it's empty.
    multi_process_runner._resource(
        multi_process_runner.PROCESS_STATUS_QUEUE).get(block=False)
def test_tf_config(self):
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      has_chief=True, num_workers=2)
  runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec)
  result = runner.run(fn_that_adds_task_type_in_return_data)

  job_count_dict = {'worker': 2, 'chief': 1}
  for data in result:
    job_count_dict[data] -= 1
  self.assertEqual(job_count_dict['worker'], 0)
  self.assertEqual(job_count_dict['chief'], 0)
def test_complete_flow_independent_worker_between_graph(
    self, train_distribute_cls, eval_distribute_cls):
  if (context.num_gpus() < 2 and eval_distribute_cls ==
      collective_all_reduce_strategy.CollectiveAllReduceStrategy):
    self.skipTest("`CollectiveAllReduceStrategy` needs at least two towers.")
  train_distribute = self._get_strategy_object(train_distribute_cls)

  if eval_distribute_cls:
    eval_distribute = self._get_strategy_object(
        eval_distribute_cls, eval_strategy=True)
  else:
    eval_distribute = None

  if (train_distribute_cls ==
      parameter_server_strategy.ParameterServerStrategy):
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=2, has_eval=True)
    # 3 workers, 2 ps and 1 evaluator.
    self._barrier = dc._Barrier(6)
  else:
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    # 3 workers and 1 evaluator.
    self._barrier = dc._Barrier(4)

  threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
                                               cluster_spec, train_distribute,
                                               eval_distribute)
  threads_to_join = []
  for task_type, ts in threads.items():
    if task_type == PS:
      continue
    for t in ts:
      threads_to_join.append(t)
  self.join_independent_workers(threads_to_join)

  estimator = self._get_estimator(train_distribute, eval_distribute)
  self._inspect_train_and_eval_events(estimator)
def test_streaming(self):

  def proc_func():
    for i in range(5):
      logging.info('(logging) %s-%d, i: %d',
                   multi_worker_test_base.get_task_type(), self._worker_idx(),
                   i)
      print(
          '(print) {}-{}, i: {}'.format(
              multi_worker_test_base.get_task_type(), self._worker_idx(), i),
          flush=True)
      time.sleep(1)

  mpr = multi_process_runner.MultiProcessRunner(
      proc_func,
      multi_worker_test_base.create_cluster_spec(
          has_chief=True, num_workers=2, num_ps=2, has_eval=True),
      list_stdout=True)

  mpr._dependence_on_chief = False

  mpr.start()
  mpr.start_single_process('worker', 2)
  mpr.start_single_process('ps', 2)
  mpr_result = mpr.join()

  list_to_assert = mpr_result.stdout

  for job in ['chief', 'evaluator']:
    for iteration in range(5):
      self.assertTrue(
          any('(logging) {}-0, i: {}'.format(job, iteration) in line
              for line in list_to_assert))
      self.assertTrue(
          any('(print) {}-0, i: {}'.format(job, iteration) in line
              for line in list_to_assert))

  for job in ['worker', 'ps']:
    for iteration in range(5):
      for task in range(3):
        self.assertTrue(
            any('(logging) {}-{}, i: {}'.format(job, task, iteration) in line
                for line in list_to_assert))
        self.assertTrue(
            any('(print) {}-{}, i: {}'.format(job, task, iteration) in line
                for line in list_to_assert))
      task = 3
      self.assertFalse(
          any('(logging) {}-{}, i: {}'.format(job, task, iteration) in line
              for line in list_to_assert))
      self.assertFalse(
          any('(print) {}-{}, i: {}'.format(job, task, iteration) in line
              for line in list_to_assert))
def test_basic_run(self):
  has_chief = False
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      has_chief=has_chief, num_workers=CLUSTER_SIZE)
  maintenance_event = multi_process_runner.manager().Event()
  training_finished = multi_process_runner.manager().Event()

  checkpoint_dir = os.path.join(self.get_temp_dir(), 'fh_ckpt')

  if _is_oss():
    rpc_layer = 'grpc'
  else:
    rpc_layer = 'grpc+loas'

  mpr = multi_process_runner.MultiProcessRunner(
      self.worker_fn,
      cluster_spec,
      args=(checkpoint_dir, cluster_spec, maintenance_event,
            training_finished),
      rpc_layer=rpc_layer,
      return_output=True,
      dependence_on_chief=has_chief)

  logging.info('Cluster starting.')
  mpr.start()
  while (not maintenance_event.is_set()) and (
      not training_finished.is_set()):
    time.sleep(1)

  time.sleep(5)
  if not training_finished.is_set():
    logging.info('restarting workers')
    for worker_id in range(CLUSTER_SIZE):
      mpr.start_single_process('worker', worker_id, cluster_spec)
    logging.info('workers restarted')

  stdout = mpr.join().stdout
  if maintenance_event.is_set():
    all_start_point = []
    for msg in stdout:
      matched_group = re.search(r'.*Start training at (\d+)', msg)

      if matched_group:
        all_start_point.append(int(matched_group.group(1)))

    # remove duplicate logs created due to presence of multiple workers
    start_points = all_start_point[::CLUSTER_SIZE]

    if len(start_points) > 1:
      # assert that after restarting, we don't repeat previous training steps
      self.assertNotEqual(start_points[-1], 0)
def test_preemption_checkpointing(self):
  has_chief = False
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      has_chief=has_chief, num_workers=CLUSTER_SIZE)
  training_started_event = multi_process_runner.manager().Event()

  checkpoint_dir = os.path.join(self.get_temp_dir(), 'fh_ckpt')

  if _is_oss():
    rpc_layer = 'grpc'
  else:
    rpc_layer = 'grpc+loas'

  mpr = multi_process_runner.MultiProcessRunner(
      self.worker_fn,
      cluster_spec,
      args=(checkpoint_dir, cluster_spec, [training_started_event]),
      rpc_layer=rpc_layer,
      return_output=True,
      dependence_on_chief=has_chief)

  logging.info('Cluster starting.')
  mpr.start()
  while not training_started_event.is_set():
    time.sleep(1)

  logging.info('sending sigterm')
  killed_worker = random.randrange(0, CLUSTER_SIZE)
  os.kill(mpr.get_process_id('worker', killed_worker), signal.SIGTERM)
  logging.info('sigterm sent')

  time.sleep(5)

  logging.info('restarting workers')
  for worker_id in range(CLUSTER_SIZE):
    mpr.start_single_process('worker', worker_id, cluster_spec)
  logging.info('workers restarted')

  stdout = mpr.join().stdout
  all_start_point = []
  for msg in stdout:
    matched_group = re.search(r'.*Restored training at (\d+)', msg)

    if matched_group:
      all_start_point.append(int(matched_group.group(1)))

  # remove duplicate logs created due to presence of multiple workers
  start_points = all_start_point[::CLUSTER_SIZE]

  # assert that after restarting, we don't repeat previous training steps
  self.assertNotEqual(start_points[-1], 0)
def test_model_checkpoint_saves_on_chief_but_not_otherwise(
    self, file_format, mode, save_weights_only):

  def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
      test_obj, file_format):

    model, saving_filepath, train_ds, steps = _model_setup(
        test_obj, file_format)
    num_epoch = 2
    extension = os.path.splitext(saving_filepath)[1]

    # Incorporate type/index information and thread id in saving_filepath to
    # ensure every worker has a unique path. Note that in normal use case the
    # saving_filepath will be the same for all workers, but we use different
    # ones here just to test out chief saves checkpoint but non-chief doesn't.
    task_config = _get_task_config()
    saving_filepath = os.path.join(
        test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
        (task_config['type'], task_config['index'], extension))

    # The saving_filepath shouldn't exist at the beginning (as it's unique).
    test_obj.assertFalse(checkpoint_exists(saving_filepath))

    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        validation_data=train_ds,
        validation_steps=steps,
        callbacks=[
            callbacks.ModelCheckpoint(
                filepath=saving_filepath,
                save_weights_only=save_weights_only)
        ])

    # If it's chief, the model should be saved; if not, the model shouldn't.
    test_obj.assertEqual(
        checkpoint_exists(saving_filepath), test_base.is_chief())

    # If it's chief, the model should be saved (`write_filepath` should
    # simply return `saving_filepath`); if not, i.e. for non-chief workers,
    # the temporary path generated by `write_filepath` should no longer
    # contain the checkpoint that has been deleted.
    test_obj.assertEqual(
        checkpoint_exists(
            distributed_file_utils.write_filepath(
                saving_filepath, model._distribution_strategy)),
        test_base.is_chief())

  multi_process_runner.run(
      proc_model_checkpoint_saves_on_chief_but_not_otherwise,
      cluster_spec=test_base.create_cluster_spec(num_workers=2),
      args=(self, file_format))
def test_multi_process_runner(self):
  mpr_result = multi_process_runner.run(
      proc_func_that_adds_task_type_in_return_data,
      multi_worker_test_base.create_cluster_spec(
          num_workers=2, num_ps=3, has_eval=1))

  job_count_dict = {'worker': 2, 'ps': 3, 'evaluator': 1}
  for data in mpr_result.return_value:
    job_count_dict[data] -= 1

  self.assertEqual(job_count_dict['worker'], 0)
  self.assertEqual(job_count_dict['ps'], 0)
  self.assertEqual(job_count_dict['evaluator'], 0)
def testCheckHealthPeerDown(self):

  def worker_fn():
    enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
    context.context().check_collective_ops_peer_health(
        "/job:worker/replica:0/task:1", timeout_in_ms=1000)

  cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
  mpr = multi_process_runner.MultiProcessRunner(worker_fn, cluster_spec)
  mpr.start_single_process("worker", 0)
  with self.assertRaises(
      (errors.UnavailableError, errors.DeadlineExceededError)):
    mpr.join()
def test_exit_code_is_reported_by_subprocess(self):

  def proc_func_expected_to_exit_with_10():
    sys.exit(10)

  mpr = multi_process_runner.MultiProcessRunner(
      proc_func_expected_to_exit_with_10,
      multi_worker_test_base.create_cluster_spec(num_workers=1))
  mpr.start()

  with self.assertRaisesRegex(
      multi_process_runner.UnexpectedSubprocessExitError,
      'Subprocess worker-0 exited with exit code 10'):
    mpr.join()
def testMoreThanOneChief(self):
  cluster_def = multi_worker_test_base.create_cluster_spec(
      num_workers=1, num_ps=1)
  chief_ports = [multi_worker_test_base.pick_unused_port() for _ in range(3)]
  cluster_def["chief"] = ["localhost:%s" % port for port in chief_ports]
  cluster_resolver = SimpleClusterResolver(
      ClusterSpec(cluster_def),
      rpc_layer="grpc",
      task_type="chief",
      task_id=1)
  with self.assertRaisesRegex(ValueError,
                              "There must be at most one 'chief' job."):
    parameter_server_strategy_v2.ParameterServerStrategyV2(cluster_resolver)
def testSimpleModelIndependentWorkerSync(self, strategy_cls):
  num_workers = 2
  num_epoch = 2

  cluster_spec = test_base.create_cluster_spec(
      num_workers=num_workers, test_obj=self)
  self._barrier = dc._Barrier(2)

  # The verification callback will be shared by multiple threads.
  verification_callback = MultiWorkerVerificationCallback(
      num_epoch=num_epoch, num_worker=num_workers)

  def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
    """Simulates an Independent Worker inside of a thread."""
    with test.mock.patch.object(dc, '_run_std_server',
                                self._make_mock_run_std_server()):
      strategy = strategy_cls()
      verification_callback.is_between_graph = \
          strategy.extended.experimental_between_graph
      batch_size = 64
      steps = 2
      train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(
          batch_size, steps)

      with strategy.scope():
        model = multi_worker_testing_utils.get_mnist_model((28, 28, 1))
        orig_loss, _ = model.evaluate(train_ds, steps=steps)
        callbacks_for_fit = nest.flatten(
            kwargs.get('verification_callback', []))
        history = model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=callbacks_for_fit)
        self.assertIsInstance(history, keras.callbacks.History)
        trained_loss, _ = model.evaluate(train_ds, steps=steps)
        self.assertLess(trained_loss, orig_loss)

  threads = self.run_multiple_tasks_in_threads(
      _independent_worker_fn,
      cluster_spec,
      verification_callback=verification_callback)

  threads_to_join = []
  strategy = strategy_cls()
  if strategy.extended.experimental_between_graph:
    for ts in threads.values():
      threads_to_join.extend(ts)
  else:
    threads_to_join = [threads['worker'][0]]
  self.join_independent_workers(threads_to_join)
  verification_callback.verify(self)
def runner(self):
  if not self._runner:
    if (_num_total_workers(self.has_chief, self.num_workers) > 1 and
        self.use_pool_runner):
      # Need to create the strategy in the initializer so that collectives
      # are configured before eager context initialization.
      cluster_spec = multi_worker_test_base.create_cluster_spec(
          has_chief=self.has_chief,
          num_workers=self.num_workers,
          num_ps=0,
          has_eval=False)
      self._runner = multi_process_runner.MultiProcessPoolRunner(
          cluster_spec, initializer=self._distribution_fn)
  return self._runner
def test_seg_fault_raises_error(self):

  def proc_func_expected_to_seg_fault():
    ctypes.string_at(0)  # Intentionally made seg fault.

  with self.assertRaises(
      multi_process_runner.UnexpectedSubprocessExitError) as cm:
    multi_process_runner.run(
        proc_func_expected_to_seg_fault,
        multi_worker_test_base.create_cluster_spec(num_workers=1),
        list_stdout=True)
  self.assertIn('Subprocess worker-0 exited with exit code',
                str(cm.exception))
  list_to_assert = cm.exception.mpr_result.stdout
  self.assertTrue(any('SIGSEGV' in line for line in list_to_assert))
def decorator(self, has_chief, num_workers, runner, **kwargs):
  if _num_total_workers(has_chief, num_workers) == 1 or _running_in_worker:
    # We're in the worker process or the test is for a single worker. In
    # either case we execute the test method directly instead of spawning
    # subprocesses.
    test_method(self, **kwargs)
    return

  # We're in the main process. We spawn subprocesses and run the *test* on
  # each of them. Note that we're not directly executing test_method passed
  # to _multi_worker_test, because we need setUp()/tearDown() to be called
  # and all the decorations on the test method. The conceptual call stack is:
  #   [main process]test.main()
  #     [main process]test_runner.run(test)
  #       [main process]wrapper by combinations.generate()
  #         [main process]_multi_worker_test.decorator()
  #           # A sub process goes through the same code path as the main
  #           # process.
  #           [sub process]_test_runner()
  #             [sub process]test_runner.run(test)
  #               [sub process]wrapper by combinations.generate()
  #                 [sub process]_multi_worker_test.decorator()
  #                   # _running_in_worker is True
  #                   [sub process]test_method()
  test_id = self.id()
  if runner:
    results = runner.run(_test_runner, args=(test_id,))
  else:
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        has_chief=has_chief,
        num_workers=num_workers,
        num_ps=0,
        has_eval=False)
    results = multi_process_runner.run(
        _test_runner, cluster_spec, args=(test_id,)).return_value

  skip_reason = None
  for result in results:
    if result.status == "failure":
      # We can't tell which worker the return value comes from, so we fail
      # on the first error.
      self.fail(result.message)
      break
    elif result.status == "skipped":
      # Record the skip reason, but do not actually skip the test in case
      # some processes fail instead.
      skip_reason = result.message
  if skip_reason is not None:
    self.skipTest(skip_reason)
def testSimpleModelIndependentWorkerSync(self, strategy_cls):
  num_workers = 2
  num_epoch = 2

  cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
  self._barrier = dc._Barrier(2)

  # The verification callback will be shared by multiple threads.
  verification_callback = MultiWorkerVerificationCallback(
      num_epoch=num_epoch, num_worker=num_workers)

  def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
    """Simulates an Independent Worker inside of a thread."""
    with test.mock.patch.object(dc, '_run_std_server',
                                self._make_mock_run_std_server()):
      strategy = strategy_cls()
      verification_callback.is_between_graph = \
          strategy.extended.experimental_between_graph
      batch_size = 64
      steps = 2
      train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)

      with strategy.scope():
        model = _get_model((28, 28, 1))
        orig_loss, _ = model.evaluate(train_ds, steps=steps)
        callbacks_for_fit = nest.flatten(
            kwargs.get('verification_callback', []))
        history = model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=callbacks_for_fit)
        self.assertIsInstance(history, keras.callbacks.History)
        trained_loss, _ = model.evaluate(train_ds, steps=steps)
        self.assertLess(trained_loss, orig_loss)

  threads = self.run_multiple_tasks_in_threads(
      _independent_worker_fn,
      cluster_spec,
      verification_callback=verification_callback)

  threads_to_join = []
  strategy = strategy_cls()
  if strategy.extended.experimental_between_graph:
    for ts in threads.values():
      threads_to_join.extend(ts)
  else:
    threads_to_join = [threads['worker'][0]]
  self.join_independent_workers(threads_to_join)
  verification_callback.verify(self)
def test_timeout_none(self):
  if multi_process_runner.is_oss():
    self.skipTest('Intentionally skipping longer test in OSS.')

  def fn():
    time.sleep(250)
    raise ValueError('Worker 0 errored')

  mpr = multi_process_runner.MultiProcessRunner(
      fn, multi_worker_test_base.create_cluster_spec(num_workers=1))

  mpr.start()
  with self.assertRaisesRegex(ValueError, 'Worker 0 errored'):
    mpr.join(timeout=None)
def testCanonicalizeWithoutDefaultDeviceCollectiveEnabled(self):
  cluster_spec = server_lib.ClusterSpec(
      multi_worker_test_base.create_cluster_spec(
          has_chief=False, num_workers=1, num_ps=0, has_eval=False))
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_spec.as_cluster_def(),
      job_name="worker",
      task_index=0,
      protocol="grpc",
      port=0)
  context.context().enable_collective_ops(server_def)
  self.assertEqual(
      device_util.canonicalize("/cpu:0"),
      "/job:worker/replica:0/task:0/device:CPU:0")
def test_stdout_captured(self):

  def simple_print_func():
    print('This is something printed.')
    return 'This is returned data.'

  returned_data, std_stream_data = multi_process_runner.run(
      simple_print_func,
      multi_worker_test_base.create_cluster_spec(num_workers=2),
      capture_std_stream=True)
  num_string_std_stream = len(
      [d for d in std_stream_data if d == 'This is something printed.'])
  num_string_returned_data = len(
      [d for d in returned_data if d == 'This is returned data.'])
  self.assertEqual(num_string_std_stream, 2)
  self.assertEqual(num_string_returned_data, 2)
def test_process_that_exits(self):

  def func_to_exit_in_10_sec():
    time.sleep(5)
    mpr._add_return_data('foo')
    time.sleep(20)
    mpr._add_return_data('bar')

  mpr = multi_process_runner.MultiProcessRunner(
      func_to_exit_in_10_sec,
      multi_worker_test_base.create_cluster_spec(num_workers=1),
      max_run_time=10)

  mpr.start()
  returned_data, _ = mpr.join()
  self.assertLen(returned_data, 1)
def test_process_that_exits(self):

  def func_to_exit_in_25_sec():
    logging.error('foo')
    time.sleep(100)
    logging.error('bar')

  mpr = multi_process_runner.MultiProcessRunner(
      func_to_exit_in_25_sec,
      multi_worker_test_base.create_cluster_spec(num_workers=1),
      list_stdout=True,
      max_run_time=25)

  mpr.start()
  stdout = mpr.join().stdout
  self.assertLen([msg for msg in stdout if 'foo' in msg], 1)
  self.assertLen([msg for msg in stdout if 'bar' in msg], 0)
def test_stdout_captured(self):

  def simple_print_func():
    print('This is something printed.', flush=True)
    return 'This is returned data.'

  mpr_result = multi_process_runner.run(
      simple_print_func,
      multi_worker_test_base.create_cluster_spec(num_workers=2),
      list_stdout=True)
  std_stream_results = mpr_result.stdout
  return_value = mpr_result.return_value
  self.assertIn('[worker-0]: This is something printed.\n',
                std_stream_results)
  self.assertIn('[worker-1]: This is something printed.\n',
                std_stream_results)
  self.assertIn('This is returned data.', return_value)
def test_exit_code_is_reported_by_chief_subprocess(self):

  def proc_func_expected_to_exit_with_20():
    if multi_worker_test_base.get_task_type() == 'worker':
      time.sleep(10000)
    sys.exit(20)

  mpr = multi_process_runner.MultiProcessRunner(
      proc_func_expected_to_exit_with_20,
      multi_worker_test_base.create_cluster_spec(
          has_chief=True, num_workers=1))
  mpr.start()

  with self.assertRaisesRegex(
      multi_process_runner.UnexpectedSubprocessExitError,
      'Subprocess chief-0 exited with exit code 20'):
    mpr.join()
def test_auto_restart(self):

  def proc_func(counter):
    counter.value += 1
    if counter.value == 1:
      raise ValueError

  manager = multi_process_runner.manager()
  counter = manager.Value(int, 0)
  mpr = multi_process_runner.MultiProcessRunner(
      proc_func,
      multi_worker_test_base.create_cluster_spec(num_workers=1),
      args=(counter,),
      auto_restart=True)
  mpr.start()
  mpr.join()
  self.assertEqual(counter.value, 2)
def test_template(self, strategy_cls):
  num_workers = 2
  num_epoch = 2

  cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
  self._barrier = dc._Barrier(2)

  def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
    """Simulates an Independent Worker inside of a thread."""
    with test.mock.patch.object(dc, '_run_std_server',
                                self._make_mock_run_std_server()):
      strategy = get_strategy_object(strategy_cls)
      batch_size = 64
      steps = 2
      train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)

      with strategy.scope():
        model = _get_model((28, 28, 1))

      custom_callable(
          model,
          self,
          train_ds,
          num_epoch,
          steps,
          strategy,
          saving_filepath=kwargs['saving_filepath'])

  # Pass saving_filepath from the parent thread to ensure every worker has the
  # same filepath to save.
  saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint.h5')
  threads = self.run_multiple_tasks_in_threads(
      _independent_worker_fn, cluster_spec, saving_filepath=saving_filepath)
  if os.path.exists(saving_filepath):
    os.remove(saving_filepath)

  threads_to_join = []
  strategy = get_strategy_object(strategy_cls)
  if strategy.extended.experimental_between_graph:
    for ts in threads.values():
      threads_to_join.extend(ts)
  else:
    threads_to_join = [threads['worker'][0]]
  self.join_independent_workers(threads_to_join)
def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls,
                                                   eval_distribute_cls):
  train_distribute = self._get_strategy_object(train_distribute_cls)

  if eval_distribute_cls:
    eval_distribute = self._get_strategy_object(eval_distribute_cls)
  else:
    eval_distribute = None

  cluster_spec = multi_worker_test_base.create_cluster_spec(
      num_workers=3, num_ps=0, has_eval=True)
  # 3 workers and 1 evaluator.
  self._barrier = dc._Barrier(4)
  threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
                                               cluster_spec, train_distribute,
                                               eval_distribute)
  self.join_independent_workers([threads[WORKER][0], threads[EVALUATOR][0]])

  estimator = self._get_estimator(train_distribute, eval_distribute)
  self._inspect_train_and_eval_events(estimator)
def testFaultToleranceInSyncStrategy(self, strategy_cls, file_format,
                                     preemption_callback):
  """Test fault-tolerance with multi-threading using sync dist-strat.

  This test simulates multi-worker training that is interrupted by a
  preemption, by having two threads, which represent a chief and a non-chief
  worker respectively, where the non-chief raises an error in the middle of
  the training loop. Upon catching the error, a new thread with a new cluster
  spec is created to simulate the recovered non-chief worker. Meanwhile, the
  chief worker cannot proceed and hangs since the non-chief worker has
  crashed. To simulate a restart of the chief, a new thread has been prepared
  to run to take over chief with the help of a condition variable. It is
  expected that after the restart of both chief and non-chief workers, the
  training continues from the epoch they previously failed at. The test
  concludes by verifying that the preemption-interrupted training can finish
  with the same loss and accuracy as if the preemption had not occurred.

  Args:
    strategy_cls: The strategy class to use.
    file_format: `h5` or `tf`.
    preemption_callback: The callback to simulate preemption.
  """

  def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
    with test.mock.patch.object(dc, '_run_std_server',
                                self._make_mock_run_std_server()):
      # Condition variable that blocks the thread that represents the
      # restarted chief.
      cv = kwargs.get('cv', None)
      # `before_restart` is True for the threads that represent the original
      # chief and non-chief worker, and False for threads that represent the
      # restarted chief and non-chief workers.
      before_restart = kwargs['before_restart']
      if kwargs['new_chief']:
        # `new_chief` is only True for the restarted chief thread. It waits
        # until non-chief is preempted and restarted to simulate the
        # causality where chief's restart results from non-chief's failure.
        cv.acquire()
        while not hasattr(cv, 'preempted'):
          cv.wait()
        cv.release()

      # Model building under strategy scope. Following is the code we expect
      # the user runs on every worker.
      strategy = get_strategy_object(strategy_cls)
      batch_size = 64
      steps = 3
      train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
      with strategy.scope():
        model = _get_model((28, 28, 1))

      # Function to start a new thread. This will be called twice in the
      # following code: one represents the restart of the non-chief, and one
      # represents the restart of the chief as a result of the restart of
      # the non-chief (so the training can continue in sync).
      def start_new_thread(new_chief=False):
        new_thread_tf_config = json.loads(os.environ['TF_CONFIG'])
        new_thread_tf_config['cluster']['worker'] = kwargs['reserved_ports']
        return self._run_task_in_thread(
            task_fn=_independent_worker_fn,
            cluster_spec=None,
            task_type=None,
            task_id=None,
            tf_config=new_thread_tf_config,
            before_restart=False,
            cv=cv,
            new_chief=new_chief)

      if test_base.is_chief() and before_restart:
        # Chief to start a new thread (that will be blocked by a condition
        # variable until the non-chief's new thread is started). The thread
        # for (recovered) chief is started before entering `fit()` because
        # the original chief thread will eventually hang and be ignored.
        start_new_thread(new_chief=True)

      try:

        class CkptSavedEpochAssertingCallback(callbacks.Callback):

          def __init__(self, test_obj):
            super(CkptSavedEpochAssertingCallback, self).__init__()
            self.test_obj = test_obj

          def on_epoch_begin(self, epoch, logs=None):
            # `_ckpt_saved_epoch` attribute is set at the end of every epoch.
            self.test_obj.assertEqual(self.model._ckpt_saved_epoch is None,
                                      epoch == 0)

        callbacks_list = [
            callbacks.ModelCheckpoint(
                filepath=saving_filepath,
                save_weights_only=True,
                load_weights_on_restart=True),
            CkptSavedEpochAssertingCallback(self)
        ]
        if before_restart:
          callbacks_list.append(preemption_callback())

        self.assertIsNone(model._ckpt_saved_epoch)
        history = model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=callbacks_list)
        self.assertIsNone(model._ckpt_saved_epoch)

        # `history` of the training result is collected to be compared
        # against each other. It is expected that the training results (loss
        # and accuracy) are the same with or without preemption.
        self._histories.append(history.history)

      except RuntimeError:  # pylint: disable=g-assert-in-except
        self.assertTrue(before_restart)
        # Reset the barrier so the new threads simulating recovery can
        # continue.
        self._barrier._counter = 0
        self._barrier._flag = False
        # Now that the non-chief has been preempted, it notifies the thread
        # that simulates the restarted chief to start so they can be back in
        # sync.
        cv.acquire()
        cv.preempted = True
        cv.notify()
        cv.release()
        # At this point we should discard the original non-chief thread, and
        # start the new thread that simulates the restarted non-chief, hence
        # joining the thread and return.
        self.join_independent_workers([start_new_thread()])
        return

      # Successful end of a `fit()` call.
      self._successful_thread_ends += 1
      self.assertFalse(before_restart)

  # Common parameters.
  num_workers = 2
  num_epoch = 3
  # History list storing the results for preemption and no preemption cases.
  self._histories = []
  # Pass `saving_filepath` from the parent thread to ensure every worker has
  # the same filepath to save.
  saving_filepath = os.path.join(self.get_temp_dir(),
                                 'checkpoint.' + file_format)
  strategy = get_strategy_object(strategy_cls)

  # Case 1: Training for `num_epoch` epochs without preemptions.
  cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
  self._barrier = dc._Barrier(2)
  self._successful_thread_ends = 0
  threads = self.run_multiple_tasks_in_threads(
      _independent_worker_fn,
      cluster_spec,
      saving_filepath=saving_filepath,
      before_restart=False,
      new_chief=False)
  if os.path.exists(saving_filepath):
    os.remove(saving_filepath)
  threads_to_join = []
  if strategy.extended.experimental_between_graph:
    for ts in threads.values():
      threads_to_join.extend(ts)
  else:
    threads_to_join = [threads['worker'][0]]
  self.join_independent_workers(threads_to_join)
  self.assertEqual(self._successful_thread_ends, 2)

  # Case 2: Training for `num_epoch` epochs with preemptions.
  # The preemption is simulated at both epoch boundary and batch boundary.
  cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
  cv = threading.Condition()
  self._barrier = dc._Barrier(2)
  # Ports reserved for new threads simulating recovery.
  reserved_ports = [
      'localhost:%s' % test_base.pick_unused_port()
      for _ in range(num_workers)
  ]
  self._successful_thread_ends = 0
  threads = self.run_multiple_tasks_in_threads(
      _independent_worker_fn,
      cluster_spec,
      saving_filepath=saving_filepath,
      reserved_ports=reserved_ports,
      before_restart=True,
      cv=cv,
      new_chief=False)
  if os.path.exists(saving_filepath):
    os.remove(saving_filepath)
  threads_to_join = []
  if strategy.extended.experimental_between_graph:
    # Only join the non-chief thread since the first thread for chief will
    # eventually hang and be ignored.
    threads_to_join = [threads['worker'][1]]
  else:
    threads_to_join = [threads['worker'][0]]
  self.join_independent_workers(threads_to_join)
  self.assertEqual(self._successful_thread_ends, 2)

  def assert_all_elements_are_identical(list_to_check):
    first_item = list_to_check[0]
    for item in list_to_check[1:]:
      self.assertAllClose(first_item, item, rtol=1e-5, atol=1e-5)

  # Important: the results from preemption interrupted and non-interrupted
  # cases should give the same final results.
  assert_all_elements_are_identical(
      [history['acc'][-1] for history in self._histories])
  assert_all_elements_are_identical(
      [history['loss'][-1] for history in self._histories])
  # The length of `self._histories` would be num_workers * num_runs
  # (2 * 2 = 4).
  self.assertLen(self._histories, 4)
def run_optimizer_comparison_with_simple_bias_model(
    self, strategy_cls, optimizer_class_1, optimizer_class_2):

  def get_input_datasets():
    # Simple training input.
    train_input = [[1]] * 16
    train_label = [[0]] * 16
    ds = dataset_ops.Dataset.from_tensor_slices((train_input, train_label))
    ds = maybe_shard_dataset(ds)
    # TODO(rchao): Investigate to figure out the reason for having 8 workers
    # instead of 2 as expected.
    return ds.batch(8, drop_remainder=True)

  def get_simple_bias_model():

    class Bias(base_layer.Layer):

      def build(self, input_shape):
        self.bias = self.add_variable('bias', (1,), initializer='zeros')

      def call(self, inputs):
        return inputs + self.bias

    model = sequential.Sequential()
    model.add(Bias(input_shape=(1,)))
    return model

  self._lock = threading.Lock()
  cluster_spec = test_base.create_cluster_spec(num_workers=2)
  self._barrier = dc._Barrier(2)

  def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
    """Simulates an Independent Worker inside a thread."""
    # TODO(rchao): Refactor to abstract the common boilerplate out.
    with test.mock.patch.object(dc, '_run_std_server',
                                self._make_mock_run_std_server()):
      model = get_simple_bias_model()
      initial_weights = model.get_weights()

      def _get_model_results(optimizer, initial_weights):
        # Clear Keras session to reset device assignment.
        keras.backend._SESSION.session = None
        strategy = strategy_cls()

        with strategy.scope():
          train_ds = get_input_datasets()
          model = get_simple_bias_model()
          model.set_weights(initial_weights)
          model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

        return {
            'trained_loss_and_accuracy':
                model.fit(x=train_ds, epochs=20).history,
            'trained_weights':
                model.get_weights(),
        }

      results1 = _get_model_results(optimizer_class_1(0.01), initial_weights)
      results2 = _get_model_results(optimizer_class_2(0.01), initial_weights)

      for key in results1:
        self.assertAllClose(
            results1[key],
            results2[key],
            atol=1e-5,
            rtol=1e-5,
            msg='Fail to assert {}'.format(key))

  threads = self.run_multiple_tasks_in_threads(_independent_worker_fn,
                                               cluster_spec)
  threads_to_join = []
  strategy = strategy_cls()
  if strategy.extended.experimental_between_graph:
    for ts in threads.values():
      threads_to_join.extend(ts)
  else:
    threads_to_join = [threads['worker'][0]]
  self.join_independent_workers(threads_to_join)