def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  master = tf_config.master()
  if tf_config.rpc_layer:
    # Strip off the rpc_layer suffix.
    master = master[len("%s://" % tf_config.rpc_layer):]
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      master=master,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer or "grpc",
  )
  # Always create the strategy in eager mode so that it starts the server and
  # configures the eager context. The eager context can no longer be
  # configured after initialization.
  with context.eager_mode():
    strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
        cluster_resolver=resolver)
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy

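# A minimal sketch (not part of the original snippet) of the TF_CONFIG
# environment variable that TFConfigClusterResolver above parses in each
# worker process. The addresses and port numbers are placeholders for
# illustration only; in the tests this variable is set up by the test harness.
import json
import os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["localhost:12345", "localhost:23456"]},
    "task": {"type": "worker", "index": 0},  # The index differs per worker.
})
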
def worker_fn():
  enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
  # There may be some delay before the servers start up; the health check
  # should eventually succeed.
  while True:
    try:
      for task in [
          "/job:worker/replica:0/task:0",
          "/job:worker/replica:0/task:1",
      ]:
        context.context().check_collective_ops_peer_health(task)
    except errors.UnavailableError:
      continue
    break
  multi_process_runner.barrier().wait()

def worker_step_fn(worker_id):
  strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
  # Make sure the processes are in sync after updating the cluster.
  multi_process_runner.barrier().wait()

  @def_function.function
  def run_reduce():
    with ops.device(self._local_device):
      t_in = array_ops.ones(tensor_shape) * worker_id
      return strategy.reduce(reduce_util.ReduceOp.MEAN, t_in, axis=None)

  t_out = run_reduce()
  # Element values from the workers are 0, 1, ..., (NUM_WORKERS - 1).
  expected_mean = (NUM_WORKERS - 1) / 2
  expected_out = np.ones(tensor_shape) * expected_mean
  self.assertAllClose(t_out, expected_out)

def worker_step_fn(worker_id, num_dims):
  strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
  # Make sure the processes are in sync after updating the cluster.
  multi_process_runner.barrier().wait()
  tensor_shape = [2] * num_dims

  def variable_fn():
    with ops.device(self._local_device):
      # The initial value will be broadcast from worker 0 to the others.
      initial_value = (array_ops.ones(tensor_shape) if worker_id == 0
                       else array_ops.zeros(tensor_shape))
      var = variable_scope.get_variable(name='x', initializer=initial_value)
      return array_ops.identity(var)

  t_out = strategy.extended.call_for_each_replica(variable_fn)
  expected_out = np.ones(tensor_shape)
  self.assertAllClose(t_out, expected_out)

def proc_tensorboard_works_with_same_file_path(test_obj, saving_filepath):
  model, _, train_ds, steps = _model_setup(test_obj, file_format='')
  num_epoch = 2

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(file_io.file_exists(saving_filepath))
  multi_process_runner.barrier().wait()

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

  multi_process_runner.barrier().wait()

  test_obj.assertTrue(file_io.list_directory(saving_filepath))

def proc_model_checkpoint_works_with_same_file_path(test_obj, saving_filepath):
  model, _, train_ds, steps = _model_setup(test_obj, file_format='')
  num_epoch = 4

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(file_io.file_exists(saving_filepath))
  bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup')

  try:
    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        callbacks=[
            callbacks.ModelCheckpoint(filepath=saving_filepath),
            callbacks.BackupAndRestore(backup_dir=bar_dir),
            InterruptingCallback()
        ])
  except RuntimeError as e:
    if 'Interrupting!' not in str(e):
      raise

  multi_process_runner.barrier().wait()

  backup_filepath = os.path.join(bar_dir, 'checkpoint')
  test_obj.assertTrue(file_io.file_exists(backup_filepath))
  test_obj.assertTrue(file_io.file_exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[
          callbacks.ModelCheckpoint(filepath=saving_filepath),
          callbacks.BackupAndRestore(backup_dir=bar_dir),
          AssertCallback()
      ])

  multi_process_runner.barrier().wait()

  test_obj.assertFalse(file_io.file_exists(backup_filepath))
  test_obj.assertTrue(file_io.file_exists(saving_filepath))

def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer,
  )
  strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
      cluster_resolver=resolver)
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy

def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  master = tf_config.master()
  if tf_config.rpc_layer:
    # Strip off the rpc_layer suffix.
    master = master[len("%s://" % tf_config.rpc_layer):]
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      master=master,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer or "grpc",
  )
  # Disable the health check. We don't have a reliable way to shut down the
  # strategy (and thus the health check) at the end of a test. Enabling the
  # health check causes some flakiness since we re-create part of the server
  # when creating a strategy, and our tests are capable of handling failures.
  CollectiveAllReduceExtended._enable_check_health = False  # pylint: disable=protected-access
  # Always create the strategy in eager mode so that it starts the server and
  # configures the eager context. The eager context can no longer be
  # configured after initialization.
  with context.eager_mode():
    strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
        cluster_resolver=resolver)
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy

def worker_fn(attempts):
  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
  task_id, attempt = get_attempt(strategy, attempts)

  if attempt == 2 and task_id == 1:
    multi_process_runner.barrier().wait()

  @tf.function
  def replica_fn():
    ctx = tf.distribute.get_replica_context()
    # Use a large tensor because a small tensor may hang regardless when the
    # worker recovers.
    value = tf.ones((64, 64))
    ctx.all_reduce(tf.distribute.ReduceOp.SUM, [value, value])

  strategy.run(replica_fn)

  # worker-1 dies here.
  if attempt == 1 and task_id == 1:
    quick_exit(1)

  # Make worker-0 wait for worker-1 to restart before entering the next
  # collective, to simulate a quick recovery of worker-1.
  if attempt == 1 and task_id == 0:
    multi_process_runner.barrier().wait()

  strategy.run(replica_fn)

def proc_func_with_barrier():
  return multi_process_runner.barrier()

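# A minimal sketch (not from the original tests) of how a wrapper like the one
# above is typically driven: multi_process_runner.barrier() only works inside
# processes launched by the runner (calling it in the main process raises
# ValueError, as the test below shows), so the function is handed to
# multi_process_runner.run together with a cluster spec. The use of
# multi_worker_test_base.create_cluster_spec is an assumption about the
# surrounding test utilities.
cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
multi_process_runner.run(proc_func_with_barrier, cluster_spec=cluster_spec)
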
def test_barrier_called_in_main_process(self):
  with self.assertRaises(ValueError):
    multi_process_runner.barrier()

def proc_func(model_path, checkpoint_dir):
  global_batch_size = per_worker_batch_size * num_workers
  strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
  with strategy.scope():
    multi_worker_model = build_and_compile_cnn_model()

  callbacks = [
      keras.callbacks.ModelCheckpoint(
          filepath=os.path.join(self.get_temp_dir(), 'checkpoint'))
  ]

  multi_worker_dataset = mnist_dataset(global_batch_size)
  if shard_policy:
    options = dataset_ops.Options()
    options.experimental_distribute.auto_shard_policy = shard_policy
    multi_worker_dataset = multi_worker_dataset.with_options(options)

  multi_worker_model.fit(
      multi_worker_dataset, epochs=2, steps_per_epoch=20, callbacks=callbacks)

  def _is_chief(task_type, task_id):
    return task_type is None or task_type == 'chief' or (
        task_type == 'worker' and task_id == 0)

  def _get_temp_dir(dirpath, task_id):
    base_dirpath = 'workertemp_' + str(task_id)
    temp_dir = os.path.join(dirpath, base_dirpath)
    file_io.recursive_create_dir_v2(temp_dir)
    return temp_dir

  def write_filepath(filepath, task_type, task_id):
    dirpath = os.path.dirname(filepath)
    base = os.path.basename(filepath)
    if not _is_chief(task_type, task_id):
      dirpath = _get_temp_dir(dirpath, task_id)
    return os.path.join(dirpath, base)

  task_type, task_id = (strategy.cluster_resolver.task_type,
                        strategy.cluster_resolver.task_id)
  write_model_path = write_filepath(model_path, task_type, task_id)

  multi_worker_model.save(write_model_path)
  if not _is_chief(task_type, task_id):
    file_io.delete_recursively_v2(os.path.dirname(write_model_path))

  # Make sure chief finishes saving before non-chief's assertions.
  multi_process_runner.barrier().wait()

  if not file_io.file_exists_v2(model_path):
    raise RuntimeError()
  if file_io.file_exists_v2(write_model_path) != _is_chief(task_type, task_id):
    raise RuntimeError()

  loaded_model = keras.saving.save.load_model(model_path)
  loaded_model.fit(multi_worker_dataset, epochs=2, steps_per_epoch=20)

  checkpoint = tracking_util.Checkpoint(model=multi_worker_model)
  write_checkpoint_dir = write_filepath(checkpoint_dir, task_type, task_id)
  checkpoint_manager = checkpoint_management.CheckpointManager(
      checkpoint, directory=write_checkpoint_dir, max_to_keep=1)

  checkpoint_manager.save()
  if not _is_chief(task_type, task_id):
    file_io.delete_recursively_v2(write_checkpoint_dir)

  # Make sure chief finishes saving before non-chief's assertions.
  multi_process_runner.barrier().wait()

  if not file_io.file_exists_v2(checkpoint_dir):
    raise RuntimeError()
  if file_io.file_exists_v2(write_checkpoint_dir) != _is_chief(
      task_type, task_id):
    raise RuntimeError()

  latest_checkpoint = checkpoint_management.latest_checkpoint(checkpoint_dir)
  checkpoint.restore(latest_checkpoint)
  multi_worker_model.fit(multi_worker_dataset, epochs=2, steps_per_epoch=20)

  logging.info('testMultiWorkerTutorial successfully ends')

def fn_with_barrier():
  return multi_process_runner.barrier()