def testCountPs(self):
  cluster_spec = {
      "chief": ["127.0.0.1:1234"],
      "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
  }
  # A "ps" job shouldn't call this method.
  with self.assertRaisesRegex(ValueError, "Unexpected `task_type` 'ps'"):
    multi_worker_util.worker_count(cluster_spec, task_type="ps")
def testCountWorker(self):
  cluster_spec = {
      "chief": ["127.0.0.1:1234"],
      "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
      "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
  }
  self.assertEqual(
      multi_worker_util.worker_count(cluster_spec, task_type="chief"), 3)
  self.assertEqual(
      multi_worker_util.worker_count(cluster_spec, task_type="worker"), 3)
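As the test above suggests, `worker_count` treats the "chief" and "worker" pools as one group of training workers, while "ps" tasks are not counted. A minimal sketch of that counting rule (an illustration of the expected result, not the library implementation):

    # Hypothetical illustration of the counting rule exercised by the test above.
    cluster_spec = {
        "chief": ["127.0.0.1:1234"],
        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
    }
    expected = len(cluster_spec["chief"]) + len(cluster_spec["worker"])  # 1 + 2 = 3
    assert expected == 3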
def _initialize_multi_worker(self, cluster_resolver):
  """Initializes the object for multi-worker training."""
  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id` in the `cluster_resolver`.")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  if num_gpus:
    local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                          for i in range(num_gpus))
  else:
    local_devices = (self._worker_device,)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r",
      cluster_spec.as_dict(), task_type, task_id, self._num_workers,
      local_devices)
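For a concrete picture of the device layout this method builds, assume a hypothetical task with task_type="worker", task_id=1 and two GPUs; the strings come out as follows (an illustration that simply mirrors the string formatting above):

    # Hypothetical values, following the formatting used in the method above.
    task_type, task_id, num_gpus = "worker", 1, 2
    worker_device = "/job:%s/task:%d" % (task_type, task_id)  # "/job:worker/task:1"
    local_devices = tuple("%s/device:GPU:%d" % (worker_device, i)
                          for i in range(num_gpus))
    # ("/job:worker/task:1/device:GPU:0", "/job:worker/task:1/device:GPU:1")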
def get_num_workers():
  cluster_resolver = TFConfigClusterResolver()
  cluster_spec = cluster_resolver.cluster_spec().as_dict()
  if cluster_spec:
    task_type = cluster_resolver.task_type
    return int(multi_worker_util.worker_count(cluster_spec, task_type))
  return 1
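`TFConfigClusterResolver` reads the cluster layout from the `TF_CONFIG` environment variable, so `get_num_workers` only returns a value other than 1 when that variable is set. A minimal sketch of how a caller might populate it before invoking the helper (the host addresses are placeholders):

    import json
    import os

    # Hypothetical two-worker cluster; real deployments set TF_CONFIG per task.
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {"worker": ["host1:12345", "host2:12345"]},
        "task": {"type": "worker", "index": 0}
    })
    num_workers = get_num_workers()  # expected to return 2 under this TF_CONFIG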
def _checkpoint_if_preempted(self):
  """Checkpoint if any worker has received a preemption signal.

  This function handles preemption signal reported by any worker in the
  cluster. The current implementation relies on the fact that all workers in
  a MultiWorkerMirroredStrategy training cluster have a step number
  difference maximum of 1.
  - If the signal comes from the worker itself (i.e., where this failure
  handler sits), the worker will notify all peers to checkpoint after they
  finish CURRENT_STEP+1 steps, where CURRENT_STEP is the step this worker has
  just finished. And the worker will wait for all peers to acknowledge that
  they have received its preemption signal and the final-step number before
  the worker proceeds on training the final step.
  - If the signal comes from another member in the cluster but NO final-step
  info is available, proceed on training, because it will be available after
  finishing the next step.
  - If the signal comes from some other member in the cluster, and final-step
  info is available, if the worker has not finished these steps yet, keep
  training; otherwise, checkpoint and exit with a cluster-recognized restart
  code.
  """
  if self._received_sigterm_and_step.is_set():

    run_count_key = context.context().get_config_key_value(_RUN_COUNT_KEY)

    if run_count_key == str(self._run_counter):
      logging.info('Starting checkpoint and exit')

      self._checkpointed_runs.assign(self.total_runs)

      start_time = time.monotonic()
      self._save_checkpoint()
      end_time = time.monotonic()
      logging.info('Checkpoint finished at path %s',
                   self._write_checkpoint_manager.directory)
      logging.info('Checkpoint time: %f', end_time - start_time)

      sys.exit(self._exit_code)

  elif (self._received_own_sigterm.is_set() and
        (context.context().get_config_key_value(_PREEMPTION_KEY) ==
         self._id_in_cluster)):
    logging.info('Termination caught in main thread on preempted worker')

    step_to_save_at = str(self._run_counter + 1)
    context.context().set_config_key_value(_RUN_COUNT_KEY, step_to_save_at)
    logging.info('%s set to %s', _RUN_COUNT_KEY, step_to_save_at)

    n_workers = multi_worker_util.worker_count(
        self._cluster_resolver.cluster_spec(),
        self._cluster_resolver.task_type)
    for i in range(n_workers):
      context.context().get_config_key_value(f'{_ACKNOWLEDGE_KEY}_{i}')
      logging.info('Sigterm acknowledgement from replica %d received', i)
def testCountEvaluator(self):
  cluster_spec = {
      "chief": ["127.0.0.1:1234"],
      "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
      "evaluator": ["127.0.0.1:7566"]
  }
  self.assertEqual(
      multi_worker_util.worker_count(cluster_spec, task_type="evaluator"), 1)
def _initialize_multi_worker(self, cluster_resolver):
  """Initializes the object for multi-worker training."""
  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  num_gpus = cluster_resolver.num_accelerators()
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id` in the `cluster_resolver`.")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
  if num_gpus:
    local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                          for i in range(num_gpus))
  else:
    local_devices = (self._worker_device,)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r",
      cluster_spec.as_dict(), task_type, task_id, self._num_workers,
      local_devices)
def maybe_shard_dataset(dataset):
  """Shard the dataset if running in multi-node environment."""
  cluster_resolver = TFConfigClusterResolver()
  cluster_spec = cluster_resolver.cluster_spec().as_dict()
  if cluster_spec:
    dataset = dataset.shard(
        multi_worker_util.worker_count(cluster_spec,
                                       cluster_resolver.task_type),
        multi_worker_util.id_in_cluster(cluster_spec,
                                        cluster_resolver.task_type,
                                        cluster_resolver.task_id))
  return dataset
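A usage sketch, assuming `tensorflow` is imported as `tf` and the helper above is in scope; in a multi-node run each worker keeps only its own shard:

    import tensorflow as tf

    dataset = tf.data.Dataset.range(1000)
    # With TF_CONFIG set, each worker keeps every num_workers-th element
    # starting at its own id_in_cluster; otherwise the dataset is returned
    # unchanged.
    dataset = maybe_shard_dataset(dataset)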
def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                             task_type, task_id):
  """Initializes the object for multi-worker training."""
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
  if num_gpus_per_worker:
    local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                          for i in range(num_gpus_per_worker))
  else:
    local_devices = (self._worker_device,)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  self._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus_per_worker,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r",
      cluster_spec.as_dict(), task_type, task_id, self._num_workers,
      local_devices)
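`normalize_cluster_spec` accepts the plain dict form used in the tests above and converts it into a `tf.train.ClusterSpec`, which is why the code can later call `cluster_spec.as_dict()`. A small sketch of the equivalent conversion using the public API (the addresses are placeholders):

    import tensorflow as tf

    # The dict form used throughout this section...
    cluster = {"chief": ["127.0.0.1:1234"], "worker": ["127.0.0.1:8964"]}
    # ...is equivalent to a ClusterSpec; normalize_cluster_spec is expected to
    # pass an existing ClusterSpec through unchanged.
    spec = tf.train.ClusterSpec(cluster)
    assert spec.as_dict()["worker"] == ["127.0.0.1:8964"]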
def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                             cluster_spec, task_type, task_id):
  """Initializes the object for multi-worker training."""
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`")
  if task_type not in ["chief", "worker"]:
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  worker_device = "/job:%s/task:%d" % (task_type, task_id)
  if num_gpus_per_worker:
    local_devices = [
        "%s/device:GPU:%d" % (worker_device, i)
        for i in range(num_gpus_per_worker)
    ]
  else:
    local_devices = [worker_device]

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self).__init__(
      container_strategy,
      devices=local_devices,
      cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
          num_workers=self._num_workers,
          num_gpus_per_worker=num_gpus_per_worker,
          collective_keys=self._collective_keys))

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r",
      cluster_spec.as_dict(), task_type, task_id, self._num_workers,
      local_devices)
def batch_and_maybe_shard_dataset(dataset, global_batch_size):
  """Shard the dataset if running in multi-node environment."""
  cluster_resolver = TFConfigClusterResolver()
  cluster_spec = cluster_resolver.cluster_spec().as_dict()
  if cluster_spec:
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    num_workers = int(multi_worker_util.worker_count(cluster_spec, task_type))
    id_in_cluster = int(
        multi_worker_util.id_in_cluster(cluster_spec, task_type, task_id))
    dataset = dataset.shard(num_workers, id_in_cluster)
  return dataset.batch(global_batch_size)
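A usage sketch for the batching variant above; sharding happens before batching, so each worker produces batches of `global_batch_size` from its own shard (the dataset and batch size are placeholders):

    import tensorflow as tf

    dataset = tf.data.Dataset.range(1024)
    dataset = batch_and_maybe_shard_dataset(dataset, global_batch_size=32)
    for batch in dataset.take(1):
      print(batch.shape)  # (32,) on every worker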
def _experimental_distribute_datasets_from_function(self, dataset_fn):
  if self._cluster_spec:
    input_pipeline_id = multi_worker_util.id_in_cluster(
        self._cluster_spec, self._task_type, self._task_id)
    num_input_pipelines = multi_worker_util.worker_count(
        self._cluster_spec, self._task_type)
  else:
    input_pipeline_id = 0
    num_input_pipelines = 1

  input_context = distribute_lib.InputContext(
      num_input_pipelines=num_input_pipelines,
      input_pipeline_id=input_pipeline_id,
      num_replicas_in_sync=self._num_replicas_in_sync)

  return input_lib.get_distributed_datasets_from_function(
      dataset_fn, self._input_workers, [input_context],
      self._container_strategy())
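The `dataset_fn` passed to this method receives the `InputContext` built above. A hedged sketch of what such a function typically looks like (the dataset and global batch size are placeholders):

    import tensorflow as tf

    def dataset_fn(input_context):
      # Derive the per-replica batch size from the global one, then keep only
      # this worker's shard of the data.
      batch_size = input_context.get_per_replica_batch_size(
          global_batch_size=64)
      dataset = tf.data.Dataset.range(1000)
      dataset = dataset.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)
      return dataset.batch(batch_size)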
def _make_input_fn_iterator(
    self,
    input_fn,
    replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
  """Distributes the dataset to each local GPU."""
  if self._cluster_spec:
    input_pipeline_id = multi_worker_util.id_in_cluster(
        self._cluster_spec, self._task_type, self._task_id)
    num_input_pipelines = multi_worker_util.worker_count(
        self._cluster_spec, self._task_type)
  else:
    input_pipeline_id = 0
    num_input_pipelines = 1
  input_context = distribute_lib.InputContext(
      num_input_pipelines=num_input_pipelines,
      input_pipeline_id=input_pipeline_id,
      num_replicas_in_sync=self._num_replicas_in_sync)
  return input_lib.InputFunctionIterator(input_fn, self._input_workers,
                                         [input_context])
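`InputFunctionIterator` invokes `input_fn` per worker with the `InputContext` constructed here. A minimal sketch of a compatible `input_fn` (the dataset itself is a placeholder):

    import tensorflow as tf

    def input_fn(input_context):
      dataset = tf.data.Dataset.range(100).batch(8)
      # Each input pipeline reads a disjoint shard, using the pipeline id and
      # count supplied through the InputContext above.
      return dataset.shard(input_context.num_input_pipelines,
                           input_context.input_pipeline_id)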
def _make_input_fn_iterator(
    self,
    input_fn,
    replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
  """Distributes the dataset to each local GPU."""
  if self._cluster_spec:
    input_pipeline_id = multi_worker_util.id_in_cluster(
        self._cluster_spec, self._task_type, self._task_id)
    num_input_pipelines = multi_worker_util.worker_count(
        self._cluster_spec, self._task_type)
  else:
    input_pipeline_id = 0
    num_input_pipelines = 1
  input_context = distribute_lib.InputContext(
      num_input_pipelines=num_input_pipelines,
      input_pipeline_id=input_pipeline_id,
      num_replicas_in_sync=self.num_replicas_in_sync)
  return values.PerReplicaDataset(
      self._call_dataset_fn(input_fn, input_context), self._compute_devices,
      True)
def _checkpoint_if_preempted(self):
  """Checkpoint if any worker has received a preemption signal.

  This function handles preemption signal reported by any worker in the
  cluster. The current implementation relies on the fact that all workers in
  a MultiWorkerMirroredStrategy training cluster have a step number
  difference maximum of 1.
  - If the signal comes from the worker itself (i.e., where this failure
  handler sits), the worker will notify all peers to checkpoint after they
  finish CURRENT_STEP+1 steps, where CURRENT_STEP is the step this worker has
  just finished. And the worker will wait for all peers to acknowledge that
  they have received its preemption signal and the final-step number before
  the worker proceeds on training the final step.
  - If the signal comes from another member in the cluster but NO final-step
  info is available, proceed on training, because it will be available after
  finishing the next step.
  - If the signal comes from some other member in the cluster, and final-step
  info is available, if the worker has not finished these steps yet, keep
  training; otherwise, checkpoint and exit with a cluster-recognized restart
  code.
  """
  if self._received_sigterm_and_step.is_set():

    run_count_key = context.context().get_config_key_value(_RUN_COUNT_KEY)

    if run_count_key == str(self._run_counter):
      self._save_checkpoint_and_exit()

  elif self._received_own_sigterm.is_set():

    step_to_save_at = str(self._run_counter + 1)

    try:
      context.context().set_config_key_value(_RUN_COUNT_KEY, step_to_save_at)
      logging.info('Termination caught in main thread on preempted worker')
      logging.info('%s set to %s', _RUN_COUNT_KEY, step_to_save_at)

      n_workers = multi_worker_util.worker_count(
          self._cluster_resolver.cluster_spec(),
          self._cluster_resolver.task_type)
      for i in range(n_workers):
        context.context().get_config_key_value(f'{_ACKNOWLEDGE_KEY}_{i}')
        logging.info('Sigterm acknowledgement from replica %d received', i)

    # This is to handle the case that some other worker receives termination
    # notice as well, and it has made a step key available right before this
    # worker attempts to set it. In this case, it incurs a config key
    # AlreadyExistsError.
    # With MultiWorkerMirroredStrategy, every step contains collective ops
    # (all-reduce, all-gather, etc.) that require the participation of all
    # workers, which forms a synchronization point. Thus the max difference
    # in the training progresses made by the workers is less than one
    # complete step (e.g., one worker is finishing up the post-collective ops
    # part of step N, and another is doing the pre-collective ops part of
    # step N+1.)
    #
    # We can safely ignore this AlreadyExistsError. Say both worker-a and
    # worker-b have received preemption notice, and worker-b encounters an
    # AlreadyExistsError here because worker-a has already uploaded a value
    # as the last step to finish before saving a checkpoint. Assume worker-b
    # has finished step N and attempts to set the last step as N+1. If the
    # training progress made by worker-a is ahead of that of worker-b, then
    # worker-a must be running step N+1 due to the mechanism mentioned above
    # and has set the last step as step N+1. If worker-a is behind worker-b,
    # then it cannot possibly have set the last step as step N (not to
    # mention a step number less than N), because worker-a would only do so
    # before executing step N. Consider that when a worker resolves and sets
    # the last step to finish, it waits until receiving acknowledgment from
    # all workers before continuing to train the next step. And thus, in this
    # case, worker-b would never have finished step N, which requires the
    # participation of worker-a. In either case, we can safely ignore the
    # error and revisit the checkpoint-and-exit option after finishing the
    # current step, step N+1. At that time, it will be re-directed to the
    # earlier branch.
    except errors.AlreadyExistsError:
      logging.info(
          'Member %s has received termination notice. But some other '
          'worker has received it as well! Leaving it to them to decide '
          'when to checkpoint. ', self._id_in_cluster)
      return
def _initialize_multi_worker(self, cluster_resolver):
  """Initializes the object for multi-worker training."""
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`.")
  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker`, `chief` or `evaluator` tasks can be found "
                     "in `cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  if (ops.executing_eagerly_outside_functions() and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    context.context().configure_collective_ops(
        collective_leader=multi_worker_util.collective_leader(
            cluster_spec, task_type, task_id),
        scoped_allocator_enabled_ops=("CollectiveReduce",),
        use_nccl_communication=(
            self._communication ==
            cross_device_ops_lib.CollectiveCommunication.NCCL),
        device_filters=("/job:%s/task:%d" % (task_type, task_id),))
    self._collective_ops_configured = True

  # Starting a std server in eager mode and in independent worker mode.
  if (context.executing_eagerly() and
      not getattr(self, "_std_server_started", False) and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    # Checking _local_or_standalone_client_mode as well because we should not
    # create the std server in standalone client mode.
    config_proto = config_pb2.ConfigProto()
    config_proto = self._update_config_proto(config_proto)
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_spec.as_cluster_def(),
        default_session_config=config_proto,
        job_name=task_type,
        task_index=task_id,
        protocol=cluster_resolver.rpc_layer or "grpc")
    context.context().enable_collective_ops(server_def)
    self._std_server_started = True
    # The `ensure_initialized` is needed before calling
    # `context.context().devices()`.
    context.context().ensure_initialized()
    logging.info(
        "Enabled multi-worker collective ops with available devices: %r",
        context.context().devices())

  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  if isinstance(cluster_resolver, TFConfigClusterResolver):
    num_gpus = context.num_gpus()
  else:
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

  if num_gpus:
    local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                          for i in range(num_gpus))
  else:
    local_devices = (self._worker_device,)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  # Save the num_gpus_per_worker and rpc_layer for configure method.
  self._num_gpus_per_worker = num_gpus
  self._rpc_layer = cluster_resolver.rpc_layer
  self._warn_nccl_no_gpu()

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
      "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
      "communication = %s", cluster_spec.as_dict(), task_type, task_id,
      self._num_workers, local_devices, self._communication)
def _initialize_multi_worker(self, cluster_resolver):
  """Initializes the object for multi-worker training."""
  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  if isinstance(cluster_resolver, TFConfigClusterResolver):
    num_gpus = context.num_gpus()
  else:
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`.")

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker`, `chief` or `evaluator` tasks can be found "
                     "in `cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
  if num_gpus:
    local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                          for i in range(num_gpus))
  else:
    local_devices = (self._worker_device,)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  # Save the num_gpus_per_worker and rpc_layer for configure method.
  self._num_gpus_per_worker = num_gpus
  self._rpc_layer = cluster_resolver.rpc_layer

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
      "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
      "communication = %s", cluster_spec.as_dict(), task_type, task_id,
      self._num_workers, local_devices, self._communication)

  if (context.executing_eagerly() and
      not getattr(self, "_std_server_started", False) and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    # Checking _local_or_standalone_client_mode as well because we should not
    # create the std server in standalone client mode.
    config_proto = config_pb2.ConfigProto()
    config_proto = self._update_config_proto(config_proto)
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_spec.as_cluster_def(),
        default_session_config=config_proto,
        job_name=task_type,
        task_index=task_id,
        protocol=cluster_resolver.rpc_layer or "grpc")
    context.context().enable_collective_ops(server_def)
    self._std_server_started = True
    logging.info(
        "Enabled multi-worker collective ops with available devices: %r",
        context.context().devices())
def testTaskTypeNotFound(self):
  cluster_spec = {}
  with self.assertRaisesRegex(
      ValueError, "`task_type` 'worker' not found in cluster_spec."):
    multi_worker_util.worker_count(cluster_spec, task_type="worker")
def _initialize_multi_worker(self, cluster_resolver):
  """Initializes the object for multi-worker training."""
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`.")
  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id
  self._id_in_cluster = multi_worker_util.id_in_cluster(
      self._cluster_spec, self._task_type, self._task_id)

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker`, `chief` or `evaluator` tasks can be found "
                     "in `cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  if (ops.executing_eagerly_outside_functions() and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    context.context().configure_collective_ops(
        collective_leader=multi_worker_util.collective_leader(
            cluster_spec, task_type, task_id),
        scoped_allocator_enabled_ops=("CollectiveReduce",),
        device_filters=("/job:%s/task:%d" % (task_type, task_id),))
    self._collective_ops_configured = True
    if context.context().coordination_service is None:
      coordinated_jobs = ["chief", "worker"]
      if task_type in coordinated_jobs:
        context.context().configure_coordination_service(
            service_type="standalone",
            service_leader=multi_worker_util.coordination_leader(
                cluster_spec),
            coordinated_jobs=coordinated_jobs)

  # Starting a std server in eager mode and in independent worker mode.
  if (context.executing_eagerly() and
      not getattr(self, "_std_server_started", False) and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    # Checking _local_or_standalone_client_mode as well because we should not
    # create the std server in standalone client mode.
    config_proto = copy.deepcopy(context.context().config)
    config_proto = self._update_config_proto(config_proto)

    # If coordination service is enabled, use its internal heartbeat to detect
    # peer failures instead of the Python-level health check.
    if config_proto.experimental.coordination_config.service_type:
      self._enable_check_health = False

    if hasattr(cluster_resolver, "port"):
      port = cluster_resolver.port
    else:
      port = 0
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_spec.as_cluster_def(),
        default_session_config=config_proto,
        job_name=task_type,
        task_index=task_id,
        protocol=cluster_resolver.rpc_layer or "grpc",
        port=port)
    context.context().enable_collective_ops(server_def)
    self._std_server_started = True
    # The `ensure_initialized` is needed before calling
    # `context.context().devices()`.
    context.context().ensure_initialized()
    logging.info(
        "Enabled multi-worker collective ops with available devices: %r",
        context.context().devices())

  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  local_devices, local_device_type = self._initialize_local_devices(
      cluster_resolver, self._worker_device)
  if local_device_type == "TPU":
    tpu_strategy_util.initialize_tpu_system()

  self._collective_keys = cross_device_utils.CollectiveKeys(
      group_key_start=1 + self._collective_key_base)
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      devices=local_devices,
      group_size=len(local_devices) * self._num_workers,
      options=self._communication_options,
      collective_keys=self._collective_keys)
  # CrossDeviceOps for per host tensors.
  self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      devices=[self._worker_device],
      group_size=self._num_workers,
      options=self._communication_options,
      collective_keys=self._collective_keys)
  super(CollectiveAllReduceExtended,
        self)._initialize_single_worker(local_devices)

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  # Save the num_devices_per_worker and rpc_layer for configure method.
  self._num_devices_per_worker = len(local_devices)
  self._local_device_type = local_device_type
  self._rpc_layer = cluster_resolver.rpc_layer
  self._warn_nccl_no_gpu()

  if self._enable_check_health and context.executing_eagerly():
    self._start_check_health_thread()
  else:
    logging.info("Check health not enabled.")

  logging.info(
      "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
      "task_id = %r, num_workers = %r, local_devices = %r, "
      "communication = %s", cluster_spec.as_dict(), task_type, task_id,
      self._num_workers, local_devices,
      self._communication_options.implementation)
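This initializer runs when the strategy is constructed on a task that has a cluster configured. A hedged sketch of the typical entry point, with placeholder addresses in `TF_CONFIG`:

    import json
    import os
    import tensorflow as tf

    # Hypothetical TF_CONFIG for worker 0 of a two-worker job.
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {"worker": ["host1:12345", "host2:12345"]},
        "task": {"type": "worker", "index": 0}
    })
    # Constructing the strategy resolves the cluster and, in a multi-worker
    # setup, is expected to end up in _initialize_multi_worker above.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()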
def _checkpoint_if_preempted(self):
  """Checkpoint if any worker has received a preemption signal.

  This function handles preemption signal reported by any worker in the
  cluster. The current implementation relies on the fact that all workers in
  a MultiWorkerMirroredStrategy training cluster have a step number
  difference maximum of 1.
  - If the signal comes from the worker itself (i.e., where this failure
  handler sits), the worker will notify all peers to checkpoint after they
  finish CURRENT_STEP+1 steps, where CURRENT_STEP is the step this worker has
  just finished. And the worker will wait for all peers to acknowledge that
  they have received its preemption signal and the final-step number before
  the worker proceeds on training the final step.
  - If the signal comes from another member in the cluster but NO final-step
  info is available, proceed on training, because it will be available after
  finishing the next step.
  - If the signal comes from some other member in the cluster, and final-step
  info is available, if the worker has not finished these steps yet, keep
  training; otherwise, checkpoint and exit with a cluster-recognized restart
  code.
  """
  if self._final_checkpoint_countdown:
    run_count_config_key = _FINAL_RUN_COUNT_KEY
  else:
    run_count_config_key = _INITIAL_RUN_COUNT_KEY

  if self._received_checkpoint_step.is_set():

    run_count_key = context.context().get_config_key_value(
        run_count_config_key)

    if run_count_key == str(self._run_counter):
      self._save_checkpoint()

      if self._time_to_exit():
        self._stop_poll_termination_signal_thread()
        self._stop_cluster_wise_termination_watcher_thread()
        logging.info('WorkerPreemptionHandler: checkpoint saved. Exiting.')
        self._exit_fn()
      else:
        logging.info('Continue training for the grace period.')
        self._final_checkpoint_countdown = True
        self._received_checkpoint_step.clear()

  elif self._received_own_sigterm.is_set():
    # Only the worker who gets termination signal first among the cluster
    # will enter this branch. The following will happen in chronological
    # order:
    # 1. The worker just receives a preemption signal and enters this branch
    # for the first time. It will set a step-to-checkpoint and let the
    # cluster know.
    # 2. If there is a long grace period, it will also set
    # _final_checkpoint_countdown, so that during this grace period, it will
    # re-enter this branch to check if grace period is ending.
    # 3. If it is, set a step-to-checkpoint key again.

    if self._final_checkpoint_countdown:
      if self._target_time_for_termination < time.time():
        logging.info(
            'Grace period almost ended. Final call to save a checkpoint!')
      else:
        return

    step_to_save_at = str(self._run_counter + 1)

    logging.info('Termination caught in main thread on preempted worker')
    context.context().set_config_key_value(run_count_config_key,
                                           step_to_save_at)
    logging.info('%s set to %s', run_count_config_key, step_to_save_at)

    n_workers = multi_worker_util.worker_count(
        self._cluster_resolver.cluster_spec(),
        self._cluster_resolver.task_type)
    for i in range(n_workers):
      context.context().get_config_key_value(
          f'{_ACKNOWLEDGE_KEY}_{run_count_config_key}_{i}')
      logging.info('Sigterm acknowledgement from replica %d received', i)

    self._setup_countdown_if_has_grace_period_and_not_already_counting_down()
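The docstring above describes a handshake built on the cluster-wide config key-value store (in the real API, `get_config_key_value` blocks until some peer sets the key). Below is a toy, single-process sketch of that CURRENT_STEP+1 agreement with a plain dict standing in for the store; none of these names belong to the real implementation:

    # Toy illustration of the step-to-checkpoint handshake; a dict replaces the
    # cluster-wide key-value store and everything runs in one process.
    store = {}

    def preempted_worker_announces(run_counter, n_workers):
      # The preempted worker publishes the step after which everyone saves...
      store['run_count'] = str(run_counter + 1)
      # ...and waits for an acknowledgement key from every peer.
      return all(f'ack_{i}' in store for i in range(n_workers))

    def peer_checks(run_counter, worker_id):
      store[f'ack_{worker_id}'] = 'ok'
      # A peer checkpoints only once it has finished the agreed-upon step.
      return store.get('run_count') == str(run_counter)

    store['ack_0'] = store['ack_1'] = 'ok'
    assert preempted_worker_announces(run_counter=7, n_workers=2)  # save after step 8
    assert not peer_checks(run_counter=7, worker_id=1)             # keep training
    assert peer_checks(run_counter=8, worker_id=1)                 # checkpoint now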