  def testCountPs(self):
    cluster_spec = {
        "chief": ["127.0.0.1:1234"],
        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
    }
    # A "ps" job shouldn't call this method.
    with self.assertRaisesRegex(ValueError, "Unexpected `task_type` 'ps'"):
      multi_worker_util.worker_count(cluster_spec, task_type="ps")
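Taken together, the tests on this page pin down the contract of worker_count: for task_type "chief" or "worker" it returns the combined number of chief and worker tasks, for "evaluator" it returns the number of evaluator tasks, a task_type missing from the cluster_spec raises ValueError, and any other task type (such as "ps") also raises ValueError. The following is a minimal sketch consistent with those expectations; it is illustrative only, not the actual multi_worker_util implementation.

def worker_count_sketch(cluster_spec, task_type):
  # Illustrative reconstruction of the behavior implied by the tests on this page.
  if task_type not in cluster_spec:
    raise ValueError("`task_type` %r not found in cluster_spec." % task_type)
  if task_type in ("chief", "worker"):
    # Chief and worker tasks are counted together.
    return len(cluster_spec.get("chief", [])) + len(cluster_spec.get("worker", []))
  if task_type == "evaluator":
    return len(cluster_spec["evaluator"])
  raise ValueError("Unexpected `task_type` %r" % task_type)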
Example #3
  def testCountWorker(self):
    cluster_spec = {
        "chief": ["127.0.0.1:1234"],
        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
    }
    self.assertEqual(
        multi_worker_util.worker_count(cluster_spec, task_type="chief"), 3)
    self.assertEqual(
        multi_worker_util.worker_count(cluster_spec, task_type="worker"), 3)
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id` in the `cluster_resolver`.")
        if task_type not in ("chief", "worker"):
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)
        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
def get_num_workers():
  cluster_resolver = TFConfigClusterResolver()
  cluster_spec = cluster_resolver.cluster_spec().as_dict()
  if cluster_spec:
    task_type = cluster_resolver.task_type
    return int(multi_worker_util.worker_count(cluster_spec, task_type))
  return 1
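get_num_workers only returns a meaningful count when TFConfigClusterResolver can read a cluster from the TF_CONFIG environment variable; otherwise it falls back to 1. A minimal sketch of the expected setup, with placeholder addresses:

import json
import os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "chief": ["host0:2222"],
        "worker": ["host1:2222", "host2:2222"],
    },
    "task": {"type": "worker", "index": 0},
})
# With this TF_CONFIG in place, get_num_workers() is expected to return 3,
# since chief and worker tasks are counted together.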
Example #7
    def _checkpoint_if_preempted(self):
        """Checkpoint if any worker has received a preemption signal.

    This function handles preemption signal reported by any worker in the
    cluster. The current implementation relies on the fact that all workers in a
    MultiWorkerMirroredStrategy training cluster have a step number difference
    maximum of 1.
    - If the signal comes from the worker itself (i.e., where this failure
    handler sits), the worker will notify all peers to checkpoint after they
    finish CURRENT_STEP+1 steps, where CURRENT_STEP is the step this worker has
    just finished. And the worker will wait for all peers to acknowledge that
    they have received its preemption signal and the final-step number before
    the worker proceeds on training the final step.
    - If the signal comes from another member in the cluster but NO final-step
    info is available, proceed on training, because it will be available after
    finishing the next step.
    - If the signal comes from some other member in the cluster, and final-step
    info is available, if the worker has not finished these steps yet, keep
    training; otherwise, checkpoint and exit with a cluster-recognized restart
    code.
    """
        if self._received_sigterm_and_step.is_set():

            run_count_key = context.context().get_config_key_value(
                _RUN_COUNT_KEY)

            if run_count_key == str(self._run_counter):
                logging.info('Starting checkpoint and exit')

                self._checkpointed_runs.assign(self.total_runs)

                start_time = time.monotonic()
                self._save_checkpoint()
                end_time = time.monotonic()
                logging.info('Checkpoint finished at path %s',
                             self._write_checkpoint_manager.directory)
                logging.info('Checkpoint time: %f', end_time - start_time)

                sys.exit(self._exit_code)

        elif (self._received_own_sigterm.is_set()
              and (context.context().get_config_key_value(_PREEMPTION_KEY)
                   == self._id_in_cluster)):

            logging.info(
                'Termination caught in main thread on preempted worker')

            step_to_save_at = str(self._run_counter + 1)
            context.context().set_config_key_value(_RUN_COUNT_KEY,
                                                   step_to_save_at)
            logging.info('%s set to %s', _RUN_COUNT_KEY, step_to_save_at)

            n_workers = multi_worker_util.worker_count(
                self._cluster_resolver.cluster_spec(),
                self._cluster_resolver.task_type)
            for i in range(n_workers):
                context.context().get_config_key_value(
                    f'{_ACKNOWLEDGE_KEY}_{i}')
                logging.info(
                    'Sigterm acknowledgement from replica %d received', i)
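The docstring above describes a small handshake: the preempted worker publishes the step at which everyone should checkpoint, every peer acknowledges it, and all workers save and exit once they finish that step. Below is a single-process toy model of that handshake, using a plain dict in place of TensorFlow's config-key service; all names in it are illustrative, not TensorFlow APIs.

class _ToyKeyValueStore:
  """Stands in for context set_config_key_value / get_config_key_value."""

  def __init__(self):
    self._data = {}

  def set(self, key, value):
    if key in self._data:
      # Models the AlreadyExistsError another preempted worker would trigger.
      raise KeyError(f"{key} already exists")
    self._data[key] = value

  def get(self, key, default=None):
    return self._data.get(key, default)


class _ToyWorker:

  def __init__(self, worker_id, kv):
    self.worker_id, self.kv, self.step = worker_id, kv, 0

  def receive_preemption(self):
    # The preempted worker announces the last step everyone should finish.
    self.kv.set("save_at_step", self.step + 1)

  def finish_step(self):
    self.step += 1
    self.kv.set(f"ack_{self.worker_id}_{self.step}", True)  # acknowledge
    if self.kv.get("save_at_step") == self.step:
      return "checkpoint_and_exit"
    return "keep_training"


kv = _ToyKeyValueStore()
workers = [_ToyWorker(i, kv) for i in range(2)]
workers[0].receive_preemption()            # worker 0 is preempted after step 0
print([w.finish_step() for w in workers])  # both checkpoint after step 1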
Example #8
def get_num_workers():
    cluster_resolver = TFConfigClusterResolver()
    cluster_spec = cluster_resolver.cluster_spec().as_dict()
    if cluster_spec:
        task_type = cluster_resolver.task_type
        return int(multi_worker_util.worker_count(cluster_spec, task_type))
    return 1
Example #9
  def testCountEvaluator(self):
    cluster_spec = {
        "chief": ["127.0.0.1:1234"],
        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
        "evaluator": ["127.0.0.1:7566"]
    }
    self.assertEqual(
        multi_worker_util.worker_count(cluster_spec, task_type="evaluator"), 1)
  def _initialize_multi_worker(self, cluster_resolver):
    """Initializes the object for multi-worker training."""
    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
    # assumes all workers have the same number of GPUs. We should remove this
    # assumption by querying all tasks for their numbers of GPUs.
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
    cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_resolver.cluster_spec())
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id` in the `cluster_resolver`.")
    if task_type not in ("chief", "worker"):
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)

    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
    if num_gpus:
      local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                            for i in range(num_gpus))
    else:
      local_devices = (self._worker_device,)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(self._worker_device, self.worker_devices)])
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
        task_type, task_id, self._num_workers, local_devices)
def maybe_shard_dataset(dataset):
  """Shard the dataset if running in multi-node environment."""
  cluster_resolver = TFConfigClusterResolver()
  cluster_spec = cluster_resolver.cluster_spec().as_dict()
  if cluster_spec:
    dataset = dataset.shard(
        multi_worker_util.worker_count(cluster_spec,
                                       cluster_resolver.task_type),
        multi_worker_util.id_in_cluster(
            cluster_spec, cluster_resolver.task_type, cluster_resolver.task_id))
  return dataset
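Using maybe_shard_dataset is a one-liner on any tf.data pipeline: with no cluster configured the dataset comes back unchanged, and on an N-worker cluster each worker keeps every N-th element. A small usage sketch, assuming tensorflow is imported as tf and the helper above is in scope:

dataset = tf.data.Dataset.range(8)
dataset = maybe_shard_dataset(dataset)
# With a 2-worker cluster, worker 0 would see 0, 2, 4, 6 and worker 1 would see
# 1, 3, 5, 7; without TF_CONFIG the dataset is returned untouched.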
Example #13
    def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                                 task_type, task_id):
        """Initializes the object for multi-worker training."""
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        if task_type not in ("chief", "worker"):
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)
        if num_gpus_per_worker:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus_per_worker))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        self._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus_per_worker,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_spec)
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
Example #14
    def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                                 cluster_spec, task_type, task_id):
        """Initializes the object for multi-worker training."""
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        if task_type not in ["chief", "worker"]:
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        worker_device = "/job:%s/task:%d" % (task_type, task_id)
        if num_gpus_per_worker:
            local_devices = [
                "%s/device:GPU:%d" % (worker_device, i)
                for i in range(num_gpus_per_worker)
            ]
        else:
            local_devices = [worker_device]

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended, self).__init__(
            container_strategy,
            devices=local_devices,
            cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
                num_workers=self._num_workers,
                num_gpus_per_worker=num_gpus_per_worker,
                collective_keys=self._collective_keys))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_spec)
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
Example #15
def maybe_shard_dataset(dataset):
    """Shard the dataset if running in multi-node environment."""
    cluster_resolver = TFConfigClusterResolver()
    cluster_spec = cluster_resolver.cluster_spec().as_dict()
    if cluster_spec:
        dataset = dataset.shard(
            multi_worker_util.worker_count(cluster_spec,
                                           cluster_resolver.task_type),
            multi_worker_util.id_in_cluster(cluster_spec,
                                            cluster_resolver.task_type,
                                            cluster_resolver.task_id))
    return dataset
def batch_and_maybe_shard_dataset(dataset, global_batch_size):
  """Shard the dataset if running in multi-node environment."""

  cluster_resolver = TFConfigClusterResolver()
  cluster_spec = cluster_resolver.cluster_spec().as_dict()
  if cluster_spec:
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    num_workers = int(multi_worker_util.worker_count(cluster_spec, task_type))
    id_in_cluster = int(
        multi_worker_util.id_in_cluster(cluster_spec, task_type, task_id))
    dataset = dataset.shard(num_workers, id_in_cluster)
  return dataset.batch(global_batch_size)
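batch_and_maybe_shard_dataset differs from maybe_shard_dataset only in that it batches after sharding, so each worker batches its own shard with the given batch size. A usage sketch under the same assumptions as above:

dataset = tf.data.Dataset.range(100)
dataset = batch_and_maybe_shard_dataset(dataset, global_batch_size=16)
# Each worker ends up with batches of 16 elements drawn from its own
# 1/num_workers shard of the data.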
  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                               task_type, task_id):
    """Initializes the object for multi-worker training."""
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    if task_type not in ("chief", "worker"):
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
    if num_gpus_per_worker:
      local_devices = tuple(
          "%s/device:GPU:%d" % (self._worker_device, i)
          for i in range(num_gpus_per_worker)
      )
    else:
      local_devices = (self._worker_device,)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    self._initialize_local(local_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(self._worker_device, self.worker_devices)])
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus_per_worker,
        collective_keys=self._collective_keys)

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
        task_type, task_id, self._num_workers, local_devices)
Example #18
def batch_and_maybe_shard_dataset(dataset, global_batch_size):
    """Shard the dataset if running in multi-node environment."""

    cluster_resolver = TFConfigClusterResolver()
    cluster_spec = cluster_resolver.cluster_spec().as_dict()
    if cluster_spec:
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        num_workers = int(
            multi_worker_util.worker_count(cluster_spec, task_type))
        id_in_cluster = int(
            multi_worker_util.id_in_cluster(cluster_spec, task_type, task_id))
        dataset = dataset.shard(num_workers, id_in_cluster)
    return dataset.batch(global_batch_size)
  def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                               cluster_spec, task_type, task_id):
    """Initializes the object for multi-worker training."""
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    if task_type not in ["chief", "worker"]:
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    if num_gpus_per_worker:
      local_devices = [
          "%s/device:GPU:%d" % (worker_device, i)
          for i in range(num_gpus_per_worker)
      ]
    else:
      local_devices = [worker_device]

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self).__init__(
        container_strategy,
        devices=local_devices,
        cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus_per_worker,
            collective_keys=self._collective_keys))

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
        task_type, task_id, self._num_workers, local_devices)
Example #20
    def _experimental_distribute_datasets_from_function(self, dataset_fn):
        if self._cluster_spec:
            input_pipeline_id = multi_worker_util.id_in_cluster(
                self._cluster_spec, self._task_type, self._task_id)
            num_input_pipelines = multi_worker_util.worker_count(
                self._cluster_spec, self._task_type)
        else:
            input_pipeline_id = 0
            num_input_pipelines = 1

        input_context = distribute_lib.InputContext(
            num_input_pipelines=num_input_pipelines,
            input_pipeline_id=input_pipeline_id,
            num_replicas_in_sync=self._num_replicas_in_sync)

        return input_lib.get_distributed_datasets_from_function(
            dataset_fn, self._input_workers, [input_context],
            self._container_strategy())
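On the user side, the dataset_fn handed to this method receives the tf.distribute.InputContext constructed here and typically uses it to derive a per-replica batch size and to shard the input. A hedged sketch of such a dataset_fn; the batch size and data source are placeholders:

GLOBAL_BATCH_SIZE = 64  # placeholder

def dataset_fn(input_context):
  # Split the global batch size across the replicas in sync.
  batch_size = input_context.get_per_replica_batch_size(GLOBAL_BATCH_SIZE)
  dataset = tf.data.Dataset.range(1000)
  # Shard by input pipeline so each worker reads a disjoint slice of the data.
  dataset = dataset.shard(input_context.num_input_pipelines,
                          input_context.input_pipeline_id)
  return dataset.batch(batch_size)

# Depending on the TensorFlow version, this is driven through
# strategy.experimental_distribute_datasets_from_function(dataset_fn) or
# strategy.distribute_datasets_from_function(dataset_fn).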
 def _make_input_fn_iterator(
     self,
     input_fn,
     replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
   """Distributes the dataset to each local GPU."""
   if self._cluster_spec:
     input_pipeline_id = multi_worker_util.id_in_cluster(
         self._cluster_spec, self._task_type, self._task_id)
     num_input_pipelines = multi_worker_util.worker_count(
         self._cluster_spec, self._task_type)
   else:
     input_pipeline_id = 0
     num_input_pipelines = 1
   input_context = distribute_lib.InputContext(
       num_input_pipelines=num_input_pipelines,
       input_pipeline_id=input_pipeline_id,
       num_replicas_in_sync=self._num_replicas_in_sync)
   return input_lib.InputFunctionIterator(input_fn, self._input_workers,
                                          [input_context])
Example #23
  def _make_input_fn_iterator(
      self,
      input_fn,
      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
    """Distributes the dataset to each local GPU."""
    if self._cluster_spec:
      input_pipeline_id = multi_worker_util.id_in_cluster(
          self._cluster_spec, self._task_type, self._task_id)
      num_input_pipelines = multi_worker_util.worker_count(
          self._cluster_spec, self._task_type)
    else:
      input_pipeline_id = 0
      num_input_pipelines = 1
    input_context = distribute_lib.InputContext(
        num_input_pipelines=num_input_pipelines,
        input_pipeline_id=input_pipeline_id,
        num_replicas_in_sync=self.num_replicas_in_sync)
    return values.PerReplicaDataset(
        self._call_dataset_fn(input_fn, input_context),
        self._compute_devices, True)
  def testTaskTypeNotFound(self):
    cluster_spec = {}
    with self.assertRaisesRegex(
        ValueError, "`task_type` 'worker' not found in cluster_spec."):
      multi_worker_util.worker_count(cluster_spec, task_type="worker")
Example #25
    def _checkpoint_if_preempted(self):
        """Checkpoint if any worker has received a preemption signal.

    This function handles preemption signal reported by any worker in the
    cluster. The current implementation relies on the fact that all workers in a
    MultiWorkerMirroredStrategy training cluster have a step number difference
    maximum of 1.
    - If the signal comes from the worker itself (i.e., where this failure
    handler sits), the worker will notify all peers to checkpoint after they
    finish CURRENT_STEP+1 steps, where CURRENT_STEP is the step this worker has
    just finished. And the worker will wait for all peers to acknowledge that
    they have received its preemption signal and the final-step number before
    the worker proceeds on training the final step.
    - If the signal comes from another member in the cluster but NO final-step
    info is available, proceed on training, because it will be available after
    finishing the next step.
    - If the signal comes from some other member in the cluster, and final-step
    info is available, if the worker has not finished these steps yet, keep
    training; otherwise, checkpoint and exit with a cluster-recognized restart
    code.
    """
        if self._received_sigterm_and_step.is_set():

            run_count_key = context.context().get_config_key_value(
                _RUN_COUNT_KEY)

            if run_count_key == str(self._run_counter):
                self._save_checkpoint_and_exit()

        elif self._received_own_sigterm.is_set():

            step_to_save_at = str(self._run_counter + 1)

            try:
                context.context().set_config_key_value(_RUN_COUNT_KEY,
                                                       step_to_save_at)
                logging.info(
                    'Termination caught in main thread on preempted worker')
                logging.info('%s set to %s', _RUN_COUNT_KEY, step_to_save_at)

                n_workers = multi_worker_util.worker_count(
                    self._cluster_resolver.cluster_spec(),
                    self._cluster_resolver.task_type)
                for i in range(n_workers):
                    context.context().get_config_key_value(
                        f'{_ACKNOWLEDGE_KEY}_{i}')
                    logging.info(
                        'Sigterm acknowledgement from replica %d received', i)
            # This is to handle the case that some other worker receives termination
            # notice as well, and it has made a step key available right before this
            # worker attempts to set it. In this case, it incurs a config key
            # AlreadyExistsError.
            # With MultiWorkerMirroredStrategy, every step contains collective ops
            # (all-reduce, all-gather, etc.) that require the participation of all
            # workers, which forms a synchronization point. Thus the max difference in
            # the training progresses made by the workers is less than one complete
            # step (e.g., one worker is finishing up the post-collective ops part of
            # step N, and another is doing the pre-collective ops part of step N+1.)
            #
            # We can safely ignore this AlreadyExistsError. Say both worker-a and
            # worker-b have received preemption notice, and worker-b encounters an
            # AlreadyExistsError here because worker-a has already uploaded a value as
            # the last step to finish before saving a checkpoint. Assume worker-b has
            # finished step N and attempt to set the last step as N+1. If the training
            # progress made by worker-a is ahead of that of worker-b, then worker-a
            # must be running step N+1 due to the mechanism mentioned above and has
            # set the last step as step N+1. If worker-a is behind worker-b, then it
            # cannot possibly have set the last step as step N (not to mention a step
            # number less than N). Because worker-a would only do so before executing
            # step N. Consider that when a worker resolves and sets the last step to
            # finish, it waits until receiving acknowledgment from all workers before
            # continuing to train the next step. And thus, in this case, worker-b
            # would never have finished step N, which requires the participation of
            # worker-a. In either case, we can safely ignore the error and revisit the
            # checkpoint and exit option after finishing the current step, step N+1.
            # At that time, it will be re-direct to the earlier branch.
            except errors.AlreadyExistsError:
                logging.info(
                    'Member %s has received termination notice. But some other'
                    ' worker has received it as well! Leaving'
                    ' it to them to decide when to checkpoint. ',
                    self._id_in_cluster)

                return
Example #26
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                use_nccl_communication=(
                    self._communication ==
                    cross_device_ops_lib.CollectiveCommunication.NCCL),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = config_pb2.ConfigProto()
            config_proto = self._update_config_proto(config_proto)
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc")
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
            "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices, self._communication)
  def _initialize_multi_worker(self, cluster_resolver):
    """Initializes the object for multi-worker training."""
    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
    # assumes all workers have the same number of GPUs. We should remove this
    # assumption by querying all tasks for their numbers of GPUs.
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if isinstance(cluster_resolver, TFConfigClusterResolver):
      num_gpus = context.num_gpus()
    else:
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

    cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_resolver.cluster_spec())
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`.")

    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
      raise ValueError("No `worker`, `chief` or `evaluator` tasks can be found "
                       "in `cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
    if num_gpus:
      local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                            for i in range(num_gpus))
    else:
      local_devices = (self._worker_device,)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(self._worker_device, self.worker_devices)])
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    # Save the num_gpus_per_worker and rpc_layer for configure method.
    self._num_gpus_per_worker = num_gpus
    self._rpc_layer = cluster_resolver.rpc_layer

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
        "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
        "communication = %s", cluster_spec.as_dict(), task_type,
        task_id, self._num_workers, local_devices,
        self._communication)

    if (context.executing_eagerly() and
        not getattr(self, "_std_server_started", False) and
        not getattr(self, "_local_or_standalone_client_mode", False)):
      # Checking _local_or_standalone_client_mode as well because we should not
      # create the std server in standalone client mode.
      config_proto = config_pb2.ConfigProto()
      config_proto = self._update_config_proto(config_proto)
      server_def = tensorflow_server_pb2.ServerDef(
          cluster=cluster_spec.as_cluster_def(),
          default_session_config=config_proto,
          job_name=task_type,
          task_index=task_id,
          protocol=cluster_resolver.rpc_layer or "grpc")
      context.context().enable_collective_ops(server_def)
      self._std_server_started = True
      logging.info(
          "Enabled multi-worker collective ops with available devices: %r",
          context.context().devices())
  def testTaskTypeNotFound(self):
    cluster_spec = {}
    with self.assertRaisesRegex(
        ValueError, "`task_type` 'worker' not found in cluster_spec."):
      multi_worker_util.worker_count(cluster_spec, task_type="worker")
Example #29
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id
        self._id_in_cluster = multi_worker_util.id_in_cluster(
            self._cluster_spec, self._task_type, self._task_id)

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True
            if context.context().coordination_service is None:
                coordinated_jobs = ["chief", "worker"]
                if task_type in coordinated_jobs:
                    context.context().configure_coordination_service(
                        service_type="standalone",
                        service_leader=multi_worker_util.coordination_leader(
                            cluster_spec),
                        coordinated_jobs=coordinated_jobs)

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = copy.deepcopy(context.context().config)
            config_proto = self._update_config_proto(config_proto)

            # If coordination service is enabled, use its internal heartbeat to detect
            # peer failures instead of the Python-level health check.
            if config_proto.experimental.coordination_config.service_type:
                self._enable_check_health = False

            if hasattr(cluster_resolver, "port"):
                port = cluster_resolver.port
            else:
                port = 0
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc",
                port=port)
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        local_devices, local_device_type = self._initialize_local_devices(
            cluster_resolver, self._worker_device)
        if local_device_type == "TPU":
            tpu_strategy_util.initialize_tpu_system()

        self._collective_keys = cross_device_utils.CollectiveKeys(
            group_key_start=1 + self._collective_key_base)
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=local_devices,
            group_size=len(local_devices) * self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        # CrossDeviceOps for per host tensors.
        self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=[self._worker_device],
            group_size=self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        super(CollectiveAllReduceExtended,
              self)._initialize_single_worker(local_devices)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_devices_per_worker and rpc_layer for configure method.
        self._num_devices_per_worker = len(local_devices)
        self._local_device_type = local_device_type
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        if self._enable_check_health and context.executing_eagerly():
            self._start_check_health_thread()
        else:
            logging.info("Check health not enabled.")

        logging.info(
            "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
            "task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices,
            self._communication_options.implementation)
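All of the _initialize_multi_worker variants on this page run indirectly when a collective strategy is constructed in a process whose cluster resolver reports a task_type and task_id. A minimal sketch of that entry point with a placeholder cluster; one such process must be started per task, and the class lives under tf.distribute.MultiWorkerMirroredStrategy in recent releases (tf.distribute.experimental.MultiWorkerMirroredStrategy in older ones):

import json
import os

import tensorflow as tf

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["host1:2222", "host2:2222"]},
    "task": {"type": "worker", "index": 0},
})

strategy = tf.distribute.MultiWorkerMirroredStrategy()
# num_replicas_in_sync equals num_workers times the number of local devices
# discovered by the initialization code above.
print("Replicas in sync:", strategy.num_replicas_in_sync)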
Example #30
    def _checkpoint_if_preempted(self):
        """Checkpoint if any worker has received a preemption signal.

    This function handles preemption signal reported by any worker in the
    cluster. The current implementation relies on the fact that all workers in a
    MultiWorkerMirroredStrategy training cluster have a step number difference
    maximum of 1.
    - If the signal comes from the worker itself (i.e., where this failure
    handler sits), the worker will notify all peers to checkpoint after they
    finish CURRENT_STEP+1 steps, where CURRENT_STEP is the step this worker has
    just finished. And the worker will wait for all peers to acknowledge that
    they have received its preemption signal and the final-step number before
    the worker proceeds on training the final step.
    - If the signal comes from another member in the cluster but NO final-step
    info is available, proceed on training, because it will be available after
    finishing the next step.
    - If the signal comes from some other member in the cluster, and final-step
    info is available, if the worker has not finished these steps yet, keep
    training; otherwise, checkpoint and exit with a cluster-recognized restart
    code.
    """
        if self._final_checkpoint_countdown:
            run_count_config_key = _FINAL_RUN_COUNT_KEY

        else:
            run_count_config_key = _INITIAL_RUN_COUNT_KEY

        if self._received_checkpoint_step.is_set():

            run_count_key = context.context().get_config_key_value(
                run_count_config_key)

            if run_count_key == str(self._run_counter):
                self._save_checkpoint()

                if self._time_to_exit():
                    self._stop_poll_termination_signal_thread()
                    self._stop_cluster_wise_termination_watcher_thread()
                    logging.info(
                        'WorkerPreemptionHandler: checkpoint saved. Exiting.')
                    self._exit_fn()

                else:
                    logging.info('Continue training for the grace period.')
                    self._final_checkpoint_countdown = True
                    self._received_checkpoint_step.clear()

        elif self._received_own_sigterm.is_set():
            # Only the worker who gets termination signal first among the cluster
            # will enter this branch. The following will happen in chronological
            # order:
            # 1. The worker just receives a preemption signal and enters this branch
            # for the first time. It will set a step-to-checkpoint and let the cluster
            # know.
            # 2. If there is a long grace period, it will also set
            # _final_checkpoint_countdown, so that during this grace period, it will
            # re-enter this branch to check if grace period is ending.
            # 3. If it is, set a step-to-checkpoint key again.

            if self._final_checkpoint_countdown:
                if self._target_time_for_termination < time.time():
                    logging.info(
                        'Grace period almost ended. Final call to save a checkpoint!'
                    )
                else:
                    return

            step_to_save_at = str(self._run_counter + 1)

            logging.info(
                'Termination caught in main thread on preempted worker')
            context.context().set_config_key_value(run_count_config_key,
                                                   step_to_save_at)
            logging.info('%s set to %s', run_count_config_key, step_to_save_at)

            n_workers = multi_worker_util.worker_count(
                self._cluster_resolver.cluster_spec(),
                self._cluster_resolver.task_type)
            for i in range(n_workers):
                context.context().get_config_key_value(
                    f'{_ACKNOWLEDGE_KEY}_{run_count_config_key}_{i}')
                logging.info(
                    'Sigterm acknowledgement from replica %d received', i)

            self._setup_countdown_if_has_grace_period_and_not_already_counting_down(
            )