def testClusterWithChief(self):
   cluster_spec = {
       "chief": ["127.0.0.1:1234"],
       "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
       "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
   }
   self.assertTrue(multi_worker_util.is_chief(cluster_spec, "chief", 0))
   self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 0))
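The rule this test exercises can also be checked directly outside a test class. A minimal, hedged sketch, assuming the private `tensorflow.python.distribute.multi_worker_util` import used by the surrounding test code:

# Hedged illustration: is_chief on plain cluster dicts, mirroring the tests above.
from tensorflow.python.distribute import multi_worker_util

cluster_with_chief = {
    "chief": ["127.0.0.1:1234"],
    "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
}
cluster_without_chief = {"worker": ["127.0.0.1:8964", "127.0.0.1:2333"]}

# With a dedicated "chief" job, only its task 0 is chief.
print(multi_worker_util.is_chief(cluster_with_chief, "chief", 0))   # True
print(multi_worker_util.is_chief(cluster_with_chief, "worker", 0))  # False

# Without one, worker 0 acts as chief.
print(multi_worker_util.is_chief(cluster_without_chief, "worker", 0))  # True
print(multi_worker_util.is_chief(cluster_without_chief, "worker", 1))  # False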
Example #3
 def _maybe_create_checkpoint_manager(self,
                                      checkpoint_or_checkpoint_manager,
                                      checkpoint_dir, cluster_resolver):
     """Create CheckpointManager(s) if a checkpoint is passed else take it."""
     if isinstance(checkpoint_or_checkpoint_manager,
                   checkpoint_management.CheckpointManager):
         self._read_checkpoint_manager = checkpoint_or_checkpoint_manager
         self._write_checkpoint_manager = checkpoint_or_checkpoint_manager
         self._api_made_checkpoint_manager = False
     else:
         self._api_made_checkpoint_manager = True
         # Make CheckpointManagers. MultiWorkerMirroredStrategy requires different
         # setup on chief and on other workers.
         self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
             checkpoint_or_checkpoint_manager,
             directory=checkpoint_dir,
             max_to_keep=1)
         if multi_worker_util.is_chief(
                 cluster_spec=cluster_resolver.cluster_spec(),
                 task_type=cluster_resolver.task_type,
                 task_id=cluster_resolver.task_id):
             self._write_checkpoint_manager = self._read_checkpoint_manager
         else:
             self._write_checkpoint_manager = (
                 checkpoint_management.CheckpointManager(
                     checkpoint_or_checkpoint_manager,
                     _non_chief_checkpoint_dir(checkpoint_dir,
                                               cluster_resolver.task_id),
                     max_to_keep=1))
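`_non_chief_checkpoint_dir` is referenced above but not defined on this page. A hedged sketch of what such a helper could look like, modeled on the `workertemp_<task_id>` convention of the `_mwms_write_checkpoint_dir` helpers further down; the real implementation may differ:

import os


def _non_chief_checkpoint_dir(checkpoint_dir, task_id):
  """Hypothetical helper: a per-worker temporary checkpoint directory."""
  # Mirrors the workertemp_<task_id> pattern used elsewhere on this page;
  # this is an assumption, not the actual failure_handling implementation.
  dirpath = os.path.dirname(checkpoint_dir)
  base = os.path.basename(checkpoint_dir)
  return os.path.join(dirpath, "workertemp_" + str(task_id), base)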
Example #4
 def testEvaluatorIsChief(self):
   cluster_spec = {
       "chief": ["127.0.0.1:1234"],
       "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
       "evaluator": ["127.0.0.1:2019"]
   }
   self.assertTrue(multi_worker_util.is_chief(cluster_spec, "evaluator", 0))
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id` in the `cluster_resolver`.")
        if task_type not in ("chief", "worker"):
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)
        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
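In practice, the `cluster_resolver` consulted above usually derives `cluster_spec()`, `task_type` and `task_id` from the `TF_CONFIG` environment variable. A hedged sketch with placeholder addresses and task index:

# Hedged sketch: populate TF_CONFIG so a resolver reports the fields that
# _initialize_multi_worker reads above.
import json
import os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "chief": ["127.0.0.1:1234"],
        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
    },
    "task": {"type": "worker", "index": 1},
})

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
print(resolver.task_type, resolver.task_id)   # "worker" 1
print(resolver.cluster_spec().as_dict())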
Example #6
    def _save_checkpoint_and_exit(self):
        """Saves the checkpoint and exit program."""
        logging.info('Starting checkpoint and exit')
        self._checkpointed_runs.assign(self.total_runs)

        start_time = time.monotonic()

        self._write_checkpoint_manager.save()
        # All workers need to participate in saving a checkpoint to avoid
        # deadlock. They need to write to different paths so that they would not
        # override each other. We make temporary directories for non-chief
        # workers to write to, and clean them up afterward.
        if not multi_worker_util.is_chief(
                cluster_spec=self._cluster_resolver.cluster_spec(),
                task_type=self._cluster_resolver.task_type,
                task_id=self._cluster_resolver.task_id):
            gfile.DeleteRecursively(
                os.path.dirname(self._write_checkpoint_manager.directory))

        end_time = time.monotonic()

        logging.info('Checkpoint finished at path %s',
                     self._write_checkpoint_manager.directory)
        logging.info('Checkpoint time: %f', end_time - start_time)
        self._stop_poll_termination_signal_thread()
        self._stop_cluster_wise_termination_watcher_thread()
        sys.exit(self._restart_code)
  def _initialize_multi_worker(self, cluster_resolver):
    """Initializes the object for multi-worker training."""
    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
    # assumes all workers have the same number of GPUs. We should remove this
    # assumption by querying all tasks for their numbers of GPUs.
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
    cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_resolver.cluster_spec())
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id` in the `cluster_resolver`.")
    if task_type not in ("chief", "worker"):
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)

    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
    if num_gpus:
      local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                            for i in range(num_gpus))
    else:
      local_devices = (self._worker_device,)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(self._worker_device, self.worker_devices)])
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
        task_type, task_id, self._num_workers, local_devices)
Example #8
 def _mwms_write_checkpoint_dir(self, checkpoint_dir, cluster_spec, task_type,
                                task_id):
   dirpath = os.path.dirname(checkpoint_dir)
   base = os.path.basename(checkpoint_dir)
   if not multi_worker_util.is_chief(
       cluster_spec=cluster_spec, task_type=task_type, task_id=task_id):
     base_dirpath = 'workertemp_' + str(task_id)
     dirpath = os.path.join(dirpath, base_dirpath)
     gfile.MakeDirs(dirpath)
   return os.path.join(dirpath, base)
Example #9
def _mwms_write_checkpoint_dir(checkpoint_dir, task_type, task_id,
                               cluster_spec):
    """Returns checkpoint_dir for chief and a temp dir for any other worker."""
    dirpath = os.path.dirname(checkpoint_dir)
    base = os.path.basename(checkpoint_dir)
    if not multi_worker_util.is_chief(
            cluster_spec=cluster_spec, task_type=task_type, task_id=task_id):
        base_dirpath = 'workertemp_' + str(task_id)
        dirpath = os.path.join(dirpath, base_dirpath)
        gfile.MakeDirs(dirpath)
    return os.path.join(dirpath, base)
def _make_checkpoint_manager(checkpoint, checkpoint_dir, cluster_resolver):
    if multi_worker_util.is_chief(cluster_spec=cluster_resolver.cluster_spec(),
                                  task_type=cluster_resolver.task_type,
                                  task_id=cluster_resolver.task_id):
        return checkpoint_management.CheckpointManager(
            checkpoint, directory=checkpoint_dir, max_to_keep=1)
    else:
        return checkpoint_management.CheckpointManager(
            checkpoint,
            directory=failure_handling._non_chief_checkpoint_dir(
                checkpoint_dir, cluster_resolver.task_id),
            max_to_keep=1)
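A hedged usage sketch for the helpers above: every worker saves, to avoid the collective-save deadlock noted elsewhere on this page, but only the chief writes into `checkpoint_dir` itself. The directory and the checkpointed variable below are placeholders:

import tensorflow as tf

checkpoint_dir = "/tmp/ckpt"  # placeholder
resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

# Chief writes to checkpoint_dir; other workers write under workertemp_<id>.
write_dir = _mwms_write_checkpoint_dir(
    checkpoint_dir, resolver.task_type, resolver.task_id,
    resolver.cluster_spec())

checkpoint = tf.train.Checkpoint(step=tf.Variable(0, dtype=tf.int64))
manager = tf.train.CheckpointManager(checkpoint, directory=write_dir,
                                     max_to_keep=1)
manager.save()  # all workers participate; non-chief temp dirs are cleaned up later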
Example #11
    def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                                 cluster_spec, task_type, task_id):
        """Initializes the object for multi-worker training."""
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        if task_type not in ["chief", "worker"]:
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        worker_device = "/job:%s/task:%d" % (task_type, task_id)
        if num_gpus_per_worker:
            local_devices = [
                "%s/device:GPU:%d" % (worker_device, i)
                for i in range(num_gpus_per_worker)
            ]
        else:
            local_devices = [worker_device]

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended, self).__init__(
            container_strategy,
            devices=local_devices,
            cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
                num_workers=self._num_workers,
                num_gpus_per_worker=num_gpus_per_worker,
                collective_keys=self._collective_keys))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_spec)
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
Example #12
    def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                                 task_type, task_id):
        """Initializes the object for multi-worker training."""
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        if task_type not in ("chief", "worker"):
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)
        if num_gpus_per_worker:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus_per_worker))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        self._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus_per_worker,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_spec)
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
Example #13
 def _save_checkpoint(self):
     """Saves the checkpoint."""
     self._write_checkpoint_manager.save()
     # All workers need to participate in saving a checkpoint to avoid
     # deadlock. They need to write to different paths so that they would not
     # override each other. We make temporary directories for non-chief
     # workers to write to, and clean them up afterward.
     if not multi_worker_util.is_chief(
             cluster_spec=self._cluster_resolver.cluster_spec(),
             task_type=self._cluster_resolver.task_type,
             task_id=self._cluster_resolver.task_id):
         gfile.DeleteRecursively(
             os.path.dirname(self._write_checkpoint_manager.directory))
    def _initialize(self, cluster_spec, task_type, task_id):
        if cluster_spec:
            if task_type is None or task_id is None:
                raise ValueError(
                    "When `cluster_spec` is given, you must also specify "
                    "`task_type` and `task_id`")
            if task_type not in ["chief", "worker"]:
                raise ValueError(
                    "Unrecognized task_type: %r, valid task types are: \"chief\", "
                    "\"worker\"." % task_type)
            self._cluster_spec = multi_worker_util.normalize_cluster_spec(
                cluster_spec)
            worker_device = "/job:%s/task:%d" % (task_type, task_id)
            num_workers = len(self._cluster_spec.as_dict().get(
                "worker", [])) + len(self._cluster_spec.as_dict().get(
                    "chief", []))
            if not num_workers:
                raise ValueError(
                    "No `worker` or `chief` tasks can be found in "
                    "`cluster_spec`.")

            self._is_chief = multi_worker_util.is_chief(
                cluster_spec, task_type, task_id)
        else:
            self._cluster_spec = None
            self._is_chief = True
            worker_device = ""
            num_workers = 1
        self._num_workers = num_workers

        if self._num_gpus_per_worker:
            local_devices = [
                "%s/device:GPU:%d" % (worker_device, i)
                for i in range(self._num_gpus_per_worker)
            ]
        else:
            local_devices = [worker_device]

        self._collective_keys = cross_tower_utils.CollectiveKeys()
        super(CollectiveAllReduceStrategy, self).__init__(
            devices=local_devices,
            cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
                num_workers=num_workers,
                num_gpus_per_worker=self._num_gpus_per_worker,
                collective_keys=self._collective_keys))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        if cluster_spec:
            self._default_device = "/job:%s/replica:0/task:%d" % (task_type,
                                                                  task_id)
  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                               task_type, task_id):
    """Initializes the object for multi-worker training."""
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    if task_type not in ("chief", "worker"):
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
    if num_gpus_per_worker:
      local_devices = tuple(
          "%s/device:GPU:%d" % (self._worker_device, i)
          for i in range(num_gpus_per_worker)
      )
    else:
      local_devices = (self._worker_device,)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    self._initialize_local(local_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(self._worker_device, self.worker_devices)])
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus_per_worker,
        collective_keys=self._collective_keys)

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
        task_type, task_id, self._num_workers, local_devices)
  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                               task_type, task_id):
    """Initializes the object for multi-worker training."""
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    if task_type not in ["chief", "worker"]:
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._num_workers = len(cluster_spec.as_dict().get("worker", [])) + len(
        cluster_spec.as_dict().get("chief", []))
    if not self._num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    if num_gpus_per_worker:
      local_devices = [
          "%s/device:GPU:%d" % (worker_device, i)
          for i in range(num_gpus_per_worker)
      ]
    else:
      local_devices = [worker_device]

    self._collective_keys = cross_tower_utils.CollectiveKeys()
    super(CollectiveAllReduceStrategy, self).__init__(
        devices=local_devices,
        cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus_per_worker,
            collective_keys=self._collective_keys))

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
        task_type, task_id, self._num_workers, local_devices)
  def testClusterWithoutChief(self):
    cluster_spec = {"worker": ["127.0.0.1:8964", "127.0.0.1:2333"]}
    self.assertTrue(multi_worker_util.is_chief(cluster_spec, "worker", 0))
    self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 1))

    with self.assertRaisesRegexp(
        ValueError, "The task_type \"chief\" is not in the `cluster_spec`."):
      multi_worker_util.is_chief(cluster_spec, "chief", 0)

    with self.assertRaisesRegexp(
        ValueError, "The `task_id` 2 exceeds the maximum id of worker."):
      multi_worker_util.is_chief(cluster_spec, "worker", 2)
Example #19
0
 def on_train_begin(self, logs):
     if not multi_worker_util.is_chief():
         # Non-chief workers shouldn't run this callback.
         self.filtered_correctly = False
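The callback above relies on `is_chief()` resolving the current task implicitly. A hedged, more explicit variant passes the resolver fields, following the signature used throughout these examples; the resolver and the chief-only action are placeholders:

import tensorflow as tf
from tensorflow.python.distribute import multi_worker_util


class ChiefOnlyLogger(tf.keras.callbacks.Callback):
  """Hypothetical callback that only acts on the chief worker."""

  def __init__(self, cluster_resolver):
    super().__init__()
    self._resolver = cluster_resolver

  def on_epoch_end(self, epoch, logs=None):
    if multi_worker_util.is_chief(
        cluster_spec=self._resolver.cluster_spec(),
        task_type=self._resolver.task_type,
        task_id=self._resolver.task_id):
      print("epoch", epoch, "logs", logs)  # chief-only side effect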
Example #20
    def _test_minimize_loss_graph(self,
                                  task_type,
                                  task_id,
                                  num_gpus,
                                  use_core_strategy=False):
        d, master_target, sess_config = self._get_test_objects(
            task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
        if task_type:
            # Multi-worker
            assert hasattr(d.extended,
                           '_cluster_spec') and d.extended._cluster_spec
            num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
            if CHIEF in d.extended._cluster_spec.as_dict():
                num_workers += 1
        else:
            # local
            num_workers = 1

        with ops.Graph().as_default(), \
             self.cached_session(target=master_target,
                                 config=sess_config) as sess, \
             d.scope():
            l = core.Dense(1, use_bias=False)

            def loss_fn(x):
                y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
                return y * y

            # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
            # multiple graphs (b/111216820).
            def grad_fn(x):
                loss = loss_fn(x)
                var_list = (variables.trainable_variables() +
                            ops.get_collection(
                                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
                grads = gradients.gradients(loss, var_list)
                ret = list(zip(grads, var_list))
                return ret

            def update(v, g):
                return v.assign_sub(0.05 * g, use_locking=True)

            one = constant_op.constant([[1.]])

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.extended.call_for_each_replica(grad_fn, args=(one, ))
                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.extended.read_var(v)
                    before_list.append(fetched)
                    with ops.control_dependencies([fetched]):
                        # TODO(yuefengz): support non-Mirrored variable as destinations.
                        g = d.extended.reduce_to(reduce_util.ReduceOp.SUM,
                                                 g,
                                                 destinations=v)
                        with ops.control_dependencies(
                                d.extended.update(v,
                                                  update,
                                                  args=(g, ),
                                                  group=False)):
                            after_list.append(d.extended.read_var(v))
                return before_list, after_list

            before_out, after_out = step()

            if context.num_gpus() < d.extended._num_gpus_per_worker:
                return True

            if (not task_type or multi_worker_util.is_chief(
                    d.extended._cluster_spec, task_type, task_id)):
                variables.global_variables_initializer().run()

            # Workers waiting for chief worker's initializing variables.
            self._init_condition.acquire()
            self._init_reached += 1
            while self._init_reached != num_workers:
                self._init_condition.wait()
            self._init_condition.notify_all()
            self._init_condition.release()

            for i in range(10):
                b, a = sess.run((before_out, after_out))
                if i == 0:
                    before, = b
                after, = a

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
            return error_after < error_before
  def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
    d, master_target, sess_config = self._get_test_objects(
        task_type, task_id, num_gpus)
    assert hasattr(d, '_cluster_spec') and d._cluster_spec
    num_workers = len(d._cluster_spec.as_dict().get(WORKER))
    if CHIEF in d._cluster_spec.as_dict():
      num_workers += 1

    with ops.Graph().as_default(), \
         self.test_session(target=master_target,
                           config=sess_config) as sess, \
         d.scope():
      l = core.Dense(1, use_bias=False)

      def loss_fn(x):
        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
        return y * y

      # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
      # multiple graphs (b/111216820).
      def grad_fn(x):
        loss = loss_fn(x)
        var_list = (
            variables.trainable_variables() + ops.get_collection(
                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        grads = gradients.gradients(loss, var_list)
        ret = list(zip(grads, var_list))
        return ret

      def update(v, g):
        return v.assign_sub(0.05 * g, use_locking=True)

      one = d.broadcast(constant_op.constant([[1.]]))

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        g_v = d.call_for_each_tower(grad_fn, one)
        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.read_var(v)
          before_list.append(fetched)
          with ops.control_dependencies([fetched]):
            # TODO(yuefengz): support non-Mirrored variable as destinations.
            g = d.reduce(
                variable_scope.VariableAggregation.SUM, g, destinations=v)
            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
              after_list.append(d.read_var(v))
        return before_list, after_list

      before_out, after_out = step()

      if context.num_gpus() < d._num_gpus_per_worker:
        return True

      if multi_worker_util.is_chief(d._cluster_spec, task_type, task_id):
        variables.global_variables_initializer().run()

      # Workers waiting for chief worker's initializing variables.
      self._init_condition.acquire()
      self._init_reached += 1
      while self._init_reached != num_workers:
        self._init_condition.wait()
      self._init_condition.notify_all()
      self._init_condition.release()

      for i in range(10):
        b, a = sess.run((before_out, after_out))
        if i == 0:
          before, = b
        after, = a

      error_before = abs(before - 1)
      error_after = abs(after - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
      return error_after < error_before
  def _initialize_devices(self, num_gpus_per_worker, cluster_spec, task_type,
                          task_id):
    """Initialize internal devices.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per tower.
    The variable device is a device function or device string. The default
    variable device assigns variables to parameter servers in a round-robin
    fashion.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations.
      task_type: the current task type.
      task_id: the current task id.

    Raises:
      ValueError: if the cluster_spec doesn't have ps jobs.
    """
    self._task_type = task_type or "worker"
    self._task_id = task_id or 0
    self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)

    # TODO(yuefengz): maybe clearer to split it into two classes, one for
    # the distributed case and one for the local case, once we have the factory
    # class/method.

    # Define compute devices which is a list of device strings and one for each
    # tower. When there are GPUs, replicate operations on these GPUs. Otherwise,
    # place operations on CPU.
    if cluster_spec is None:
      # Local mode.
      if num_gpus_per_worker > 0:
        self._compute_devices = list(
            map("/device:GPU:{}".format, range(num_gpus_per_worker)))
      else:
        self._compute_devices = [_LOCAL_CPU]
    else:
      # Distributed mode.
      if num_gpus_per_worker > 0:
        self._compute_devices = [
            "%s/device:GPU:%d" % (self._worker_device, i)
            for i in range(num_gpus_per_worker)
        ]
      else:
        self._compute_devices = [self._worker_device]

    self._compute_devices = list(
        map(device_util.resolve, self._compute_devices))
    self._canonical_compute_device_set = set(self._compute_devices)

    # Define variable device which is a device string in the local case and a
    # device function in the distributed case. It is used to open a device scope
    # where variables are defined.
    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices.
    if cluster_spec is None:
      # Local mode. If there is only one GPU, put everything on that GPU.
      # Otherwise, place variables on CPU.
      if num_gpus_per_worker == 1:
        assert len(list(self._compute_devices)) == 1
        self._variable_device = _LOCAL_GPU_0
        self._parameter_devices = [_LOCAL_GPU_0]
      else:
        self._variable_device = _LOCAL_CPU
        self._parameter_devices = [_LOCAL_CPU]
    else:
      # Distributed mode. Place variables on ps jobs in a round-robin fashion.
      # Note that devices returned from `replica_device_setter` are not
      # canonical and therefore we don't canonicalize all variable devices to
      # make them consistent.
      # TODO(yuefengz): support passing a strategy object to control variable
      # assignment.
      # TODO(yuefengz): merge the logic of replica_device_setter into this
      # class.
      num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
      if num_ps_replicas == 0:
        raise ValueError("The cluster spec needs to have `ps` jobs.")
      self._variable_device = device_setter.replica_device_setter(
          ps_tasks=num_ps_replicas,
          worker_device=self._worker_device,
          merge_devices=True,
          cluster=cluster_spec)

      # Parameter devices are all tasks of the "ps" job.
      self._parameter_devices = map("/job:ps/task:{}".format,
                                    range(num_ps_replicas))

    # Define the default device in cross-tower mode. In the distributed case, we
    # set the default device to the corresponding worker to prevent these ops
    # from being placed on other workers.
    if cluster_spec is None:
      self._default_device = None
    else:
      self._default_device = self._worker_device

    self._is_chief = cluster_spec is None or multi_worker_util.is_chief(
        cluster_spec, task_type, task_id)
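The round-robin parameter-server placement that `replica_device_setter` produces above can be sketched with the TF1 graph API; the addresses and variable names are placeholders:

import tensorflow.compat.v1 as tf1

cluster = tf1.train.ClusterSpec({
    "ps": ["127.0.0.1:1926", "127.0.0.1:3141"],
    "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
})
setter = tf1.train.replica_device_setter(
    ps_tasks=2, worker_device="/job:worker/task:0", cluster=cluster)

with tf1.Graph().as_default(), tf1.device(setter):
  # Variables alternate across ps tasks; ops stay on the worker device.
  v0 = tf1.get_variable("v0", shape=[1])
  v1 = tf1.get_variable("v1", shape=[1])
  print(v0.device)  # placed on /job:ps/task:0
  print(v1.device)  # placed on /job:ps/task:1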
Example #23
    def _initialize_devices(self, num_gpus_per_worker, cluster_spec, task_type,
                            task_id):
        """Initialize internal devices.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per tower.
    The variable device is a device function or device string. The default
    variable device assigns variables to parameter servers in a round-robin
    fashion.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations.
      task_type: the current task type.
      task_id: the current task id.

    Raises:
      ValueError: if the cluster_spec doesn't have ps jobs.
    """
        self._task_type = task_type or "worker"
        self._task_id = task_id or 0
        self._worker_device = "/job:%s/task:%d" % (self._task_type,
                                                   self._task_id)

        # TODO(yuefengz): maybe clearer to split it into two classes, one for
        # the distributed case and one for the local case, once we have the factory
        # class/method.

        # Define compute devices which is a list of device strings and one for each
        # tower. When there are GPUs, replicate operations on these GPUs. Otherwise,
        # place operations on CPU.
        if cluster_spec is None:
            # Local mode.
            if num_gpus_per_worker > 0:
                self._compute_devices = list(
                    map("/device:GPU:{}".format, range(num_gpus_per_worker)))
            else:
                self._compute_devices = [_LOCAL_CPU]
        else:
            # Distributed mode.
            if num_gpus_per_worker > 0:
                self._compute_devices = [
                    "%s/device:GPU:%d" % (self._worker_device, i)
                    for i in range(num_gpus_per_worker)
                ]
            else:
                self._compute_devices = [self._worker_device]

        self._compute_devices = list(
            map(device_util.resolve, self._compute_devices))
        self._canonical_compute_device_set = set(self._compute_devices)

        # Define variable device which is a device string in the local case and a
        # device function in the distributed case. It is used to open a device scope
        # where variables are defined.
        # The `_parameter_devices` is needed for the `parameter_devices` property
        # and is a list of all variable devices.
        if cluster_spec is None:
            # Local mode. If there is only one GPU, put everything on that GPU.
            # Otherwise, place variables on CPU.
            if num_gpus_per_worker == 1:
                assert len(list(self._compute_devices)) == 1
                self._variable_device = _LOCAL_GPU_0
                self._parameter_devices = [_LOCAL_GPU_0]
            else:
                self._variable_device = _LOCAL_CPU
                self._parameter_devices = [_LOCAL_CPU]
        else:
            # Distributed mode. Place variables on ps jobs in a round-robin fashion.
            # Note that devices returned from `replica_device_setter` are not
            # canonical and therefore we don't canonicalize all variable devices to
            # make them consistent.
            # TODO(yuefengz): support passing a strategy object to control variable
            # assignment.
            # TODO(yuefengz): merge the logic of replica_device_setter into this
            # class.
            num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
            if num_ps_replicas == 0:
                raise ValueError("The cluster spec needs to have `ps` jobs.")
            self._variable_device = device_setter.replica_device_setter(
                ps_tasks=num_ps_replicas,
                worker_device=self._worker_device,
                merge_devices=True,
                cluster=cluster_spec)

            # Parameter devices are all tasks of the "ps" job.
            self._parameter_devices = map("/job:ps/task:{}".format,
                                          range(num_ps_replicas))

        # Define the default device in cross-tower mode. In the distributed case, we
        # set the default device to the corresponding worker to prevent these ops
        # from being placed on other workers.
        if cluster_spec is None:
            self._default_device = None
        else:
            self._default_device = self._worker_device

        self._is_chief = cluster_spec is None or multi_worker_util.is_chief(
            cluster_spec, task_type, task_id)
Example #24
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                use_nccl_communication=(
                    self._communication ==
                    cross_device_ops_lib.CollectiveCommunication.NCCL),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = config_pb2.ConfigProto()
            config_proto = self._update_config_proto(config_proto)
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc")
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
            "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices, self._communication)
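These `_initialize_multi_worker` internals are normally reached through the public strategy constructor rather than called directly. A hedged sketch (TF 2.x API; the model definition is a placeholder):

import tensorflow as tf

# With TF_CONFIG set on each worker (see the earlier sketch), constructing the
# strategy performs a multi-worker collective setup like the one shown above.
strategy = tf.distribute.MultiWorkerMirroredStrategy()

with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  model.compile(optimizer="sgd", loss="mse")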
  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                               task_type, task_id):
    """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations.
      task_type: the current task type.
      task_id: the current task id.

    Raises:
      ValueError: if the cluster_spec doesn't have ps jobs.
    """
    assert cluster_spec
    if not task_type or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

    self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)

    # Define compute devices which is a list of device strings and one for each
    # replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on CPU.
    if num_gpus_per_worker > 0:
      self._compute_devices = [
          "%s/device:GPU:%d" % (self._worker_device, i)
          for i in range(num_gpus_per_worker)
      ]
    else:
      self._compute_devices = [self._worker_device]

    self._compute_devices = list(
        map(device_util.resolve, self._compute_devices))
    self._canonical_compute_device_set = set(self._compute_devices)

    # In distributed mode, place variables on ps jobs in a round-robin fashion.
    # Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
    if num_ps_replicas == 0:
      raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=num_ps_replicas,
        worker_device=self._worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices. Here parameter devices are all
    # tasks of the "ps" job.
    self._parameter_devices = map("/job:ps/task:{}".format,
                                  range(num_ps_replicas))

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = self._worker_device

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, compute_devices = %r, "
        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
        num_ps_replicas, self._is_chief, self._compute_devices,
        self._variable_device)
Example #26
    def _checkpoint_if_preempted(self):
        """Checkpoint if any worker has received a preemption signal.

    This function handles preemption signal reported by any worker in the
    cluster. The current implementation relies on the fact that all workers in a
    MultiWorkerMirroredStrategy training cluster have a step number difference of
    at most 1.
    - If the signal comes from the worker itself (i.e., where this failure
    handler sits), the worker will notify all peers to checkpoint after they
    finish CURRENT_STEP+1 steps, where CURRENT_STEP is the step this worker has
    just finished. And the worker will wait for all peers to acknowledge that
    they have received its preemption signal and the final-step number before
    the worker proceeds on training the final step.
    - If the signal comes from another member in the cluster but NO final-step
    info is available, proceed on training, because it will be available after
    finishing the next step.
    - If the signal comes from some other member in the cluster, and final-step
    info is available, if the worker has not finished these steps yet, keep
    training; otherwise, checkpoint and exit with a cluster-recognized restart
    code.
    """
        if self._final_checkpoint_countdown:
            run_count_config_key = _FINAL_RUN_COUNT_KEY

        else:
            run_count_config_key = _INITIAL_RUN_COUNT_KEY

        if self._received_checkpoint_step.is_set():

            run_count_key = context.context().get_config_key_value(
                run_count_config_key)

            if run_count_key == str(self._run_counter):
                self._save_checkpoint()

                if self._time_to_exit():
                    self._stop_poll_termination_signal_thread()
                    self._stop_cluster_wise_termination_watcher_thread()
                    if self._api_made_checkpoint_manager and (
                            not multi_worker_util.is_chief(
                                cluster_spec=self._cluster_resolver.
                                cluster_spec(),
                                task_type=self._cluster_resolver.task_type,
                                task_id=self._cluster_resolver.task_id)):
                        gfile.DeleteRecursively(
                            os.path.dirname(
                                self._write_checkpoint_manager.directory))
                    logging.info(
                        'PreemptionCheckpointHandler: checkpoint saved. Exiting.'
                    )

                    self._exit_fn()

                else:
                    logging.info('Continue training for the grace period.')
                    self._final_checkpoint_countdown = True
                    self._received_checkpoint_step.clear()

        elif self._received_own_sigterm.is_set():
            # Only the worker who gets termination signal first among the cluster
            # will enter this branch. The following will happen in chronological
            # order:
            # 1. The worker just receives a preemption signal and enters this branch
            # for the first time. It will set a step-to-checkpoint and let the cluster
            # know.
            # 2. If there is a long grace period, it will also set
            # _final_checkpoint_countdown, so that during this grace period, it will
            # re-enter this branch to check if grace period is ending.
            # 3. If it is, set a step-to-checkpoint key again.

            if self._final_checkpoint_countdown:
                if self._target_time_for_termination < time.time():
                    logging.info(
                        'Grace period almost ended. Final call to save a checkpoint!'
                    )
                else:
                    return

            step_to_save_at = str(self._run_counter + 1)

            logging.info(
                'Termination caught in main thread on preempted worker')
            context.context().set_config_key_value(run_count_config_key,
                                                   step_to_save_at)
            logging.info('%s set to %s', run_count_config_key, step_to_save_at)

            n_workers = multi_worker_util.worker_count(
                self._cluster_resolver.cluster_spec(),
                self._cluster_resolver.task_type)
            for i in range(n_workers):
                context.context().get_config_key_value(
                    f'{_ACKNOWLEDGE_KEY}_{run_count_config_key}_{i}')
                logging.info(
                    'Sigterm acknowledgement from replica %d received', i)

            self._setup_countdown_if_has_grace_period_and_not_already_counting_down(
            )
  def _initialize_multi_worker(self, cluster_resolver):
    """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      cluster_resolver: a descendant of `ClusterResolver` object.

    Raises:
      ValueError: if the cluster doesn't have ps jobs.
    """
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if isinstance(cluster_resolver, TFConfigClusterResolver):
      num_gpus = context.num_gpus()
    else:
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

    # Save the num_gpus_per_worker for configure method.
    self._num_gpus_per_worker = num_gpus

    cluster_spec = cluster_resolver.cluster_spec()
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if not task_type or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    assert cluster_spec.as_dict()

    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._input_host_device = numpy_dataset.SingleDevice(worker_device)

    # Define compute devices which is a list of device strings and one for each
    # replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on CPU.
    if num_gpus > 0:
      compute_devices = tuple(
          "%s/device:GPU:%d" % (worker_device, i) for i in range(num_gpus))
    else:
      compute_devices = (worker_device,)

    self._device_map = values.ReplicaDeviceMap(compute_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(worker_device, compute_devices)])

    # In distributed mode, place variables on ps jobs in a round-robin fashion.
    # Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
    if num_ps_replicas == 0:
      raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=num_ps_replicas,
        worker_device=worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices. Here parameter devices are all
    # tasks of the "ps" job.
    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
                                        range(num_ps_replicas)))

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = worker_device

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
        num_ps_replicas, self._is_chief, self._device_map,
        self._variable_device)
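
The docstring above describes round-robin placement of variables onto parameter servers. The following self-contained sketch, separate from the example and written against the `tf.compat.v1` API, shows `replica_device_setter` producing that placement; the addresses and variable names are placeholders.

import tensorflow.compat.v1 as tf1

cluster = tf1.train.ClusterSpec({
    "worker": ["localhost:2222"],
    "ps": ["localhost:2223", "localhost:2224"],
})
setter = tf1.train.replica_device_setter(
    ps_tasks=2,
    worker_device="/job:worker/task:0",
    merge_devices=True,
    cluster=cluster)

with tf1.Graph().as_default(), tf1.device(setter):
    v0 = tf1.Variable(0.0, name="v0")  # expected on /job:ps/task:0
    v1 = tf1.Variable(0.0, name="v1")  # expected on /job:ps/task:1
    v2 = tf1.Variable(0.0, name="v2")  # wraps around to /job:ps/task:0
    print(v0.device, v1.device, v2.device)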
Example #28
    def __init__(self, cluster_resolver, checkpoint, checkpoint_dir):
        """Creates the failure handler.

    Args:
      cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`. You
        may also get it through the `cluster_resolver` attribute of the
        strategy in use.
      checkpoint: a `tf.train.Checkpoint` that will be saved upon preemption and
        loaded upon restart by the `CoordinatedCheckpointManager` API
        automatically.
      checkpoint_dir: a directory for the `CoordinatedCheckpointManager` to play
        with checkpoints. `CoordinatedCheckpointManager` will create a
        `tf.train.CheckpointManager` to manage the passed-in `checkpoint`. Since
        only one `tf.train.CheckpointManager` should be active in a particular
        directory at a time, this `checkpoint_dir` arg should preferably be
        separated from where the user saves their checkpoint for non-fault
        tolerance purposes.
    """
        self._cluster_resolver = cluster_resolver
        self._checkpoint = checkpoint
        self._id_in_cluster = str(
            multi_worker_util.id_in_cluster(
                self._cluster_resolver.cluster_spec(),
                self._cluster_resolver.task_type,
                self._cluster_resolver.task_id))

        # The number of calls to `CoordinatedCheckpointManager.run` when the latest
        # checkpoint was saved.
        self._checkpointed_runs = variables.Variable(
            initial_value=constant_op.constant(0, dtype=dtypes.int64),
            trainable=False,
            name=_ITERATION_VARIABLE)
        if not hasattr(self._checkpoint, _ITERATION_VARIABLE):
            setattr(self._checkpoint, _ITERATION_VARIABLE,
                    self._checkpointed_runs)

        # Make CheckpointManagers. MultiWorkerMirroredStrategy requires different
        # setup on chief and on other workers.
        self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
            checkpoint, directory=checkpoint_dir, max_to_keep=1)
        if multi_worker_util.is_chief(
                cluster_spec=cluster_resolver.cluster_spec(),
                task_type=cluster_resolver.task_type,
                task_id=cluster_resolver.task_id):
            self._write_checkpoint_manager = self._read_checkpoint_manager
        else:
            self._write_checkpoint_manager = checkpoint_management.CheckpointManager(
                checkpoint,
                _mwms_write_checkpoint_dir(checkpoint_dir,
                                           cluster_resolver.task_type,
                                           cluster_resolver.task_id,
                                           cluster_resolver.cluster_spec()),
                max_to_keep=1)

        self._read_checkpoint_manager.restore_or_initialize()

        # An internal step counter that's restored to checkpointed_iterations when
        # training is restored. It increments by one every time
        # `CoordinatedCheckpointManager.run` is called. Note that in this case, the
        # user must pass a single-step training function to
        # `CoordinatedCheckpointManager.run` instead of a multiple-step one.
        self._run_counter = self._checkpointed_runs.numpy()

        # The worker itself has received the preemption signal.
        self._received_own_sigterm = threading.Event()

        # Some member (could be oneself) has received preemption signal, and the
        # step number to save a checkpoint has been aligned.
        self._received_sigterm_and_step = threading.Event()

        # When training is interrupted, we explicitly call the cleanup methods for
        # the thread watching for the local worker's termination signal and the
        # thread watching for cluster-wise information before we save a checkpoint
        # and exit. At the end of training, when no interruption is encountered, we
        # rely on __del__ to clean up. However, there is no guarantee when or
        # whether __del__ is executed, so we make the threads daemons so that they
        # do not prevent the program from exiting.
        self._cluster_wise_termination_watcher_thread = threading.Thread(
            target=self._wait_for_signal,
            name='PeerTerminationWatcher-%s' % self._id_in_cluster,
            daemon=True)
        self._cluster_wise_termination_watcher_thread.start()

        self._poll_gce_signal_thread = None
        self._platform_device = gce_util.detect_platform()
        if self._platform_device is gce_util.PlatformDevice.GCE_GPU:
            self._start_polling_for_gce_signal()
            self._exit_code = gce_util._RESTARTABLE_EXIT_CODE
        elif self._platform_device is gce_util.PlatformDevice.INTERNAL:
            self._start_watching_for_signal()
            self._exit_code = _RESTARTABLE_EXIT_CODE
        else:
            raise NotImplementedError(
                'CoordinatedCheckpointManager is only supported'
                ' for MultiWorkerMirroredStrategy with GPU.')
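
The chief/non-chief branch above gives non-chief workers their own write directory via `_mwms_write_checkpoint_dir`, whose definition is not shown here. A hypothetical helper along these lines could look as follows; it illustrates the pattern only and is not the actual TensorFlow implementation.

import os

from tensorflow.python.distribute import multi_worker_util

def example_write_checkpoint_dir(checkpoint_dir, task_type, task_id, cluster_spec):
    """Hypothetical: chief writes to checkpoint_dir, others to a scratch subdir."""
    if multi_worker_util.is_chief(cluster_spec, task_type, task_id):
        return checkpoint_dir
    # Non-chief workers still have to save (saving under
    # MultiWorkerMirroredStrategy is collective), but their copies are throwaway
    # and must not clobber the chief's checkpoint files.
    return os.path.join(checkpoint_dir, 'workertemp_%d' % task_id)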
Example #29
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id
        self._id_in_cluster = multi_worker_util.id_in_cluster(
            self._cluster_spec, self._task_type, self._task_id)

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True
            if context.context().coordination_service is None:
                coordinated_jobs = ["chief", "worker"]
                if task_type in coordinated_jobs:
                    context.context().configure_coordination_service(
                        service_type="standalone",
                        service_leader=multi_worker_util.coordination_leader(
                            cluster_spec),
                        coordinated_jobs=coordinated_jobs)

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = copy.deepcopy(context.context().config)
            config_proto = self._update_config_proto(config_proto)

            # If coordination service is enabled, use its internal heartbeat to detect
            # peer failures instead of the Python-level health check.
            if config_proto.experimental.coordination_config.service_type:
                self._enable_check_health = False

            if hasattr(cluster_resolver, "port"):
                port = cluster_resolver.port
            else:
                port = 0
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc",
                port=port)
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        local_devices, local_device_type = self._initialize_local_devices(
            cluster_resolver, self._worker_device)
        if local_device_type == "TPU":
            tpu_strategy_util.initialize_tpu_system()

        self._collective_keys = cross_device_utils.CollectiveKeys(
            group_key_start=1 + self._collective_key_base)
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=local_devices,
            group_size=len(local_devices) * self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        # CrossDeviceOps for per host tensors.
        self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=[self._worker_device],
            group_size=self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        super(CollectiveAllReduceExtended,
              self)._initialize_single_worker(local_devices)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_devices_per_worker and rpc_layer for configure method.
        self._num_devices_per_worker = len(local_devices)
        self._local_device_type = local_device_type
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        if self._enable_check_health and context.executing_eagerly():
            self._start_check_health_thread()
        else:
            logging.info("Check health not enabled.")

        logging.info(
            "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
            "task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices,
            self._communication_options.implementation)
  def _initialize_multi_worker(self, cluster_resolver):
    """Initializes the object for multi-worker training."""
    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
    # assumes all workers have the same number of GPUs. We should remove this
    # assumption by querying all tasks for their numbers of GPUs.
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if isinstance(cluster_resolver, TFConfigClusterResolver):
      num_gpus = context.num_gpus()
    else:
      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

    cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_resolver.cluster_spec())
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`.")

    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
      raise ValueError("No `worker`, `chief` or `evaluator` tasks can be found "
                       "in `cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
    if num_gpus:
      local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                            for i in range(num_gpus))
    else:
      local_devices = (self._worker_device,)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(self._worker_device, self.worker_devices)])
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    # Save the num_gpus_per_worker and rpc_layer for configure method.
    self._num_gpus_per_worker = num_gpus
    self._rpc_layer = cluster_resolver.rpc_layer

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
        "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
        "communication = %s", cluster_spec.as_dict(), task_type,
        task_id, self._num_workers, local_devices,
        self._communication)

    if (context.executing_eagerly() and
        not getattr(self, "_std_server_started", False) and
        not getattr(self, "_local_or_standalone_client_mode", False)):
      # Checking _local_or_standalone_client_mode as well because we should not
      # create the std server in standalone client mode.
      config_proto = config_pb2.ConfigProto()
      config_proto = self._update_config_proto(config_proto)
      server_def = tensorflow_server_pb2.ServerDef(
          cluster=cluster_spec.as_cluster_def(),
          default_session_config=config_proto,
          job_name=task_type,
          task_index=task_id,
          protocol=cluster_resolver.rpc_layer or "grpc")
      context.context().enable_collective_ops(server_def)
      self._std_server_started = True
      logging.info(
          "Enabled multi-worker collective ops with available devices: %r",
          context.context().devices())
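
Both versions of `_initialize_multi_worker` above obtain `cluster_spec`, `task_type` and `task_id` from a cluster resolver. When `TFConfigClusterResolver` is used, those values come from the `TF_CONFIG` environment variable; a minimal sketch with placeholder host addresses is shown below.

import json
import os

import tensorflow as tf

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "chief": ["host0:2222"],
        "worker": ["host1:2222", "host2:2222"],
    },
    "task": {"type": "worker", "index": 0},
})

resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
print(resolver.task_type, resolver.task_id)    # worker 0
print(resolver.cluster_spec().as_dict())       # the "cluster" dict above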
Example #31
    def __init__(self,
                 cluster_resolver,
                 checkpoint,
                 checkpoint_dir,
                 termination_config=TerminationConfig()):
        """Creates the failure handler.

    Args:
      cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`. You
        may also get it through the `cluster_resolver` attribute of the strategy
        in use.
      checkpoint: a `tf.train.Checkpoint` that will be saved upon preemption and
        loaded upon restart by the `WorkerPreemptionHandler` API automatically.
      checkpoint_dir: a directory for the `WorkerPreemptionHandler` to play with
        checkpoints. `WorkerPreemptionHandler` will create a
        `tf.train.CheckpointManager` to manage the passed-in `checkpoint`. Since
        only one `tf.train.CheckpointManager` should be active in a particular
        directory at a time, this `checkpoint_dir` arg should preferably be
        separated from where the user saves their checkpoint for non-fault
        tolerance purposes.
      termination_config: a `TerminationConfig` object to configure for a
        platform other than Google Borg or GCP.
    """
        self._cluster_resolver = cluster_resolver
        self._checkpoint = checkpoint
        self._id_in_cluster = str(
            multi_worker_util.id_in_cluster(
                self._cluster_resolver.cluster_spec(),
                self._cluster_resolver.task_type,
                self._cluster_resolver.task_id))

        # The number of calls to `WorkerPreemptionHandler.run` when the latest
        # checkpoint was saved.
        self._checkpointed_runs = variables.Variable(
            initial_value=constant_op.constant(0, dtype=dtypes.int64),
            trainable=False,
            name=_ITERATION_VARIABLE)
        if not hasattr(self._checkpoint, _ITERATION_VARIABLE):
            setattr(self._checkpoint, _ITERATION_VARIABLE,
                    self._checkpointed_runs)

        # Make CheckpointManagers. MultiWorkerMirroredStrategy requires different
        # setup on chief and on other workers.
        self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
            checkpoint, directory=checkpoint_dir, max_to_keep=1)
        if multi_worker_util.is_chief(
                cluster_spec=cluster_resolver.cluster_spec(),
                task_type=cluster_resolver.task_type,
                task_id=cluster_resolver.task_id):
            self._write_checkpoint_manager = self._read_checkpoint_manager
        else:
            self._write_checkpoint_manager = checkpoint_management.CheckpointManager(
                checkpoint,
                _mwms_write_checkpoint_dir(checkpoint_dir,
                                           cluster_resolver.task_type,
                                           cluster_resolver.task_id,
                                           cluster_resolver.cluster_spec()),
                max_to_keep=1)

        self._read_checkpoint_manager.restore_or_initialize()

        # Grace-period countdown flag. Set to True on every worker once it finishes
        # timing the checkpoint save. After entering this phase, new
        # preemption/maintenance notices are not handled, since the whole cluster
        # goes down when the worker that first initiated the grace period goes down.
        self._final_checkpoint_countdown = False

        self._estimated_run_time = 0

        # An internal step counter that's restored to checkpointed_iterations when
        # training is restored. It increments by one every time
        # `WorkerPreemptionHandler.run` is called. Note that in this case, the
        # user must pass a single-step training function to
        # `WorkerPreemptionHandler.run` instead of a multiple-step one.
        self._run_counter = self._checkpointed_runs.numpy()

        # The worker itself has received the preemption signal.
        self._received_own_sigterm = threading.Event()

        # Some member (could be oneself) has received preemption signal, and the
        # step number to save a checkpoint has been aligned.
        self._received_checkpoint_step = threading.Event()

        self._platform_device = gce_util.detect_platform()

        completed_termination_config = _complete_config_for_environement(
            self._platform_device, termination_config)
        self._termination_watcher_function = completed_termination_config.termination_watcher_function
        self._exit_fn = completed_termination_config.exit_fn
        self._grace_period = completed_termination_config.time_till_termination

        # When training is interrupted, we explicitly call the cleanup methods for
        # the thread watching for the local worker's termination signal and the
        # thread watching for cluster-wise information before we save a checkpoint
        # and exit. At the end of training, when no interruption is encountered, we
        # rely on __del__ to clean up. However, there is no guarantee when or
        # whether __del__ is executed, so we make the threads daemons so that they
        # do not prevent the program from exiting.
        self._cluster_wise_termination_watcher_thread = threading.Thread(
            target=self._watch_step_to_save_key,
            name='PeerTerminationWatcher-%s' % self._id_in_cluster,
            daemon=True)
        logging.info('Start watcher for peer\'s signal.')
        self._cluster_wise_termination_watcher_thread.start()

        self._poll_termination_signal_thread = None

        if completed_termination_config.termination_watcher_function:
            self._start_polling_for_termination_signal()
        else:
            self._start_watching_for_signal()
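
The constructor above accepts a `termination_config` for platforms other than Borg or GCP. A hedged sketch of such a configuration, written against the public `tf.distribute.experimental.TerminationConfig` API (whose field names may differ from the internal attributes referenced above), might look like this; the watcher and exit behavior are purely illustrative.

import sys

import tensorflow as tf

def my_termination_watcher():
    # Should return True once the scheduler signals that this worker will be
    # preempted soon (e.g. by polling a file or a metadata endpoint).
    return False

termination_config = tf.distribute.experimental.TerminationConfig(
    termination_watcher_fn=my_termination_watcher,
    exit_fn=lambda: sys.exit(42),  # exit code the job manager treats as restartable
    grace_period=120)              # seconds between the notice and termination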
Example #32
    def __init__(self, cluster_resolver, checkpoint, checkpoint_dir):
        """Creates the failure handler.

    Args:
      cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`. You
        may also get it through the `cluster_resolver` attribute of the
        strategy in use.
      checkpoint: a `tf.train.Checkpoint` that will be saved upon preemption and
        loaded upon restart by the `CoordinatedCheckpointManager` API
        automatically.
      checkpoint_dir: a directory for the `CoordinatedCheckpointManager` to play
        with checkpoints. `CoordinatedCheckpointManager` will create a
        `tf.train.CheckpointManager` to manage the passed-in `checkpoint`. Since
        only one `tf.train.CheckpointManager` should be active in a particular
        directory at a time, this `checkpoint_dir` arg should preferably be
        separated from where the user saves their checkpoint for non-fault
        tolerance purposes.
    """
        self._cluster_resolver = cluster_resolver
        self._checkpoint = checkpoint
        self._id_in_cluster = str(
            multi_worker_util.id_in_cluster(
                self._cluster_resolver.cluster_spec(),
                self._cluster_resolver.task_type,
                self._cluster_resolver.task_id))

        # The number of calls to `CoordinatedCheckpointManager.run` when the latest
        # checkpoint was saved.
        self._checkpointed_runs = variables.Variable(
            initial_value=constant_op.constant(0, dtype=dtypes.int64),
            trainable=False,
            name=_ITERATION_VARIABLE)
        if not hasattr(self._checkpoint, _ITERATION_VARIABLE):
            setattr(self._checkpoint, _ITERATION_VARIABLE,
                    self._checkpointed_runs)

        # Make CheckpointManagers. MultiWorkerMirroredStrategy requires different
        # setup on chief and on other workers.
        self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
            checkpoint, directory=checkpoint_dir, max_to_keep=1)
        if multi_worker_util.is_chief(
                cluster_spec=cluster_resolver.cluster_spec(),
                task_type=cluster_resolver.task_type,
                task_id=cluster_resolver.task_id):
            self._write_checkpoint_manager = self._read_checkpoint_manager
        else:
            self._write_checkpoint_manager = checkpoint_management.CheckpointManager(
                checkpoint,
                _mwms_write_checkpoint_dir(checkpoint_dir,
                                           cluster_resolver.task_type,
                                           cluster_resolver.task_id,
                                           cluster_resolver.cluster_spec()),
                max_to_keep=1)

        self._read_checkpoint_manager.restore_or_initialize()

        # An internal step counter that's restored to checkpointed_iterations when
        # training is restored. It increments by one every time
        # `CoordinatedCheckpointManager.run` is called. Note that in this case, the
        # user must pass a single-step training function to
        # `CoordinatedCheckpointManager.run` instead of a multiple-step one.
        self._run_counter = self._checkpointed_runs.numpy()

        # The worker itself has received the preemption signal.
        self._received_own_sigterm = threading.Event()

        # Some member (could be oneself) has received preemption signal, and the
        # step number to save a checkpoint has been aligned.
        self._received_sigterm_and_step = threading.Event()

        # TODO(wxinyi): Enforce that only one instance of this class is created
        # per program.
        # TODO(wxinyi): make the thread non-daemon.
        threading.Thread(target=self._wait_for_signal, daemon=True).start()
        signal.signal(signal.SIGTERM, self._sigterm_handler_fn)
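
This variant registers a SIGTERM handler directly instead of polling a platform signal. The standalone sketch below shows the same pattern in isolation: the handler only records the signal in a `threading.Event`, and the heavy work is deferred to the training loop. All names are illustrative.

import signal
import threading

received_sigterm = threading.Event()

def _sigterm_handler(signum, frame):
    # Keep the handler minimal; saving a checkpoint inside a signal handler is
    # not safe, so only record that the signal arrived.
    received_sigterm.set()

signal.signal(signal.SIGTERM, _sigterm_handler)

# Inside the training loop, after each step:
#   if received_sigterm.is_set():
#       save_checkpoint_and_exit()   # hypothetical helper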
Example #33
    def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                                 task_type, task_id):
        """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations.
      task_type: the current task type.
      task_id: the current task id.

    Raises:
      ValueError: if the cluster_spec doesn't have ps jobs.
    """
        assert cluster_spec
        if not task_type or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

        self._worker_device = "/job:%s/task:%d" % (self._task_type,
                                                   self._task_id)

        # Define compute devices, which is a list of device strings, one for each
        # replica. When there are GPUs, replicate operations on these GPUs.
        # Otherwise, place operations on CPU.
        if num_gpus_per_worker > 0:
            self._compute_devices = [
                "%s/device:GPU:%d" % (self._worker_device, i)
                for i in range(num_gpus_per_worker)
            ]
        else:
            self._compute_devices = [self._worker_device]

        self._compute_devices = list(
            map(device_util.resolve, self._compute_devices))
        self._canonical_compute_device_set = set(self._compute_devices)

        # In distributed mode, place variables on ps jobs in a round-robin fashion.
        # Note that devices returned from `replica_device_setter` are not
        # canonical and therefore we don't canonicalize all variable devices to
        # make them consistent.
        # TODO(yuefengz): support passing a strategy object to control variable
        # assignment.
        # TODO(yuefengz): merge the logic of replica_device_setter into this
        # class.
        num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
        if num_ps_replicas == 0:
            raise ValueError("The cluster spec needs to have `ps` jobs.")
        self._variable_device = device_setter.replica_device_setter(
            ps_tasks=num_ps_replicas,
            worker_device=self._worker_device,
            merge_devices=True,
            cluster=cluster_spec)

        # The `_parameter_devices` is needed for the `parameter_devices` property
        # and is a list of all variable devices. Here parameter devices are all
        # tasks of the "ps" job.
        self._parameter_devices = tuple(
            map("/job:ps/task:{}".format, range(num_ps_replicas)))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = self._worker_device

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker ParameterServerStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_ps_replicas = %r, is_chief = %r, compute_devices = %r, "
            "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
            num_ps_replicas, self._is_chief, self._compute_devices,
            self._variable_device)
    def _initialize_multi_worker(self, cluster_resolver):
        """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      cluster_resolver: a descendant of `ClusterResolver` object.

    Raises:
      ValueError: if the cluster doesn't have ps jobs.
    """
        num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
        cluster_spec = cluster_resolver.cluster_spec()
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if not task_type or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        assert cluster_spec.as_dict()

        worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._input_host_device = numpy_dataset.SingleDevice(worker_device)

        # Define compute devices, which is a list of device strings, one for each
        # replica. When there are GPUs, replicate operations on these GPUs.
        # Otherwise, place operations on CPU.
        if num_gpus > 0:
            compute_devices = tuple("%s/device:GPU:%d" % (worker_device, i)
                                    for i in range(num_gpus))
        else:
            compute_devices = (worker_device, )

        self._device_map = values.ReplicaDeviceMap(compute_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(worker_device, compute_devices)])

        # In distributed mode, place variables on ps jobs in a round-robin fashion.
        # Note that devices returned from `replica_device_setter` are not
        # canonical and therefore we don't canonicalize all variable devices to
        # make them consistent.
        # TODO(yuefengz): support passing a strategy object to control variable
        # assignment.
        # TODO(yuefengz): merge the logic of replica_device_setter into this
        # class.
        num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
        if num_ps_replicas == 0:
            raise ValueError("The cluster spec needs to have `ps` jobs.")
        self._variable_device = device_setter.replica_device_setter(
            ps_tasks=num_ps_replicas,
            worker_device=worker_device,
            merge_devices=True,
            cluster=cluster_spec)

        # The `_parameter_devices` is needed for the `parameter_devices` property
        # and is a list of all variable devices. Here parameter devices are all
        # tasks of the "ps" job.
        self._parameter_devices = tuple(
            map("/job:ps/task:{}".format, range(num_ps_replicas)))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = worker_device

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker ParameterServerStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
            "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
            num_ps_replicas, self._is_chief, self._device_map,
            self._variable_device)
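
Note that `cluster_resolver.num_accelerators()` returns a mapping from device type to count, which is why these snippets take `.get("GPU", 0)` before comparing against an integer. A minimal sketch using an illustrative `SimpleClusterResolver`:

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
    tf.train.ClusterSpec({
        "worker": ["localhost:2222"],
        "ps": ["localhost:2223"],
    }),
    task_type="worker",
    task_id=0,
    num_accelerators={"GPU": 2})

accelerators = resolver.num_accelerators()   # {"GPU": 2}
num_gpus = accelerators.get("GPU", 0)        # 2; falls back to 0 when absent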
Example #35
    def __init__(self,
                 cluster_resolver,
                 checkpoint_or_checkpoint_manager,
                 checkpoint_dir=None,
                 termination_config=None):
        """Creates the `PreemptionCheckpointHandler`.

    Args:
      cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`
        object. You may also obtain it through the `cluster_resolver` attribute
        of the distribution strategy in use.
      checkpoint_or_checkpoint_manager: a `tf.train.CheckpointManager` or a
        `tf.train.Checkpoint`. If you are using a `tf.train.CheckpointManager`
        to manage checkpoints outside the `PreemptionCheckpointHandler` for
        backup purpose as well, pass it as `checkpoint_or_checkpoint_manager`
        argument. Otherwise, pass a `tf.train.Checkpoint` and the
        `PreemptionCheckpointHandler` will create
        a `tf.train.CheckpointManager` to manage it in the `checkpoint_dir`.
      checkpoint_dir: a directory where the `PreemptionCheckpointHandler` saves
        and restores checkpoints. When a `PreemptionCheckpointHandler` is
        created, the latest checkpoint in the `checkpoint_dir` will be restored.
        (This is not needed if a `tf.train.CheckpointManager` instead of a
        `tf.train.Checkpoint` is passed as the
        `checkpoint_or_checkpoint_manager` argument.)
      termination_config: optional, a
        `tf.distribute.experimental.TerminationConfig` object to configure for a
        platform other than Google Borg or GCP.
    """
        self._cluster_resolver = cluster_resolver

        if not cluster_resolver.cluster_spec().jobs:
            # For local-mode MultiWorkerMirroredStrategy, an empty cluster spec is
            # passed, and the coordination service is neither enabled nor needed
            # (since it is used for cross-worker communication). Thus we directly set
            # the worker id and is_chief properties and also skip the logic that
            # uploads to / reads from the coordination service.
            self._local_mode = True
            self._id_in_cluster = 'single_worker'
            self._is_chief = True
        else:
            self._local_mode = False
            self._id_in_cluster = str(
                multi_worker_util.id_in_cluster(
                    self._cluster_resolver.cluster_spec(),
                    self._cluster_resolver.task_type,
                    self._cluster_resolver.task_id))
            self._is_chief = multi_worker_util.is_chief(
                cluster_spec=cluster_resolver.cluster_spec(),
                task_type=cluster_resolver.task_type,
                task_id=cluster_resolver.task_id)
        if isinstance(checkpoint_or_checkpoint_manager,
                      checkpoint_lib.Checkpoint) and not checkpoint_dir:
            raise errors.InvalidArgumentError(
                'When a checkpoint is passed, a checkpoint_dir must be '
                'passed as well.')

        # The number of calls to `PreemptionCheckpointHandler.run` when the latest
        # checkpoint was saved.
        self._checkpointed_runs = variables.Variable(
            initial_value=constant_op.constant(0, dtype=dtypes.int64),
            trainable=False,
            name=_ITERATION_VARIABLE)

        self._maybe_create_checkpoint_manager(checkpoint_or_checkpoint_manager,
                                              checkpoint_dir, cluster_resolver)

        if not hasattr(self._write_checkpoint_manager._checkpoint,
                       _ITERATION_VARIABLE):
            setattr(self._write_checkpoint_manager._checkpoint,
                    _ITERATION_VARIABLE, self._checkpointed_runs)

        if not hasattr(self._read_checkpoint_manager._checkpoint,
                       _ITERATION_VARIABLE):
            setattr(self._read_checkpoint_manager._checkpoint,
                    _ITERATION_VARIABLE, self._checkpointed_runs)

        self._read_checkpoint_manager.restore_or_initialize()

        # Grace-period countdown flag. Set to True on every worker once it finishes
        # timing the checkpoint save. After entering this phase, new
        # preemption/maintenance notices are not handled, since the whole cluster
        # goes down when the worker that first initiated the grace period goes down.
        self._final_checkpoint_countdown = False

        self._estimated_run_time = 0

        # An internal step counter that's restored to checkpointed_iterations when
        # training is restored. It increments by one every time
        # `PreemptionCheckpointHandler.run` is called. Note that in this case, the
        # user must pass a single-step training function to
        # `PreemptionCheckpointHandler.run` instead of a multiple-step one.
        self._run_counter = self._checkpointed_runs.numpy()

        # The worker itself has received the preemption signal.
        self._received_own_sigterm = threading.Event()

        # Some member (could be oneself) has received preemption signal, and the
        # step number to save a checkpoint has been aligned.
        self._received_checkpoint_step = threading.Event()

        self._platform_device = gce_util.detect_platform()

        if self._platform_device in (gce_util.PlatformDevice.GCE_TPU,
                                     gce_util.PlatformDevice.GCE_CPU):
            # While running MultiWorkerMirroredStrategy training with GPUs and with
            # CPUs is the same on Borg, GCE CPU VMs and GPU VMs differ in terms
            # of live migration, grace period, etc. We can make it work upon request.
            raise NotImplementedError(
                'PreemptionCheckpointHandler does not support '
                'training with TPU or CPU device on GCP.')

        completed_termination_config = _complete_config_for_environment(
            self._platform_device, termination_config)
        self._termination_watcher_fn = completed_termination_config.termination_watcher_fn
        self._exit_fn = completed_termination_config.exit_fn
        self._grace_period = completed_termination_config.grace_period

        if not self._local_mode:
            # When training is interrupted, we explicitly call the cleanup methods
            # for the thread watching for the local worker's termination signal and
            # the thread watching for cluster-wise information before we save a
            # checkpoint and exit. At the end of training, when no interruption is
            # encountered, we rely on __del__ to clean up. However, there is no
            # guarantee when or whether __del__ is executed, so we make the threads
            # daemons so that they do not prevent the program from exiting.
            self._cluster_wise_termination_watcher_thread = threading.Thread(
                target=self._watch_step_to_save_key,
                name='PeerTerminationWatcher-%s' % self._id_in_cluster,
                daemon=True)
            logging.info('Start watcher for peer\'s signal.')
            self._cluster_wise_termination_watcher_thread.start()

        else:
            self._cluster_wise_termination_watcher_thread = None

        self._poll_termination_signal_thread = None

        if completed_termination_config.termination_watcher_fn:
            self._start_polling_for_termination_signal()
        else:
            self._start_watching_for_signal()
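
Finally, a hedged end-to-end sketch of how a handler like the one above is typically driven from a training loop, written against the public `tf.distribute.experimental.PreemptionCheckpointHandler` API; the strategy setup, checkpoint contents, and directory are placeholders.

import tensorflow as tf

strategy = tf.distribute.MultiWorkerMirroredStrategy()  # expects TF_CONFIG to be set
with strategy.scope():
    step_counter = tf.Variable(0, dtype=tf.int64)

checkpoint = tf.train.Checkpoint(step_counter=step_counter)
handler = tf.distribute.experimental.PreemptionCheckpointHandler(
    strategy.cluster_resolver, checkpoint, checkpoint_dir='/tmp/ckpt_dir')

@tf.function
def train_step():
    def step_fn():
        # A real forward/backward pass would go here.
        return tf.constant(1.0)
    return strategy.run(step_fn)

# `handler.run` expects a *single-step* train function so that, upon a
# preemption notice, all workers can agree on the step at which to checkpoint.
for _ in range(100):
    handler.run(train_step)
    step_counter.assign_add(1)  # cross-replica update, saved with the checkpoint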