def _initialize_local(self, cluster_resolver):
    """Initializes the object for local training."""
    self._is_chief = True
    self._num_workers = 1

    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
    if num_gpus:
      local_devices = tuple("/device:GPU:%d" % i for i in range(num_gpus))
    else:
      local_devices = ("/device:CPU:0",)
    self._worker_device = device_util.canonicalize("/device:CPU:0")
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                 local_devices)
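A minimal sketch of how this local path is reached from the public API, assuming a recent TF 2.x release where the strategy is exported as tf.distribute.MultiWorkerMirroredStrategy and no TF_CONFIG is set:

import tensorflow as tf

# With no TF_CONFIG in the environment the cluster resolver reports an empty
# cluster, so the strategy follows the local-initialization path shown above.
strategy = tf.distribute.MultiWorkerMirroredStrategy()
print(strategy.extended.worker_devices)  # local GPU devices, or the CPU if none

with strategy.scope():
    v = tf.Variable(1.0)  # created once and placed on the local devices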
Example #2
 def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
   collective_keys = cross_device_utils.CollectiveKeys(
       group_key_start=10 * num_gpus +
       MultiWorkerCollectiveAllReduceTest.collective_key_base,
       instance_key_start=num_gpus * 100 +
       MultiWorkerCollectiveAllReduceTest.collective_key_base,
       instance_key_with_id_start=num_gpus * 10000 +
       MultiWorkerCollectiveAllReduceTest.collective_key_base)
   if local_mode:
     collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
         1, num_gpus, collective_keys=collective_keys)
     if num_gpus:
       devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
     else:
       devices = ["/device:CPU:0"]
     return collective_all_reduce_ops, devices, ""
   else:
     collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
         3, num_gpus, collective_keys=collective_keys)
     if num_gpus:
       devices = [
           "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
           for i in range(num_gpus)
       ]
     else:
       devices = ["/job:%s/task:%d" % (task_type, task_id)]
     return (collective_all_reduce_ops, devices,
             "grpc://" + self._cluster_spec[task_type][task_id])
Example #3
    def _initialize_local(self, cluster_resolver):
        """Initializes the object for local training."""
        self._is_chief = True
        self._num_workers = 1

        if ops.executing_eagerly_outside_functions():
            try:
                context.context().configure_collective_ops(
                    scoped_allocator_enabled_ops=("CollectiveReduce", ),
                    use_nccl_communication=(
                        self._communication ==
                        cross_device_ops_lib.CollectiveCommunication.NCCL))
            except RuntimeError:
                logging.warning(
                    "Collective ops is not configured at program startup. "
                    "Some performance features may not be enabled.")
            self._collective_ops_configured = True

        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if num_gpus:
            local_devices = tuple("/device:GPU:%d" % i
                                  for i in range(num_gpus))
        else:
            local_devices = ("/device:CPU:0", )
        self._worker_device = device_util.canonicalize("/device:CPU:0")
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        self._cluster_spec = None
        self._task_type = None
        self._task_id = None

        # This is a mark to tell whether we are running with a standalone client or
        # an independent worker. Right now with a standalone client, the strategy
        # object is created as a local strategy and then turned into a multi-worker
        # strategy via the configure call.
        self._local_or_standalone_client_mode = True

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                     local_devices)
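The special-casing of TFConfigClusterResolver above works around b/126786766; the resolver itself is public and can be inspected directly. A hedged sketch, assuming TF_CONFIG is unset so the reported cluster is empty:

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
print(resolver.cluster_spec().as_dict())  # {} when TF_CONFIG is unset
print(resolver.num_accelerators())        # accelerator type -> count; may be empty on CPU-only hosts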
Example #4
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id` in the `cluster_resolver`.")
        if task_type not in ("chief", "worker"):
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)
        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
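In practice this multi-worker path is usually driven by a TF_CONFIG environment variable set per process; a minimal sketch with hypothetical host addresses (each worker sets its own index, and with no chief job worker 0 is treated as chief):

import json
import os

import tensorflow as tf

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["host1:12345", "host2:12345"]},  # hypothetical hosts
    "task": {"type": "worker", "index": 0},
})
# Constructing the strategy reads TF_CONFIG, starts this task's collective
# server, and initializes the multi-worker state shown above.
strategy = tf.distribute.MultiWorkerMirroredStrategy()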
Example #5
    def _initialize_local_worker(self, container_strategy,
                                 num_gpus_per_worker):
        """Initializes the object for local training."""
        self._is_chief = True
        self._num_workers = 1

        if num_gpus_per_worker:
            local_devices = [
                "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
            ]
        else:
            local_devices = ["/device:CPU:0"]

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended, self).__init__(
            container_strategy,
            devices=local_devices,
            cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
                num_workers=1,
                num_gpus_per_worker=num_gpus_per_worker,
                collective_keys=self._collective_keys))

        self._cluster_spec = None
        self._task_type = None
        self._task_id = None

        logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                     local_devices)
Example #6
 def _make_collective_ops(self, devices):
   self._collective_keys = cross_device_utils.CollectiveKeys(
       group_key_start=1 + self._collective_key_base)  # pylint: disable=protected-access
   return cross_device_ops_lib.CollectiveAllReduce(
       devices=self._devices,
       group_size=len(self._devices),
       collective_keys=self._collective_keys)
Example #7
  def _initialize_local(self, cluster_resolver):
    """Initializes the object for local training."""
    self._is_chief = True
    self._num_workers = 1

    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
    if num_gpus:
      local_devices = tuple("/device:GPU:%d" % i for i in range(num_gpus))
    else:
      local_devices = ("/device:CPU:0",)
    self._worker_device = device_util.canonicalize("/device:CPU:0")
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    self._cluster_spec = None
    self._task_type = None
    self._task_id = None

    # This is a mark to tell whether we are running with a standalone client or
    # an independent worker. Right now with a standalone client, the strategy
    # object is created as a local strategy and then turned into a multi-worker
    # strategy via the configure call.
    self._local_or_standalone_client_mode = True

    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                 local_devices)
Example #8
 def _get_test_object(self, task_type, task_id, num_gpus=0):
   distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
       num_gpus_per_worker=num_gpus)
   session_config = config_pb2.ConfigProto()
   if task_type and task_id is not None:
     distribution.configure(
         session_config=session_config,
         cluster_spec=self._cluster_spec,
         task_type=task_type,
         task_id=task_id)
   collective_keys = cross_device_utils.CollectiveKeys(
       group_key_start=10 * num_gpus +
       CollectiveAllReduceStrategyTestBase.collective_key_base,
       instance_key_start=num_gpus * 100 +
       CollectiveAllReduceStrategyTestBase.collective_key_base,
       instance_key_with_id_start=num_gpus * 10000 +
       CollectiveAllReduceStrategyTestBase.collective_key_base)
   distribution.extended._collective_keys = collective_keys
   distribution.extended._cross_device_ops._collective_keys = (
       collective_keys)
   if task_type and task_id is not None:
     return distribution, 'grpc://' + self._cluster_spec[task_type][
         task_id], session_config
   else:
     return distribution, '', session_config
Example #9
    def _initialize_local_worker(self, num_gpus_per_worker):
        """Initializes the object for local training."""
        self._is_chief = True
        self._num_workers = 1

        if num_gpus_per_worker:
            local_devices = tuple("/device:GPU:%d" % i
                                  for i in range(num_gpus_per_worker))
        else:
            local_devices = ("/device:CPU:0", )
        self._worker_device = device_util.canonicalize("/device:CPU:0")

        self._collective_keys = cross_device_utils.CollectiveKeys()
        self._initialize_local(local_devices)
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus_per_worker,
            collective_keys=self._collective_keys)

        self._cluster_spec = None
        self._task_type = None
        self._task_id = None

        logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                     local_devices)
Example #10
 def _make_collective_ops(self, devices):
     self._collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=1 + self._collective_key_base)
     return cross_device_ops_lib.CollectiveAllReduce(
         devices=self._devices,
         group_size=len(self._devices),
         options=self._communication_options,
         collective_keys=self._collective_keys)
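The options argument threaded through here corresponds to the public CommunicationOptions; a hedged sketch of choosing NCCL through the public constructor, assuming the TF 2.4+ API:

import tensorflow as tf

options = tf.distribute.experimental.CommunicationOptions(
    implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)
strategy = tf.distribute.MultiWorkerMirroredStrategy(communication_options=options)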
Example #11
    def _get_test_objects(self,
                          task_type,
                          task_id,
                          num_gpus=0,
                          use_strategy_object=False,
                          local_mode=False):
        collective_keys = cross_device_utils.CollectiveKeys(
            group_key_start=10 * num_gpus +
            CollectiveAllReduceTest.collective_key_base,
            instance_key_start=num_gpus * 100 +
            CollectiveAllReduceTest.collective_key_base,
            instance_key_with_id_start=num_gpus * 10000 +
            CollectiveAllReduceTest.collective_key_base)
        if local_mode:
            if num_gpus:
                devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
            else:
                devices = ["/device:CPU:0"]

            if use_strategy_object:
                # Still using contrib CollectiveAllReduceStrategy because we can specify
                # num_gpus in its constructor.
                strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
                    num_gpus_per_worker=num_gpus)
                strategy.extended._collective_keys = collective_keys
                strategy.extended._cross_device_ops._collective_keys = collective_keys
                return strategy, devices, ""
            else:
                collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
                    1, num_gpus, collective_keys=collective_keys)
                return collective_all_reduce_ops, devices, ""
        else:
            if num_gpus:
                devices = [
                    "/job:%s/task:%d/replica:0/device:GPU:%d" %
                    (task_type, task_id, i) for i in range(num_gpus)
                ]
            else:
                devices = [
                    "/job:%s/task:%d/replica:0/device:CPU:0" %
                    (task_type, task_id)
                ]

            if use_strategy_object:
                strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
                    num_gpus_per_worker=num_gpus)
                strategy.configure(cluster_spec=self._cluster_spec,
                                   task_type=task_type,
                                   task_id=task_id)
                strategy.extended._collective_keys = collective_keys
                strategy.extended._cross_device_ops._collective_keys = collective_keys
                return (strategy, devices,
                        "grpc://" + self._cluster_spec[task_type][task_id])
            else:
                collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
                    NUM_WORKERS, num_gpus, collective_keys=collective_keys)
                return (collective_all_reduce_ops, devices,
                        "grpc://" + self._cluster_spec[task_type][task_id])
Example #12
  def _get_test_objects(self,
                        task_type,
                        task_id,
                        num_gpus=0,
                        communication=CollectiveCommunication.AUTO,
                        use_strategy_object=False,
                        local_mode=False):
    collective_keys = cross_device_utils.CollectiveKeys(
        group_key_start=10 + CollectiveAllReduceTest.collective_key_base)
    if local_mode:
      if num_gpus:
        devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
      else:
        devices = ["/device:CPU:0"]

      if use_strategy_object:
        strategy = (mwms_lib.CollectiveAllReduceStrategy
                    ._from_local_devices(devices, communication=communication))  # pylint: disable=protected-access
        return strategy, devices, ""
      else:
        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=devices,
            group_size=len(devices),
            collective_keys=collective_keys)
        return collective_all_reduce_ops, devices, ""
    else:
      # NCCL requires physical GPUs for every replica, which we can't do with the
      # simulated multi-host setup for now.
      assert communication != CollectiveCommunication.NCCL
      if num_gpus:
        devices = [
            "/job:%s/task:%d/replica:0/device:GPU:%d" % (task_type, task_id, i)
            for i in range(num_gpus)
        ]
      else:
        devices = [
            "/job:%s/task:%d/replica:0/device:CPU:0" % (task_type, task_id)
        ]

      if use_strategy_object:
        resolver = cluster_resolver.SimpleClusterResolver(
            cluster_spec=multi_worker_util.normalize_cluster_spec(
                self._cluster_spec),
            task_type=task_type,
            task_id=task_id,
            num_accelerators={"GPU": num_gpus})
        strategy = mwms_lib.CollectiveAllReduceStrategy(
            cluster_resolver=resolver, communication=communication)
        return (strategy, devices,
                "grpc://" + self._cluster_spec[task_type][task_id])
      else:
        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=devices,
            group_size=len(devices) * NUM_WORKERS,
            collective_keys=collective_keys)
        return (collective_all_reduce_ops, devices,
                "grpc://" + self._cluster_spec[task_type][task_id])
Example #13
  def _get_test_objects(self,
                        task_type,
                        task_id,
                        num_gpus=0,
                        use_strategy_object=False,
                        local_mode=False,
                        num_packs=1):
    collective_keys = cross_device_utils.CollectiveKeys(
        group_key_start=10 + CollectiveAllReduceTest.collective_key_base,
        op_instance_key_start=100 + CollectiveAllReduceTest.collective_key_base,
        variable_instance_key_start=10000 +
        CollectiveAllReduceTest.collective_key_base)
    if local_mode:
      if num_gpus:
        devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
      else:
        devices = ["/device:CPU:0"]

      if use_strategy_object:
        strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
        strategy.extended._collective_keys = collective_keys
        strategy.extended._cross_device_ops._collective_keys = collective_keys
        return strategy, devices, ""
      else:
        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
            1, num_gpus, collective_keys=collective_keys, num_packs=num_packs)
        return collective_all_reduce_ops, devices, ""
    else:
      if num_gpus:
        devices = [
            "/job:%s/task:%d/replica:0/device:GPU:%d" % (task_type, task_id, i)
            for i in range(num_gpus)
        ]
      else:
        devices = [
            "/job:%s/task:%d/replica:0/device:CPU:0" % (task_type, task_id)
        ]

      if use_strategy_object:
        strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
        strategy.configure(
            cluster_spec=self._cluster_spec,
            task_type=task_type,
            task_id=task_id)
        strategy.extended._collective_keys = collective_keys
        strategy.extended._cross_device_ops._collective_keys = collective_keys
        return (strategy, devices,
                "grpc://" + self._cluster_spec[task_type][task_id])
      else:
        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
            NUM_WORKERS, num_gpus, collective_keys=collective_keys,
            num_packs=num_packs)
        return (collective_all_reduce_ops, devices,
                "grpc://" + self._cluster_spec[task_type][task_id])
Example #14
    def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                                 cluster_spec, task_type, task_id):
        """Initializes the object for multi-worker training."""
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        if task_type not in ["chief", "worker"]:
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        worker_device = "/job:%s/task:%d" % (task_type, task_id)
        if num_gpus_per_worker:
            local_devices = [
                "%s/device:GPU:%d" % (worker_device, i)
                for i in range(num_gpus_per_worker)
            ]
        else:
            local_devices = [worker_device]

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended, self).__init__(
            container_strategy,
            devices=local_devices,
            cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
                num_workers=self._num_workers,
                num_gpus_per_worker=num_gpus_per_worker,
                collective_keys=self._collective_keys))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_spec)
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
Example #15
    def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                                 task_type, task_id):
        """Initializes the object for multi-worker training."""
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        if task_type not in ("chief", "worker"):
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)
        if num_gpus_per_worker:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus_per_worker))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        self._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus_per_worker,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_spec)
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
Example #16
  def _make_collective_ops(self, devices):
    if ops.executing_eagerly_outside_functions():
      try:
        context.context().configure_collective_ops(
            scoped_allocator_enabled_ops=("CollectiveReduce",))
      except RuntimeError:
        logging.warning("Collective ops is not configured at program startup."
                        " Some performance features may not be enabled.")

    self._collective_keys = cross_device_utils.CollectiveKeys(
        group_key_start=1 + self._collective_key_base)  # pylint: disable=protected-access
    return cross_device_ops_lib.CollectiveAllReduce(
        devices=self._devices,
        group_size=len(self._devices),
        collective_keys=self._collective_keys)
Example #17
    def __init__(self,
                 devices,
                 group_size,
                 collective_keys=None,
                 communication=CollectiveCommunication.AUTO):
        """Initializes the object.

    Args:
      devices: a list of device strings to run collectives on.
      group_size: the global group size. For between-graph replicated training
        it's the total number of devices across all workers.
      collective_keys: an optional CollectiveKey object.
      communication: indicates which collective communication to use.
    """
        if group_size % len(devices) > 0:
            raise ValueError(
                "group_size must be divisible by the number of devices.")

        self._devices = tuple(device_util.canonicalize(d) for d in devices)
        self._group_size = group_size
        self._collective_keys = (collective_keys
                                 or cross_device_utils.CollectiveKeys())
        self._communication = communication
        # This lock guards all collective launches, i.e. calls to
        # cross_device_utils.build_collective_*.
        #
        # In a multi-threaded eager program we need to ensure that different groups
        # of collectives don't interleave with each other, otherwise there could be
        # deadlocks. E.g. if two user threads are both launching collectives:
        #   user-thread-0  device0                 device1
        #   user-thread-1          device0 device1
        # In eager mode, we use one executor per device. Executors use single FIFO
        # queues, so the above launch sequences end up with the following queues:
        #   device-0  collective-0  collective-1
        #   device-1  collective-1  collective-0
        # This deadlocks since neither collective is able to finish.
        self._lock = threading.Lock()

        # Collective ops requires all devices to participate and is blocking. In
        # eager, we need one async executor for each device to be able to launch
        # them altogether. Note that async doesn't imply concurrency. Within an
        # async executor operations are still executed sequentially. In graph or
        # function building, the executors are not used.
        self._executors = []
        for _ in range(len(devices)):
            self._executors.append(executor.new_executor(enable_async=True))

        super(CollectiveAllReduce, self).__init__()
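A construction sketch against the signature documented above; this is an internal API, so the module path and keywords follow this example rather than a stable contract. Two local GPUs on each of four workers gives a global group size of eight.

from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib

xdev_ops = cross_device_ops_lib.CollectiveAllReduce(
    devices=["/device:GPU:0", "/device:GPU:1"],
    group_size=8)  # 4 workers x 2 GPUs; group_size % len(devices) == 0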
Example #18
  def _get_test_object(self, task_type, task_id, num_gpus=0):
    strategy, target, session_config = create_test_objects(
        cluster_spec=self._cluster_spec,
        task_type=task_type,
        task_id=task_id,
        num_gpus=num_gpus)

    collective_keys = cross_device_utils.CollectiveKeys(
        group_key_start=10 +
        CollectiveAllReduceStrategyTestBase.collective_key_base,
        op_instance_key_start=100 +
        CollectiveAllReduceStrategyTestBase.collective_key_base,
        variable_instance_key_start=10000 +
        CollectiveAllReduceStrategyTestBase.collective_key_base)
    strategy.extended._collective_keys = collective_keys
    strategy.extended._cross_device_ops._collective_keys = (collective_keys)

    return strategy, target, session_config
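The collective_key_base offsets in these test helpers keep group and instance keys from colliding when several collective groups are created in one process; a minimal sketch of the same idea, assuming only the group_key_start keyword used elsewhere in these examples:

from tensorflow.python.distribute import cross_device_utils

# Two independent key spaces; collectives built from keys_a never reuse a
# group key issued from keys_b (the offsets are illustrative).
keys_a = cross_device_utils.CollectiveKeys(group_key_start=10)
keys_b = cross_device_utils.CollectiveKeys(group_key_start=20)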
Example #19
  def __init__(self,
               num_workers=1,
               num_gpus_per_worker=0,
               num_packs=1,
               collective_keys=None):
    """Initializes the object.

    Args:
      num_workers: number of workers in the between-graph replicated training.
      num_gpus_per_worker: number of GPUs per worker.
      num_packs: gradients will be packed into `num_packs` chunks.
      collective_keys: an optional CollectiveKey object.
    """
    self._num_workers = num_workers
    self._num_gpus_per_worker = num_gpus_per_worker
    self._num_packs = num_packs
    self._collective_keys = (collective_keys or
                             cross_device_utils.CollectiveKeys())
    super(CollectiveAllReduce, self).__init__()
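For contrast with the devices/group_size form in Example #17, a sketch of this older, per-worker signature; again an internal, version-dependent API, with keywords taken from the docstring above:

from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib

xdev_ops = cross_device_ops_lib.CollectiveAllReduce(
    num_workers=3,          # between-graph replicated workers
    num_gpus_per_worker=2,
    num_packs=1)            # gradients packed into a single chunk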
Example #20
    def __init__(self,
                 num_workers=1,
                 num_gpus_per_worker=0,
                 all_reduce_merge_scope=32,
                 collective_keys=None):
        """Initializes the object.

    Args:
      num_workers: number of workers in the between-graph replicated training.
      num_gpus_per_worker: number of GPUs per worker.
      all_reduce_merge_scope: size of groups into which to partition consecutive
        gradients grouped under a common 'allreduce' name scope. This is useful
        for some optimization of collective ops.
      collective_keys: an optional CollectiveKey object.
    """
        self._num_workers = num_workers
        self._num_gpus_per_worker = num_gpus_per_worker
        self._all_reduce_merge_scope = all_reduce_merge_scope
        self._collective_keys = (collective_keys
                                 or cross_device_utils.CollectiveKeys())
        super(CollectiveAllReduce, self).__init__()
Example #21
  def _get_test_object(self,
                       task_type,
                       task_id,
                       num_gpus=0,
                       use_core_strategy=False):
    strategy, target, session_config = create_test_objects(
        cluster_spec=self._cluster_spec,
        task_type=task_type,
        task_id=task_id,
        num_gpus=num_gpus,
        use_core_strategy=use_core_strategy)

    collective_keys = cross_device_utils.CollectiveKeys(
        group_key_start=10 * num_gpus +
        CollectiveAllReduceStrategyTestBase.collective_key_base,
        instance_key_start=num_gpus * 100 +
        CollectiveAllReduceStrategyTestBase.collective_key_base,
        instance_key_with_id_start=num_gpus * 10000 +
        CollectiveAllReduceStrategyTestBase.collective_key_base)
    strategy.extended._collective_keys = collective_keys
    strategy.extended._cross_device_ops._collective_keys = (collective_keys)

    return strategy, target, session_config
Example #22
  def __init__(self,
               devices,
               group_size,
               collective_keys=None,
               communication=CollectiveCommunication.AUTO):
    """Initializes the object.

    Args:
      devices: a list of device strings to run collectives on.
      group_size: the global group size. For between-graph replicated training
        it's the total number of devices across all workers.
      collective_keys: an optional CollectiveKey object.
      communication: indicates which collective communication to use.
    """
    if group_size % len(devices) > 0:
      raise ValueError("group_size must be divisible by the number of devices.")

    self._devices = tuple(device_util.canonicalize(d) for d in devices)
    self._group_size = group_size
    self._collective_keys = (collective_keys or
                             cross_device_utils.CollectiveKeys())
    self._communication = communication
    # In a multi-threaded eager program we need to ensure that different groups of
    # collectives don't interleave with each other, otherwise there will be deadlocks.
    self._lock = threading.Lock()

    # Collective ops requires all devices to participate and is blocking. In
    # eager, we need one async executor for each device to be able to launch
    # them altogether. Note that async doesn't imply concurrency. Within an
    # async executor operations are still executed sequentially. In graph or
    # function building, the executors are not used.
    self._executors = []
    for _ in range(len(devices)):
      self._executors.append(executor.new_executor(enable_async=True))

    super(CollectiveAllReduce, self).__init__()
Example #23
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                use_nccl_communication=(
                    self._communication ==
                    cross_device_ops_lib.CollectiveCommunication.NCCL),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = config_pb2.ConfigProto()
            config_proto = self._update_config_proto(config_proto)
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc")
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
            "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices, self._communication)
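The cluster_resolver consumed here can also be built explicitly instead of coming from TF_CONFIG; a hedged sketch using the public SimpleClusterResolver (addresses are hypothetical, and each worker process would pass its own task_id):

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
    tf.train.ClusterSpec({"worker": ["localhost:12345", "localhost:23456"]}),
    task_type="worker",
    task_id=0,
    num_accelerators={"GPU": 0},
    rpc_layer="grpc")
strategy = tf.distribute.MultiWorkerMirroredStrategy(cluster_resolver=resolver)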
Example #24
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id
        self._id_in_cluster = multi_worker_util.id_in_cluster(
            self._cluster_spec, self._task_type, self._task_id)

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True
            if context.context().coordination_service is None:
                coordinated_jobs = ["chief", "worker"]
                if task_type in coordinated_jobs:
                    context.context().configure_coordination_service(
                        service_type="standalone",
                        service_leader=multi_worker_util.coordination_leader(
                            cluster_spec),
                        coordinated_jobs=coordinated_jobs)

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = copy.deepcopy(context.context().config)
            config_proto = self._update_config_proto(config_proto)

            # If coordination service is enabled, use its internal heartbeat to detect
            # peer failures instead of the Python-level health check.
            if config_proto.experimental.coordination_config.service_type:
                self._enable_check_health = False

            if hasattr(cluster_resolver, "port"):
                port = cluster_resolver.port
            else:
                port = 0
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc",
                port=port)
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        local_devices, local_device_type = self._initialize_local_devices(
            cluster_resolver, self._worker_device)
        if local_device_type == "TPU":
            tpu_strategy_util.initialize_tpu_system()

        self._collective_keys = cross_device_utils.CollectiveKeys(
            group_key_start=1 + self._collective_key_base)
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=local_devices,
            group_size=len(local_devices) * self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        # CrossDeviceOps for per host tensors.
        self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=[self._worker_device],
            group_size=self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        super(CollectiveAllReduceExtended,
              self)._initialize_single_worker(local_devices)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_devices_per_worker and rpc_layer for configure method.
        self._num_devices_per_worker = len(local_devices)
        self._local_device_type = local_device_type
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        if self._enable_check_health and context.executing_eagerly():
            self._start_check_health_thread()
        else:
            logging.info("Check health not enabled.")

        logging.info(
            "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
            "task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices,
            self._communication_options.implementation)
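Once either initialization path has run, the strategy routes reductions through the CollectiveAllReduce instance built above; a minimal end-to-end sketch using the public API (falls back to local mode when TF_CONFIG is unset):

import tensorflow as tf

strategy = tf.distribute.MultiWorkerMirroredStrategy()

@tf.function
def one_step():
    def replica_fn():
        return tf.constant(1.0)
    per_replica = strategy.run(replica_fn)
    # Sums the per-replica values with the configured collective all-reduce.
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None)

print(one_step())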
Example #25
    def _initialize_local(self, cluster_resolver, devices=None):
        """Initializes the object for local training."""
        self._is_chief = True
        self._num_workers = 1

        if ops.executing_eagerly_outside_functions():
            try:
                context.context().configure_collective_ops(
                    scoped_allocator_enabled_ops=("CollectiveReduce", ))
            except RuntimeError:
                logging.warning(
                    "Collective ops is not configured at program startup. "
                    "Some performance features may not be enabled.")
            self._collective_ops_configured = True

        if devices:
            local_devices = devices
            if "GPU" in devices[0]:
                local_device_type = "GPU"
            elif "TPU" in devices[0]:
                local_device_type = "TPU"
            else:
                local_device_type = "CPU"
        else:
            local_devices, local_device_type = self._initialize_local_devices(
                cluster_resolver, worker_device="")

        self._worker_device = device_util.canonicalize("/device:CPU:0")
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        self._collective_keys = cross_device_utils.CollectiveKeys(
            group_key_start=1 + self._collective_key_base)
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=local_devices,
            group_size=len(local_devices),
            options=self._communication_options,
            collective_keys=self._collective_keys)
        # CrossDeviceOps for per host tensors.
        self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=[self._worker_device],
            group_size=self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        super(CollectiveAllReduceExtended,
              self)._initialize_single_worker(local_devices)

        self._cluster_spec = None
        self._task_type = None
        self._task_id = None
        self._id_in_cluster = 0

        # This is a mark to tell whether we are running with a standalone client or
        # an independent worker. Right now with a standalone client, the strategy
        # object is created as a local strategy and then turned into a multi-worker
        # strategy via the configure call.
        self._local_or_standalone_client_mode = True

        # Save the num_devices_per_worker and rpc_layer for configure method.
        self._num_devices_per_worker = len(local_devices)
        self._local_device_type = local_device_type
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info(
            "Single-worker MultiWorkerMirroredStrategy with local_devices "
            "= %r, communication = %s", local_devices,
            self._communication_options.implementation)
Example #26
    def _get_test_objects(self,
                          task_type,
                          task_id,
                          num_gpus=0,
                          communication=CollectiveCommunication.AUTO,
                          use_strategy_object=False,
                          local_mode=False):
        collective_keys = cross_device_utils.CollectiveKeys(
            group_key_start=10 + CollectiveAllReduceTest.collective_key_base,
            op_instance_key_start=100 +
            CollectiveAllReduceTest.collective_key_base,
            variable_instance_key_start=10000 +
            CollectiveAllReduceTest.collective_key_base)
        if local_mode:
            if num_gpus:
                devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
            else:
                devices = ["/device:CPU:0"]

            if use_strategy_object:
                strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
                    communication=communication)
                strategy.extended._collective_keys = collective_keys
                strategy.extended._cross_device_ops._collective_keys = collective_keys
                return strategy, devices, ""
            else:
                collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
                    1,
                    num_gpus,
                    collective_keys=collective_keys,
                    communication=communication)
                return collective_all_reduce_ops, devices, ""
        else:
            # NCCL requires physical GPUs for every replica, which we can't do
            # with the simulated multi-host setup for now.
            assert communication != CollectiveCommunication.NCCL
            if num_gpus:
                devices = [
                    "/job:%s/task:%d/replica:0/device:GPU:%d" %
                    (task_type, task_id, i) for i in range(num_gpus)
                ]
            else:
                devices = [
                    "/job:%s/task:%d/replica:0/device:CPU:0" %
                    (task_type, task_id)
                ]

            if use_strategy_object:
                strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
                    communication=communication)
                strategy.configure(cluster_spec=self._cluster_spec,
                                   task_type=task_type,
                                   task_id=task_id)
                strategy.extended._collective_keys = collective_keys
                strategy.extended._cross_device_ops._collective_keys = collective_keys
                return (strategy, devices,
                        "grpc://" + self._cluster_spec[task_type][task_id])
            else:
                collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
                    NUM_WORKERS,
                    num_gpus,
                    collective_keys=collective_keys,
                    communication=communication)
                return (collective_all_reduce_ops, devices,
                        "grpc://" + self._cluster_spec[task_type][task_id])