Example No. 1
  def _update_config_proto(self, config_proto):
    updated_config = copy.deepcopy(config_proto)
    # Enable the scoped allocator optimization for CollectiveOps.  This
    # optimization converts many small all-reduces into fewer larger
    # all-reduces.
    rewrite_options = updated_config.graph_options.rewrite_options
    rewrite_options.scoped_allocator_optimization = (
        rewriter_config_pb2.RewriterConfig.ON)
    # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op =
    # ["CollectiveReduce"].  Since we can't assign to a repeated proto field, we
    # clear and then append.
    del rewrite_options.scoped_allocator_opts.enable_op[:]
    rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")

    if self._communication == cross_device_ops_lib.CollectiveCommunication.NCCL:
      updated_config.experimental.collective_nccl = True

    if not self._cluster_spec:
      return updated_config

    assert self._task_type
    assert self._task_id is not None

    # Collective group leader is needed for collective ops to coordinate
    # workers.
    updated_config.experimental.collective_group_leader = (
        multi_worker_util.collective_leader(self._cluster_spec, self._task_type,
                                            self._task_id))

    # The device filters prevent communication between workers.
    del updated_config.device_filters[:]
    updated_config.device_filters.append(
        "/job:%s/task:%d" % (self._task_type, self._task_id))

    return updated_config
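
The method above sets these fields through the strategy object; for reference, a standalone sketch of the same ConfigProto changes, assuming a two-worker cluster without a chief where the current task is worker 1:

from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2

config = config_pb2.ConfigProto()
rewrite_options = config.graph_options.rewrite_options
# Same scoped allocator settings as _update_config_proto above.
rewrite_options.scoped_allocator_optimization = (
    rewriter_config_pb2.RewriterConfig.ON)
del rewrite_options.scoped_allocator_opts.enable_op[:]
rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")
# Without a chief job, worker 0 acts as the collective group leader.
config.experimental.collective_group_leader = "/job:worker/replica:0/task:0"
# Restrict this task to its own devices.
del config.device_filters[:]
config.device_filters.append("/job:worker/task:1")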
Example No. 2
  def testWorkerAsLeader(self):
    cluster_spec = {
        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
    }
    self.assertEqual(
        multi_worker_util.collective_leader(cluster_spec, "worker", 1),
        "/job:worker/replica:0/task:0")
Example No. 3
  def testLeaderForEvaluator(self):
    cluster_spec = {
        "chief": ["127.0.0.1:1234"],
        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"],
        "evaluator": ["127.0.0.1:2019"]
    }
    self.assertEqual(
        multi_worker_util.collective_leader(cluster_spec, "evaluator", 0), "")
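
A complementary check not shown on this page, but consistent with the cluster above: a worker task in a cluster that contains a "chief" job is expected to see the chief as its group leader. The test name is a hypothetical sketch using the same multi_worker_util helper:

  def testChiefAsLeaderSketch(self):
    cluster_spec = {
        "chief": ["127.0.0.1:1234"],
        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"]
    }
    self.assertEqual(
        multi_worker_util.collective_leader(cluster_spec, "worker", 1),
        "/job:chief/replica:0/task:0")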
Example No. 4
  def _update_config_proto(self, config_proto):
    updated_config = copy.deepcopy(config_proto)
    # Enable the scoped allocator optimization for CollectiveOps.  This
    # optimization converts many small all-reduces into fewer larger
    # all-reduces.
    rewrite_options = updated_config.graph_options.rewrite_options
    rewrite_options.scoped_allocator_optimization = (
        rewriter_config_pb2.RewriterConfig.ON)
    # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op =
    # ["CollectiveReduce"].  Since we can't assign to a repeated proto field, we
    # clear and then append.
    del rewrite_options.scoped_allocator_opts.enable_op[:]
    rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")

    if ((self._communication ==
         cross_device_ops_lib.CollectiveCommunication.NCCL) and
        self._num_gpus_per_worker > 0):
      updated_config.experimental.collective_nccl = True

    if not self._cluster_spec:
      return updated_config

    assert self._task_type
    assert self._task_id is not None

    # Collective group leader is needed for collective ops to coordinate
    # workers.
    updated_config.experimental.collective_group_leader = (
        multi_worker_util.collective_leader(self._cluster_spec, self._task_type,
                                            self._task_id))

    # The device filters prevent communication between workers.
    del updated_config.device_filters[:]
    updated_config.device_filters.append(
        "/job:%s/task:%d" % (self._task_type, self._task_id))

    return updated_config
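
A minimal usage sketch of the method above; `strategy_extended` stands in for an already-constructed instance of the extended strategy class (the name is illustrative, not from the source):

from tensorflow.core.protobuf import config_pb2

base_config = config_pb2.ConfigProto()
new_config = strategy_extended._update_config_proto(base_config)
# The input proto is deep-copied, so base_config itself stays untouched.
assert not base_config.device_filters
# collective_nccl is only set when NCCL is requested and GPUs are present.
print(new_config.experimental.collective_nccl)
print(list(new_config.device_filters))  # e.g. ['/job:worker/task:0']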
Example No. 5
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                use_nccl_communication=(
                    self._communication ==
                    cross_device_ops_lib.CollectiveCommunication.NCCL),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = config_pb2.ConfigProto()
            config_proto = self._update_config_proto(config_proto)
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc")
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        if isinstance(cluster_resolver, TFConfigClusterResolver):
            num_gpus = context.num_gpus()
        else:
            num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

        if num_gpus:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended,
              self)._initialize_local(local_devices)
        self._input_workers = input_lib.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_gpus_per_worker and rpc_layer for configure method.
        self._num_gpus_per_worker = num_gpus
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
            "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices, self._communication)
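
For context, a sketch of the TF_CONFIG environment that a TFConfigClusterResolver would parse before this initializer runs; the addresses and task index are illustrative:

import json
import os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"]
    },
    "task": {"type": "worker", "index": 1}
})
# cluster_resolver = TFConfigClusterResolver()  # resolves the spec above
# strategy_extended._initialize_multi_worker(cluster_resolver)  # illustrative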
Example No. 6
    def _initialize_multi_worker(self, cluster_resolver):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_resolver.cluster_spec())
        task_type = cluster_resolver.task_type
        task_id = cluster_resolver.task_id
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`.")
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id
        self._id_in_cluster = multi_worker_util.id_in_cluster(
            self._cluster_spec, self._task_type, self._task_id)

        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError(
                "No `worker`, `chief` or `evaluator` tasks can be found "
                "in `cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        self._host_input_device = numpy_dataset.SingleDevice(
            self._worker_device)

        if (ops.executing_eagerly_outside_functions() and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            context.context().configure_collective_ops(
                collective_leader=multi_worker_util.collective_leader(
                    cluster_spec, task_type, task_id),
                scoped_allocator_enabled_ops=("CollectiveReduce", ),
                device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
            self._collective_ops_configured = True
            if context.context().coordination_service is None:
                coordinated_jobs = ["chief", "worker"]
                if task_type in coordinated_jobs:
                    context.context().configure_coordination_service(
                        service_type="standalone",
                        service_leader=multi_worker_util.coordination_leader(
                            cluster_spec),
                        coordinated_jobs=coordinated_jobs)

        # Starting a std server in eager mode and in independent worker mode.
        if (context.executing_eagerly()
                and not getattr(self, "_std_server_started", False) and
                not getattr(self, "_local_or_standalone_client_mode", False)):
            # Checking _local_or_standalone_client_mode as well because we should not
            # create the std server in standalone client mode.
            config_proto = copy.deepcopy(context.context().config)
            config_proto = self._update_config_proto(config_proto)

            # If coordination service is enabled, use its internal heartbeat to detect
            # peer failures instead of the Python-level health check.
            if config_proto.experimental.coordination_config.service_type:
                self._enable_check_health = False

            if hasattr(cluster_resolver, "port"):
                port = cluster_resolver.port
            else:
                port = 0
            server_def = tensorflow_server_pb2.ServerDef(
                cluster=cluster_spec.as_cluster_def(),
                default_session_config=config_proto,
                job_name=task_type,
                task_index=task_id,
                protocol=cluster_resolver.rpc_layer or "grpc",
                port=port)
            context.context().enable_collective_ops(server_def)
            self._std_server_started = True
            # The `ensure_initialized` is needed before calling
            # `context.context().devices()`.
            context.context().ensure_initialized()
            logging.info(
                "Enabled multi-worker collective ops with available devices: %r",
                context.context().devices())

        # TODO(yuefengz): The `num_gpus` is only for this particular task. It
        # assumes all workers have the same number of GPUs. We should remove this
        # assumption by querying all tasks for their numbers of GPUs.
        # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
        # some cases.
        local_devices, local_device_type = self._initialize_local_devices(
            cluster_resolver, self._worker_device)
        if local_device_type == "TPU":
            tpu_strategy_util.initialize_tpu_system()

        self._collective_keys = cross_device_utils.CollectiveKeys(
            group_key_start=1 + self._collective_key_base)
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=local_devices,
            group_size=len(local_devices) * self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        # CrossDeviceOps for per host tensors.
        self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=[self._worker_device],
            group_size=self._num_workers,
            options=self._communication_options,
            collective_keys=self._collective_keys)
        super(CollectiveAllReduceExtended,
              self)._initialize_single_worker(local_devices)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        # Save the num_devices_per_worker and rpc_layer for configure method.
        self._num_devices_per_worker = len(local_devices)
        self._local_device_type = local_device_type
        self._rpc_layer = cluster_resolver.rpc_layer
        self._warn_nccl_no_gpu()

        if self._enable_check_health and context.executing_eagerly():
            self._start_check_health_thread()
        else:
            logging.info("Check health not enabled.")

        logging.info(
            "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
            "task_id = %r, num_workers = %r, local_devices = %r, "
            "communication = %s", cluster_spec.as_dict(), task_type, task_id,
            self._num_workers, local_devices,
            self._communication_options.implementation)
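
For comparison, the public entry point that reaches an initializer like the one above in recent TF 2.x releases; a hedged sketch, assuming TF_CONFIG describes the cluster as in the earlier example:

import tensorflow as tf

strategy = tf.distribute.MultiWorkerMirroredStrategy(
    communication_options=tf.distribute.experimental.CommunicationOptions(
        implementation=tf.distribute.experimental.CommunicationImplementation.RING))
# One replica per local device on every worker in the cluster.
print(strategy.num_replicas_in_sync)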
Example No. 7
  def testLocalLeader(self):
    cluster_spec = {}
    self.assertEqual(
        multi_worker_util.collective_leader(cluster_spec, None, 0), "")
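
Taken together, the tests on this page pin down the leader-selection rule; a minimal re-implementation sketch (not the TensorFlow source) of the behaviour they imply:

def collective_leader_sketch(cluster_spec, task_type, task_id):
  """Mirrors the behaviour exercised by the tests above."""
  if not cluster_spec:
    return ""                              # local training: no leader needed
  if task_type == "evaluator":
    return ""                              # evaluators run independently
  if "chief" in cluster_spec:
    return "/job:chief/replica:0/task:0"   # the chief, when present, leads
  return "/job:worker/replica:0/task:0"    # otherwise worker 0 leads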