Example #1
    def _create_parameter_server():
        if framework_test_util.is_xla_enabled():
            # To address test failures resulting from XLA with MultiProcessRunner,
            # continue to use an in-process cluster for XLA tests.
            cluster_def = multi_worker_test_base.create_in_process_cluster(
                num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
            resolver = cluster_resolver.SimpleClusterResolver(
                server_lib.ClusterSpec(cluster_def),
                num_accelerators={"GPU": required_gpus},
                rpc_layer="grpc")
            return _create_ps_strategy(resolver, variable_partitioner)
        else:
            tf_config = cluster_resolver.TFConfigClusterResolver()
            cluster_def = tf_config.cluster_spec().as_dict()
            if not cluster_def:
                # When a MultiProcessRunner cluster is used, the cluster is not
                # created yet when the decorator is called. The main process first
                # invokes this method via the decorator, before combinations.py
                # sets up the MultiProcessRunner with its worker and ps tasks.
                # Once setup is done, each subprocess invokes this method again to
                # get the strategy object. Returning None when the main thread
                # calls this method before the cluster exists is fine, since the
                # main thread only proceeds to create the MultiProcessRunner and
                # run the decorated tests inside subprocesses.
                return None
            # MultiProcessRunner is already set up and this method is invoked from
            # a subprocess running the actual test.
            resolver = cluster_resolver.SimpleClusterResolver(
                server_lib.ClusterSpec(cluster_def),
                num_accelerators={"GPU": required_gpus},
                task_type=tf_config.task_type,
                task_id=tf_config.task_id,
                environment=tf_config.environment,
                rpc_layer=tf_config.rpc_layer or "grpc")
            if tf_config.task_type in ("worker", "ps"):
                worker_config = config_pb2.ConfigProto()
                worker_config.inter_op_parallelism_threads = 4  # max num_workers + 1

                try:
                    server = server_lib.Server(cluster_def,
                                               job_name=tf_config.task_type,
                                               task_index=tf_config.task_id,
                                               protocol="grpc",
                                               config=worker_config)
                except errors.UnknownError as e:
                    if "Could not start gRPC server" in e.message:
                        raise unittest.SkipTest("Cannot start std servers.")
                    else:
                        raise

                # Blocking the process that starts a server from exiting.
                server.join()

            return _create_ps_strategy(resolver, variable_partitioner)
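
For context, TFConfigClusterResolver above reads the cluster layout from the TF_CONFIG environment variable. A minimal, self-contained sketch of that variable (addresses, ports, and the task assignment are placeholders, not values from the test):

import json
import os

import tensorflow as tf

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "worker": ["localhost:12345", "localhost:12346"],
        "ps": ["localhost:12347"],
    },
    "task": {"type": "worker", "index": 0},
})

resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
print(resolver.cluster_spec().as_dict())  # {'worker': [...], 'ps': [...]}
print(resolver.task_type, resolver.task_id)  # worker 0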
Example #2
 def _create_multi_worker_mirrored():
     tf_config = cluster_resolver.TFConfigClusterResolver()
     master = tf_config.master()
     if tf_config.rpc_layer:
         # Strip off the rpc_layer prefix.
         master = master[len("%s://" % tf_config.rpc_layer):]
     resolver = cluster_resolver.SimpleClusterResolver(
         cluster_spec=tf_config.cluster_spec(),
         task_type=tf_config.task_type,
         task_id=tf_config.task_id,
         master=master,
         environment=tf_config.environment,
         num_accelerators={"GPU": required_gpus},
         rpc_layer=tf_config.rpc_layer or "grpc",
     )
     # Always create the strategy in eager mode so that it starts the server and
     # configures the eager context. The eager context can no longer be
     # configured after initialization.
     with context.eager_mode():
         strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
             cluster_resolver=resolver)
     # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
     # collectives may hang if any worker launches collectives before the chief
     # creates the strategy.
     try:
         multi_process_runner.barrier().wait()
     except ValueError:
         # If the creator is called in the main process,
         # multi_process_runner.barrier() raises ValueError, which is safe to
         # ignore.
         pass
     return strategy
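
The slicing above simply removes the RPC scheme from the master address; a tiny standalone illustration with placeholder values:

rpc_layer = "grpc"
master = "grpc://localhost:2222"
if rpc_layer:
    # Drop the "grpc://" scheme so only host:port remains.
    master = master[len("%s://" % rpc_layer):]
print(master)  # localhost:2222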
Example #3
    def testKeepLogicalDevice(self):
        gpus = tf_config.list_physical_devices('GPU')
        if len(gpus) > 1:
            self.skipTest(
                'Skip logical device test on multiple GPUs, since partial GPU '
                'virtualization is not permitted.')
        # Cannot change logical device after the context initialization.
        context._reset_context()  # pylint: disable=protected-access
        cluster_spec = multi_worker_test_base.create_cluster_spec(
            has_chief=False, num_workers=1)
        resolver = cluster_resolver_lib.SimpleClusterResolver(
            cluster_spec=multi_worker_util.normalize_cluster_spec(
                cluster_spec),
            task_type='worker',
            task_id=0)

        logical_gpus = len(gpus) * 2
        for i, device in enumerate(gpus):
            n = (i +
                 1) * logical_gpus // len(gpus) - i * logical_gpus // len(gpus)
            assert n > 0  # guaranteed as long as logical_gpus >= len(gpus)
            configs = []
            for ordinal in range(n):
                config = context.LogicalDeviceConfiguration(
                    memory_limit=64, experimental_device_ordinal=ordinal)
                configs.append(config)

            tf_config.set_logical_device_configuration(device, configs)

        collective_all_reduce_strategy.CollectiveAllReduceStrategy(
            cluster_resolver=resolver)
        # Since each physical GPU is split into two logical GPUs, there should be
        # twice as many logical GPUs as physical GPUs.
        self.assertLen(tf_config.list_logical_devices('GPU'), logical_gpus)
        context._reset_context()  # pylint: disable=protected-access
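
The integer arithmetic above spreads logical_gpus virtual devices as evenly as possible over the physical GPUs. A standalone sketch with placeholder counts (not taken from the test):

logical_gpus = 6
num_physical = 4
per_device = [
    (i + 1) * logical_gpus // num_physical - i * logical_gpus // num_physical
    for i in range(num_physical)
]
print(per_device)  # [1, 2, 1, 2]
assert sum(per_device) == logical_gpus
# Every entry is positive as long as logical_gpus >= num_physical.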
Example #4
  def _get_test_objects(self,
                        task_type,
                        task_id,
                        num_gpus=0,
                        communication=CollectiveCommunication.AUTO,
                        use_strategy_object=False,
                        local_mode=False):
    collective_keys = cross_device_utils.CollectiveKeys(
        group_key_start=10 + CollectiveAllReduceTest.collective_key_base)
    if local_mode:
      if num_gpus:
        devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
      else:
        devices = ["/device:CPU:0"]

      if use_strategy_object:
        strategy = (mwms_lib.CollectiveAllReduceStrategy
                    ._from_local_devices(devices, communication=communication))  # pylint: disable=protected-access
        return strategy, devices, ""
      else:
        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=devices,
            group_size=len(devices),
            collective_keys=collective_keys)
        return collective_all_reduce_ops, devices, ""
    else:
      # NCCL requires physical GPUs for every replica, which we can't do with
      # the simulated multi-host setup for now.
      assert communication != CollectiveCommunication.NCCL
      if num_gpus:
        devices = [
            "/job:%s/task:%d/replica:0/device:GPU:%d" % (task_type, task_id, i)
            for i in range(num_gpus)
        ]
      else:
        devices = [
            "/job:%s/task:%d/replica:0/device:CPU:0" % (task_type, task_id)
        ]

      if use_strategy_object:
        resolver = cluster_resolver.SimpleClusterResolver(
            cluster_spec=multi_worker_util.normalize_cluster_spec(
                self._cluster_spec),
            task_type=task_type,
            task_id=task_id,
            num_accelerators={"GPU": num_gpus})
        strategy = mwms_lib.CollectiveAllReduceStrategy(
            cluster_resolver=resolver, communication=communication)
        return (strategy, devices,
                "grpc://" + self._cluster_spec[task_type][task_id])
      else:
        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
            devices=devices,
            group_size=len(devices) * NUM_WORKERS,
            collective_keys=collective_keys)
        return (collective_all_reduce_ops, devices,
                "grpc://" + self._cluster_spec[task_type][task_id])
Example #5
    def _create_parameter_server():

        cluster_def = multi_worker_test_base.create_in_process_cluster(
            num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
        resolver = cluster_resolver.SimpleClusterResolver(
            ClusterSpec(cluster_def),
            num_accelerators={"GPU": required_gpus},
            rpc_layer="grpc")
        strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
            resolver,
            variable_partitioner=sharded_variable.FixedShardsPartitioner(2))
        return strategy
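
A minimal sketch of what the two-shard partitioner above does to a variable shape, using the public alias tf.distribute.experimental.partitioners.FixedShardsPartitioner of the internal sharded_variable.FixedShardsPartitioner (shape and dtype are placeholders):

import tensorflow as tf

partitioner = tf.distribute.experimental.partitioners.FixedShardsPartitioner(
    num_shards=2)
# The partitioner returns the number of shards per axis; only the first axis
# of a [10, 3] variable is split, yielding two [5, 3] shards.
print(partitioner(tf.TensorShape([10, 3]), tf.float32))  # [2, 1]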
Example #6
    def _create_multi_worker_mirrored():
        tf_config = cluster_resolver.TFConfigClusterResolver()
        master = tf_config.master()
        if tf_config.rpc_layer:
            # Strip off the rpc_layer prefix.
            master = master[len("%s://" % tf_config.rpc_layer):]
        resolver = cluster_resolver.SimpleClusterResolver(
            cluster_spec=tf_config.cluster_spec(),
            task_type=tf_config.task_type,
            task_id=tf_config.task_id,
            master=master,
            environment=tf_config.environment,
            num_accelerators={"GPU": required_gpus},
            rpc_layer=tf_config.rpc_layer or "grpc",
        )
        # Disable health check and coordination service. We don't have a reliable
        # way to shut down the strategy (and thus the strategy health check or
        # coordination service heartbeat) at the end of a test. Turning on the
        # strategy health check or coordination service heartbeat causes some
        # flakiness since we re-create part of the server when creating a strategy,
        # and our tests are capable of handling failures.
        CollectiveAllReduceExtended._enable_check_health = False  # pylint: disable=protected-access
        context.context().configure_coordination_service(service_type="")
        # Always create the strategy in eager mode so that it starts the server and
        # configures the eager context. The eager context can no longer be
        # configured after initialization.
        with context.eager_mode():
            strategy = CollectiveAllReduceStrategy(cluster_resolver=resolver)

        if not use_merge_call:
            strategy.extended._use_merge_call = lambda: False  # pylint: disable=protected-access
        # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
        # collectives may hang if any worker launches collectives before the chief
        # creates the strategy.
        try:
            multi_process_runner.get_barrier().wait()
        except ValueError:
            # If the creator is called in the main process,
            # multi_process_runner.get_barrier() raises ValueError, which is safe to
            # ignore.
            pass
        return strategy
Example #7
 def testKeepLogicalDevice(self):
   # Cannot change logical device after the context initialization.
   context._reset_context()  # pylint: disable=protected-access
   cluster_spec = multi_worker_test_base.create_cluster_spec(
       has_chief=False, num_workers=1)
   resolver = cluster_resolver_lib.SimpleClusterResolver(
       cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
       task_type='worker',
       task_id=0)
   gpus = tf_config.list_physical_devices('GPU')
   tf_config.set_logical_device_configuration(gpus[-1], [
       context.LogicalDeviceConfiguration(64),
       context.LogicalDeviceConfiguration(64),
   ])
   collective_all_reduce_strategy.CollectiveAllReduceStrategy(
       cluster_resolver=resolver)
   # Since we create two logical GPUs out of the last GPU, there should be one
   # more logical GPU than physical GPUs.
   self.assertLen(tf_config.list_logical_devices('GPU'), len(gpus) + 1)
   context._reset_context()  # pylint: disable=protected-access
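
The same GPU splitting can be sketched with the public tf.config API (the 64 MB memory limits are placeholders); like the internal call above, it has to run before the TensorFlow runtime context is initialized:

import tensorflow as tf

gpus = tf.config.list_physical_devices("GPU")
if gpus:
    tf.config.set_logical_device_configuration(gpus[-1], [
        tf.config.LogicalDeviceConfiguration(memory_limit=64),
        tf.config.LogicalDeviceConfiguration(memory_limit=64),
    ])
    # The last physical GPU is now exposed as two logical GPUs.
    assert len(tf.config.list_logical_devices("GPU")) == len(gpus) + 1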
Example #8
 def _create_multi_worker_mirrored():
     tf_config = cluster_resolver.TFConfigClusterResolver()
     resolver = cluster_resolver.SimpleClusterResolver(
         cluster_spec=tf_config.cluster_spec(),
         task_type=tf_config.task_type,
         task_id=tf_config.task_id,
         environment=tf_config.environment,
         num_accelerators={"GPU": required_gpus},
         rpc_layer=tf_config.rpc_layer,
     )
     strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
         cluster_resolver=resolver)
     # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
     # collectives may hang if any worker launches collectives before the chief
     # creates the strategy.
     try:
         multi_process_runner.barrier().wait()
     except ValueError:
         # If the creator is called in the main process,
         # multi_process_runner.barrier() raises ValueError, which is safe to
         # ignore.
         pass
     return strategy
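
The barrier handling in these creator functions can be mimicked with the standard library; a rough analogue of the wait-then-proceed pattern (worker count and function names are illustrative, not part of the TensorFlow API):

import multiprocessing

def _worker(barrier, idx):
    # ... create the strategy / start the server here ...
    barrier.wait()  # block until every participant reaches this point
    print("worker %d past the barrier" % idx)

if __name__ == "__main__":
    barrier = multiprocessing.Barrier(2)
    procs = [multiprocessing.Process(target=_worker, args=(barrier, i))
             for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()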