Example #1
  def make_collective(self, num_processes, gpu_per_process, communication):
    """Returns collectives and other info to be used in tests.

    Args:
      num_processes: an integer indicating the number of processes that
        participate in the collective.
      gpu_per_process: number of GPUs (0 if no GPUs) used by each process.
      communication: one of `CollectiveCommunication`.

    Returns:
      A tuple of (collective, devices, task_id) where collective is an
      instance of `CollectiveAllReduce`, devices is a list of local devices
      (str) attached to the current process, and task_id is the id of the
      current process within the cluster.
    """

    cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
    devices = [
        "/job:worker/replica:0/task:%d/device:CPU:0" % cluster_resolver.task_id
    ]
    if gpu_per_process > 0:
      devices = [
          "/job:worker/replica:0/task:%d/device:GPU:%d" %
          (cluster_resolver.task_id, i) for i in range(gpu_per_process)
      ]
    group_size = num_processes * len(devices)
    collective = cross_device_ops_lib.CollectiveAllReduce(
        devices=devices, group_size=group_size, communication=communication)
    return collective, devices, cluster_resolver.task_id
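All of these snippets rely on `TFConfigClusterResolver` reading the `TF_CONFIG` environment variable that the test harness sets for each process. As a minimal sketch, assuming a hypothetical two-worker cluster with placeholder addresses (none of these values come from the examples above), `TF_CONFIG` looks roughly like this:

import json
import os

import tensorflow as tf

# Hypothetical two-worker cluster; the addresses are placeholders.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["localhost:12345", "localhost:12346"]},
    "task": {"type": "worker", "index": 0},  # this process is worker 0
})

resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
print(resolver.task_type, resolver.task_id)  # worker 0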
Example #2
    def worker_fn():
      cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
      enable_collective_ops(cluster_resolver)

      collective_ops.all_reduce(
          constant_op.constant(1.),
          group_size=2,
          group_key=100,
          instance_key=100,
          merge_op="Add",
          final_op="Id",
          communication_hint="ring")

      if cluster_resolver.task_type == "worker":
        # MultiProcessRunner will auto restart worker-0.
        os._exit(1)  # pylint: disable=protected-access
      else:
        # The chief should eventually get a FailedPreconditionError after
        # worker-0 has restarted.
        while True:
          time.sleep(1)
          try:
            context.context().check_collective_ops_peer_health(
                "/job:worker/replica:0/task:0",)
          except errors.UnavailableError:
            pass
          except errors.FailedPreconditionError:
            break
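In the TensorFlow test suite, a `worker_fn` like the one above is not called directly; it is launched in subprocesses by `MultiProcessRunner`, which also writes the matching `TF_CONFIG` for each task. A rough sketch of such a driver, assuming the internal `multi_process_runner` and `multi_worker_test_base` helpers and that `auto_restart=True` is what brings worker-0 back after `os._exit(1)`:

from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base

# One chief plus one worker, matching the task_type check in worker_fn.
cluster_spec = multi_worker_test_base.create_cluster_spec(
    has_chief=True, num_workers=1)

runner = multi_process_runner.MultiProcessRunner(
    worker_fn,
    cluster_spec,
    auto_restart=True,  # restart worker-0 after it calls os._exit(1)
)
runner.start()
runner.join()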
Example #3
 def _create_multi_worker_mirrored():
     tf_config = cluster_resolver.TFConfigClusterResolver()
     master = tf_config.master()
     if tf_config.rpc_layer:
         # Strip off the rpc_layer prefix (e.g. "grpc://").
         master = master[len("%s://" % tf_config.rpc_layer):]
     resolver = cluster_resolver.SimpleClusterResolver(
         cluster_spec=tf_config.cluster_spec(),
         task_type=tf_config.task_type,
         task_id=tf_config.task_id,
         master=master,
         environment=tf_config.environment,
         num_accelerators={"GPU": required_gpus},
         rpc_layer=tf_config.rpc_layer or "grpc",
     )
     # Always create the strategy in eager mode so that it starts the server and
     # configures the eager context. The eager context can no longer be
     # configured after initialization.
     with context.eager_mode():
         strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
             cluster_resolver=resolver)
     # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
     # collectives may hang if any worker launches collectives before the chief
     # creates the strategy.
     try:
         multi_process_runner.barrier().wait()
     except ValueError:
         # If the creator is called in the main process,
         # multi_process_runner.barrier() raises ValueError, which is safe to
         # ignore.
         pass
     return strategy
Example #4
  def testAbortCommunication(self, device, communication):
    if communication == "NCCL":
      self.skipTest("b/171358086: cannot test multi worker NCCL")
    dev0 = "/device:%s:0" % device
    cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
    enable_collective_ops_with_barrier(cluster_resolver)
    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant([1.])

    # First perform a normal all-reduce to complete the group and instance
    # resolution.
    with ops.device(dev0):
      collective_ops.all_reduce(
          in_tensor,
          group_size,
          group_key,
          instance_key,
          communication_hint=communication)

    if cluster_resolver.task_id == 1:

      def abort_fn():
        time.sleep(2)
        context.context().abort_collective_ops(errors.UNAVAILABLE, "peer down")

      t = threading.Thread(target=abort_fn)
      t.start()

      with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
        with ops.device(dev0):
          collective_ops.all_reduce(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              communication_hint=communication)

      # After the abort, subsequent collectives should fail immediately.
      with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
        with ops.device(dev0):
          collective_ops.all_reduce(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              communication_hint=communication)

      t.join()

    # Enable collective ops again in order to reset the collective executor.
    enable_collective_ops_with_barrier(cluster_resolver)
    with ops.device(dev0):
      collective_ops.all_reduce(
          in_tensor,
          group_size,
          group_key,
          instance_key,
          communication_hint=communication)
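The `enable_collective_ops_with_barrier` helper is not shown in this collection. A plausible sketch, assuming it simply wraps the `enable_collective_ops(cluster_resolver)` variant called in Example #7 between the multi-process barrier used elsewhere here, so that no worker re-initializes collectives while a peer is still issuing ops against the old executor:

def enable_collective_ops_with_barrier(cluster_resolver):
  # Wait until every process is ready to (re)initialize collectives, enable
  # them, then wait again so no process races ahead with the new executor.
  multi_process_runner.get_barrier().wait()
  enable_collective_ops(cluster_resolver)
  multi_process_runner.get_barrier().wait()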
Example #5
    def testAbortInstanceParamsResolution(self, device, communication):
        if communication == "NCCL":
            self.skipTest("b/171358086: cannot test multi worker NCCL")
        dev0 = "/device:%s:0" % device
        cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
        enable_collective_ops_with_barrier(cluster_resolver)
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])

        # First perform a normal all-reduce to complete the group resolution.
        with ops.device(dev0):
            collective_ops.all_reduce(in_tensor, group_size, group_key,
                                      instance_key)

        # We use broadcast to test aborting instance resolution since only broadcast
        # waits for the group.

        if cluster_resolver.task_id == 1:

            def abort_fn():
                time.sleep(2)
                context.context().abort_collective_ops(errors.UNAVAILABLE,
                                                       "peer down")

            t = threading.Thread(target=abort_fn)
            t.start()

            # Use a different instance key to trigger another instance resolution.
            instance_key = 101
            with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
                # This hangs on params resolution since we're only launching one
                # collective for a group size of 2.
                with ops.device(dev0):
                    collective_ops.broadcast_send(in_tensor, (1, ),
                                                  dtypes.float32, group_size,
                                                  group_key, instance_key)

            # After the abort, subsequent collectives should fail immediately.
            with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
                with ops.device(dev0):
                    collective_ops.broadcast_send(in_tensor, (1, ),
                                                  dtypes.float32, group_size,
                                                  group_key, instance_key)

            t.join()

        # Enable collective ops again in order to reset the collective executor.
        enable_collective_ops_with_barrier(cluster_resolver)
        # Reassign instance_key so that it's the same on each worker.
        instance_key = 100
        with ops.device(dev0):
            if cluster_resolver.task_id == 0:
                collective_ops.broadcast_send(in_tensor, (1, ), dtypes.float32,
                                              group_size, group_key,
                                              instance_key)
            else:
                collective_ops.broadcast_recv(
                    (1, ), dtypes.float32, group_size, group_key, instance_key)
Example #6
    def _create_parameter_server():
        if framework_test_util.is_xla_enabled():
            # To avoid test failures when running XLA with MultiProcessRunner,
            # keep using an in-process cluster for XLA tests.
            cluster_def = multi_worker_test_base.create_in_process_cluster(
                num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
            resolver = cluster_resolver.SimpleClusterResolver(
                server_lib.ClusterSpec(cluster_def),
                num_accelerators={"GPU": required_gpus},
                rpc_layer="grpc")
            return _create_ps_strategy(resolver, variable_partitioner)
        else:
            tf_config = cluster_resolver.TFConfigClusterResolver()
            cluster_def = tf_config.cluster_spec().as_dict()
            if not cluster_def:
                # When a MultiProcessRunner cluster is used, the cluster is not
                # created yet at the time the decorator is called. When the
                # test runs, this method is first invoked via the decorator,
                # before combinations.py sets up the MultiProcessRunner with
                # the worker and ps jobs. After setup is done, the subprocess
                # invokes this method again to get the strategy object.
                # Returning None when the main thread invokes this method
                # before the cluster is set up is fine, since that thread only
                # goes on to create the MultiProcessRunner and run the
                # decorated tests inside subprocesses.
                return None
            # MultiProcessRunner is already set up, and this method is invoked
            # from a subprocess running the actual test.
            resolver = cluster_resolver.SimpleClusterResolver(
                server_lib.ClusterSpec(cluster_def),
                num_accelerators={"GPU": required_gpus},
                task_type=tf_config.task_type,
                task_id=tf_config.task_id,
                environment=tf_config.environment,
                rpc_layer=tf_config.rpc_layer or "grpc")
            if tf_config.task_type in ("worker", "ps"):
                worker_config = config_pb2.ConfigProto()
                worker_config.inter_op_parallelism_threads = 4  # max num_workers + 1

                try:
                    server = server_lib.Server(cluster_def,
                                               job_name=tf_config.task_type,
                                               task_index=tf_config.task_id,
                                               protocol="grpc",
                                               config=worker_config)
                except errors.UnknownError as e:
                    if "Could not start gRPC server" in e.message:
                        raise unittest.SkipTest("Cannot start std servers.")
                    else:
                        raise

                # Block the process that starts a server from exiting.
                server.join()

            return _create_ps_strategy(resolver, variable_partitioner)
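`_create_ps_strategy` is referenced above but not shown. A minimal sketch of what it presumably does, built on `ParameterServerStrategyV2` (exposed publicly as `tf.distribute.experimental.ParameterServerStrategy`); the helper in the actual test suite may do more:

from tensorflow.python.distribute import parameter_server_strategy_v2

def _create_ps_strategy(resolver, variable_partitioner):
  # Build the strategy from the already-configured cluster resolver.
  return parameter_server_strategy_v2.ParameterServerStrategyV2(
      resolver, variable_partitioner=variable_partitioner)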
Example #7
 def worker_fn():
   enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
   # There may be some delay before the server starts up. The health check
   # should eventually succeed.
   while True:
     try:
       for task in [
           "/job:worker/replica:0/task:0",
           "/job:worker/replica:0/task:1",
       ]:
         context.context().check_collective_ops_peer_health(task)
     except errors.UnavailableError:
       continue
     break
   multi_process_runner.get_barrier().wait()
Example #8
def enable_collective_ops():
  """Enable collectives in the current process."""
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  context.context().configure_collective_ops(
      collective_leader="'/job:worker/replica:0/task:0'")
  config_proto = config_pb2.ConfigProto()
  config_proto.experimental.collective_group_leader = (
      "/job:worker/replica:0/task:0")
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_resolver.cluster_spec().as_cluster_def(),
      default_session_config=config_proto,
      job_name=cluster_resolver.task_type,
      task_index=cluster_resolver.task_id,
      protocol=cluster_resolver.rpc_layer)
  context.context().enable_collective_ops(server_def)
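A short usage sketch for the helper above, mirroring Example #2: each process calls it once after `TF_CONFIG` is in place, and eager collectives can then be issued with matching keys on every participant (the key values below are arbitrary placeholders):

def worker_fn():
  enable_collective_ops()
  # Every participant must launch the same collective with the same keys.
  reduced = collective_ops.all_reduce(
      constant_op.constant(1.),
      group_size=2,  # two processes participate
      group_key=1,
      instance_key=1,
      merge_op="Add",
      final_op="Id",
      communication_hint="ring")
  # With two workers each contributing 1.0, `reduced` is 2.0.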
Example #9
    def enable_collectives(self, num_processes, gpu_per_process,
                           communication):
        """Enable collectives in the current process.

    Args:
      num_processes: an integer indicating the number of processes that
        participate in the collective.
      gpu_per_process: number of GPUs (0 if no GPUs) used by each process.
      communication: one of `CollectiveCommunication`.

    Returns:
     A tuple of (collective, devices, group_size) where collective is a instance
     of `CollectiveAllReduce`, devices are a list of local devices (str)
     attached to the current process, and group_size is the group_size of
     collective.
    """
        cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
        context.context().configure_collective_ops(
            collective_leader="'/job:worker/replica:0/task:0'")
        config_proto = config_pb2.ConfigProto()
        config_proto.experimental.collective_group_leader = (
            "/job:worker/replica:0/task:0")
        server_def = tensorflow_server_pb2.ServerDef(
            cluster=cluster_resolver.cluster_spec().as_cluster_def(),
            default_session_config=config_proto,
            job_name=cluster_resolver.task_type,
            task_index=cluster_resolver.task_id,
            protocol=cluster_resolver.rpc_layer)
        context.context().enable_collective_ops(server_def)

        devices = [
            "/job:worker/replica:0/task:%d/device:CPU:0" %
            cluster_resolver.task_id
        ]
        if gpu_per_process > 0:
            devices = [
                "/job:worker/replica:0/task:%d/device:GPU:%d" %
                (cluster_resolver.task_id, i) for i in range(gpu_per_process)
            ]
        group_size = num_processes * len(devices)
        collective = cross_device_ops_lib.CollectiveAllReduce(
            devices=devices,
            group_size=group_size,
            communication=communication)
        return collective, devices, cluster_resolver.task_id
Example #10
def enable_collective_ops():
    """Enable collectives in the current process."""
    cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
    context.context().configure_collective_ops(
        collective_leader="'/job:worker/replica:0/task:0'")
    config_proto = config_pb2.ConfigProto()
    config_proto.experimental.collective_group_leader = (
        "/job:worker/replica:0/task:0")
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_resolver.cluster_spec().as_cluster_def(),
        default_session_config=config_proto,
        job_name=cluster_resolver.task_type,
        task_index=cluster_resolver.task_id,
        protocol=cluster_resolver.rpc_layer)
    context.context().enable_collective_ops(server_def)
    # Restore the default flag values.
    CollectiveReplicaLauncher._prefer_unique_instance_key = True
    CollectiveReplicaLauncher._prefer_ordering_token = False
Example #11
    def _create_multi_worker_mirrored():
        tf_config = cluster_resolver.TFConfigClusterResolver()
        master = tf_config.master()
        if tf_config.rpc_layer:
            # Strip off the rpc_layer prefix (e.g. "grpc://").
            master = master[len("%s://" % tf_config.rpc_layer):]
        resolver = cluster_resolver.SimpleClusterResolver(
            cluster_spec=tf_config.cluster_spec(),
            task_type=tf_config.task_type,
            task_id=tf_config.task_id,
            master=master,
            environment=tf_config.environment,
            num_accelerators={"GPU": required_gpus},
            rpc_layer=tf_config.rpc_layer or "grpc",
        )
        # Disable health check and coordination service. We don't have a reliable
        # way to shut down the strategy (and thus the strategy health check or
        # coordination service heartbeat) at the end of a test. Turning on the
        # strategy health check or coordination service heartbeat causes some
        # flakiness since we re-create part of the server when creating a strategy,
        # and our tests are capable of handling failures.
        CollectiveAllReduceExtended._enable_check_health = False  # pylint: disable=protected-access
        context.context().configure_coordination_service(service_type="")
        # Always create the strategy in eager mode so that it starts the server and
        # configures the eager context. The eager context can no longer be
        # configured after initialization.
        with context.eager_mode():
            strategy = CollectiveAllReduceStrategy(cluster_resolver=resolver)

        if not use_merge_call:
            strategy.extended._use_merge_call = lambda: False  # pylint: disable=protected-access
        # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
        # collectives may hang if any worker launches collectives before the chief
        # creates the strategy.
        try:
            multi_process_runner.get_barrier().wait()
        except ValueError:
            # If the creator is called in the main process,
            # multi_process_runner.get_barrier() raises ValueError, which is safe to
            # ignore.
            pass
        return strategy
Example #12
def enable_collective_ops():
    """Enable collectives in the current process."""
    cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
    context.context().configure_collective_ops(
        collective_leader="'/job:worker/replica:0/task:0'")
    config_proto = config_pb2.ConfigProto()
    config_proto.experimental.collective_group_leader = (
        "/job:worker/replica:0/task:0")
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_resolver.cluster_spec().as_cluster_def(),
        default_session_config=config_proto,
        job_name=cluster_resolver.task_type,
        task_index=cluster_resolver.task_id,
        protocol=cluster_resolver.rpc_layer)
    context.context().enable_collective_ops(server_def)
    # Restore the default flag values.
    cross_device_ops_lib.CollectiveAllReduce._limited_nccl = True
    cross_device_utils.CollectiveReplicaLauncher._use_scoped_allocator = False
    cross_device_utils.CollectiveReplicaLauncher._use_collective_v2 = True
    cross_device_utils.CollectiveReplicaLauncher._use_ordering_token = False
Example #13
 def _create_multi_worker_mirrored():
     tf_config = cluster_resolver.TFConfigClusterResolver()
     resolver = cluster_resolver.SimpleClusterResolver(
         cluster_spec=tf_config.cluster_spec(),
         task_type=tf_config.task_type,
         task_id=tf_config.task_id,
         environment=tf_config.environment,
         num_accelerators={"GPU": required_gpus},
         rpc_layer=tf_config.rpc_layer,
     )
     strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
         cluster_resolver=resolver)
     # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
     # collectives may hang if any worker launches collectives before the chief
     # creates the strategy.
     try:
         multi_process_runner.barrier().wait()
     except ValueError:
         # If the creator is called in the main process,
         # multi_process_runner.barrier() raises ValueError, which is safe to
         # ignore.
         pass
     return strategy
Example #14
 def worker_fn():
   enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
   context.context().check_collective_ops_peer_health(
       "/job:worker/replica:0/task:1",)
Example #15
 def worker_fn():
   enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
   context.context().check_collective_ops_peer_health("localhost:12345",)