def all_reduce_v2(t,
                  group_size,
                  group_key,
                  instance_key,
                  merge_op='Add',
                  final_op='Id',
                  communication_hint='auto'):
    """Reduces tensors collectively, across devices.

  Args:
    t: the tensor to be reduced.
    group_size: an int32 tensor. The total number of tensors to be collectively
      reduced.  Each must reside on a different device.  Should be a positive
      integer.
    group_key: an int32 tensor identifying the group of devices.
    instance_key: an int32 tensor identifying the participating group of Ops.
    merge_op: string naming the binary Op to be applied to compute each partial
      reduction.
    final_op: string naming the unary Op to be applied to each fully reduced
      value.  Can be 'Id' for no operation.
    communication_hint: preferred collective communication.  The implementation
      may fall back to another mechanism.  Options include `auto`, `ring`, and
      `nccl`.

  Returns:
    An Op implementing the distributed reduction.
  """
    return gen_collective_ops.collective_reduce_v2(
        t,
        group_size=group_size,
        group_key=group_key,
        instance_key=instance_key,
        merge_op=merge_op,
        final_op=final_op,
        communication_hint=communication_hint.lower())
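
A minimal usage sketch (not part of the original example; the device setup, keys, and values are assumptions): it splits the single physical CPU into two logical devices and traces both participants inside one tf.function so the runtime can match them by group and instance keys.

import tensorflow as tf

# Assumed setup: two logical CPU devices in one process so that two
# collective participants can run locally.
physical_cpu = tf.config.list_physical_devices('CPU')[0]
tf.config.set_logical_device_configuration(
    physical_cpu,
    [tf.config.LogicalDeviceConfiguration(),
     tf.config.LogicalDeviceConfiguration()])

@tf.function
def reduce_on_two_devices():
    # Both participants must use the same group_size, group_key and
    # instance_key; each runs on its own device.
    with tf.device('/device:CPU:0'):
        r0 = all_reduce_v2(tf.constant([1.0, 2.0]),
                           group_size=2, group_key=1, instance_key=1)
    with tf.device('/device:CPU:1'):
        r1 = all_reduce_v2(tf.constant([3.0, 4.0]),
                           group_size=2, group_key=1, instance_key=1)
    return r0, r1

# Both outputs should be the element-wise sum [4.0, 6.0].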
Example #2
def single_all_reduce(in_value, group_size, group_key, instance_key):
    return gen_collective_ops.collective_reduce_v2(
        in_value,
        group_size,
        group_key,
        instance_key,
        merge_op='Add',
        final_op='Id',
        communication_hint='auto')
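
Because merge_op and final_op are fixed here, a mean reduction needs a small variant. The sketch below is an assumption, not part of the listing: it relies on final_op='Div', which divides the Add-reduced result by group_size.

from tensorflow.python.ops import gen_collective_ops  # TF-internal module used above

def single_all_reduce_mean(in_value, group_size, group_key, instance_key):
    # Same collective as above, but final_op='Div' divides the summed result
    # by group_size, yielding the element-wise mean across participants.
    return gen_collective_ops.collective_reduce_v2(
        in_value,
        group_size,
        group_key,
        instance_key,
        merge_op='Add',
        final_op='Div',
        communication_hint='auto')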
Example #3
def all_reduce_v2(t,
                  group_size,
                  group_key,
                  instance_key,
                  merge_op='Add',
                  final_op='Id',
                  communication_hint='auto',
                  timeout=0,
                  ordering_token=None,
                  max_subdivs_per_device=-1):
    """Reduces tensors collectively, across devices.

  Args:
    t: the tensor to be reduced.
    group_size: an int32 tensor. The total number of tensors to be collectively
      reduced.  Each must reside on a different device.  Should be a positive
      integer.
    group_key: an int32 tensor identifying the group of devices.
    instance_key: an int32 tensor identifying the participating group of Ops.
    merge_op: string naming the binary Op to be applied to compute each partial
      reduction.
    final_op: string naming the unary Op to be applied to each fully reduced
      value.  Can be 'Id' for no operation.
    communication_hint: preferred collective communication.  The implementation
      may fall back to another mechanism.  Options include `auto`, `ring`, and
      `nccl`.
    timeout: a float. If set to a non zero, set a completion timeout to detect
      staleness.  If the timer goes off, a DeadlineExceededError is raised.  The
      timeout value in seconds. This feature is experimental.
    ordering_token: an optional resource tensor to pass to the op as inputs.
      They aren't used by the kernel but allow AutoControlDependency to order
      the collectives with control dependencies.
    max_subdivs_per_device: int specifying the maximum number of subdivisions a
      tensor on a device can be divided into. The runtime uses this contraint to
      parallelize processing of each per-device tensor. Setting to -1 disables
      subdivision and reverts to previous behavior of not sub-dividing tensor.
      Setting to 0 uses sytem defaults.

  Returns:
    An Op implementing the distributed reduction.
  """
    if ordering_token is not None:
        ordering_token = [ordering_token]
    return gen_collective_ops.collective_reduce_v2(
        t,
        group_size=group_size,
        group_key=group_key,
        instance_key=instance_key,
        merge_op=merge_op,
        final_op=final_op,
        communication_hint=communication_hint.lower(),
        timeout_seconds=timeout,
        ordering_token=ordering_token or [],
        max_subdivs_per_device=max_subdivs_per_device)
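
One plausible way to exercise the extra parameters (assumed, not from the original listing; tf.distribute creates its ordering tokens in a similar way internally): a dummy resource variable on the collective's device supplies the ordering token, the timeout is given in seconds, and max_subdivs_per_device=0 leaves subdivision to the runtime.

import tensorflow as tf

# Assumed: a per-device dummy variable whose resource handle serves as the
# ordering token, so auto control dependencies serialize collectives launched
# on that device.
with tf.device('/device:CPU:0'):
    ordering_var = tf.Variable(0.0)

@tf.function
def ordered_reduce(value):
    # Each of the group_size participants would issue a matching call with the
    # same group_key and instance_key on its own device.
    return all_reduce_v2(
        value,
        group_size=2,
        group_key=1,
        instance_key=1,
        timeout=30.0,               # raise DeadlineExceededError after 30 s
        ordering_token=ordering_var.handle,
        max_subdivs_per_device=0)   # 0 defers subdivision to system defaults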
Example #4
def all_reduce_v2(t,
                  group_size,
                  group_key,
                  instance_key,
                  merge_op='Add',
                  final_op='Id',
                  communication_hint='auto',
                  timeout=0):
    """Reduces tensors collectively, across devices.

  Args:
    t: the tensor to be reduced.
    group_size: an int32 tensor. The total number of tensors to be collectively
      reduced.  Each must reside on a different device.  Should be a positive
      integer.
    group_key: an int32 tensor identifying the group of devices.
    instance_key: an int32 tensor identifying the participating group of Ops.
    merge_op: string naming the binary Op to be applied to compute each partial
      reduction.
    final_op: string naming the unary Op to be applied to each fully reduced
      value.  Can be 'Id' for no operation.
    communication_hint: preferred collective communication.  The implementation
      may fall back to another mechanism.  Options include `auto`, `ring`, and
      `nccl`.
    timeout: a float. If set to a non zero, set a completion timeout to detect
      staleness.  If the timer goes off, a DeadlineExceededError is raised.  The
      timeout value in seconds. This feature is experimental.

  Returns:
    An Op implementing the distributed reduction.
  """
    return gen_collective_ops.collective_reduce_v2(
        t,
        group_size=group_size,
        group_key=group_key,
        instance_key=instance_key,
        merge_op=merge_op,
        final_op=final_op,
        communication_hint=communication_hint.lower(),
        timeout_seconds=timeout)
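
A sketch of how the timeout might surface in practice (assumed; the keys and values are placeholders): if one of the group_size participants never joins, the documented DeadlineExceededError can be caught instead of letting the collective hang.

import tensorflow as tf

try:
    with tf.device('/device:CPU:0'):
        result = all_reduce_v2(
            tf.constant([1.0, 2.0]),
            group_size=2,       # a second participant is expected elsewhere
            group_key=1,
            instance_key=2,
            timeout=10.0)       # give up after 10 seconds
except tf.errors.DeadlineExceededError:
    # The group never completed within the deadline.
    result = None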