Example #1
    def reduce_implementation(self, reduce_op, per_replica_value, destinations,
                              experimental_hints):
        all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value],
                                             experimental_hints)[0]
        devices = get_devices_from(destinations)

        if _devices_match(per_replica_value, destinations):
            return all_reduced

        # Convert `all_reduced` to a `Mirrored` object, as a simple and uniform
        # utility to access the component for a particular device.
        if not isinstance(all_reduced, value_lib.Mirrored):
            all_reduced = value_lib.Mirrored([all_reduced])

        # If we got this far, the destination devices do not match the all-reduce
        # devices, so we must map from one to the other.
        index = []
        # We must add these control dependencies, otherwise we can get deadlock.
        with ops.control_dependencies(all_reduced.values):
            for d in devices:
                with ops.device(d):
                    for v in all_reduced.values:
                        if v.device == d:
                            index.append(array_ops.identity(v))
                            break
                    else:
                        # TODO(josh11b): Once we add support for model parallelism, get the
                        # copy from the corresponding replica instead of the primary.
                        index.append(array_ops.identity(all_reduced._primary))  # pylint: disable=protected-access
        return distribute_utils.regroup(index, wrap_class=value_lib.Mirrored)
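The `reduce_implementation` hook above is what a `tf.distribute.CrossDeviceOps` subclass fills in; the device-mapping loop only runs when the all-reduce devices differ from the requested destinations. Below is a minimal sketch of driving this path through the public API. The two logical CPU devices, the device strings, and the example values are assumptions for illustration only, not part of the original example.

import tensorflow as tf

# Split the single physical CPU into two logical devices so the sketch runs
# without GPUs. This must happen before TensorFlow initializes its devices.
cpu = tf.config.list_physical_devices("CPU")[0]
tf.config.set_logical_device_configuration(
    cpu, [tf.config.LogicalDeviceConfiguration(),
          tf.config.LogicalDeviceConfiguration()])

strategy = tf.distribute.MirroredStrategy(
    ["/cpu:0", "/cpu:1"],
    cross_device_ops=tf.distribute.ReductionToOneDevice())

# A per-replica value: 1.0 on replica 0, 2.0 on replica 1.
per_replica = strategy.experimental_distribute_values_from_function(
    lambda ctx: tf.constant(ctx.replica_id_in_sync_group + 1, tf.float32))

# reduce_to eventually dispatches to a reduce implementation like the one
# above and places the result on the requested destination device.
summed = strategy.extended.reduce_to(
    tf.distribute.ReduceOp.SUM, per_replica, destinations="/cpu:0")
print(summed)  # 3.0 on /cpu:0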
Example #2
    def reduce(self, reduce_op, per_replica_value, destinations):
        """Reduce `per_replica_value` to `destinations`.

        It runs the reduction operation defined by `reduce_op` and puts the
        result on `destinations`.

        Args:
          reduce_op: Indicates how per_replica_value will be reduced. Accepted
            values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
          per_replica_value: a PerReplica object or a tensor with device set.
          destinations: the reduction destinations.

        Returns:
          a Mirrored object.

        Raises:
          ValueError: if per_replica_value can't be converted to a PerReplica
            object.
        """
        if not isinstance(per_replica_value, value_lib.PerReplica):
            per_replica_value = _make_tensor_into_per_replica(
                per_replica_value)

        validate_destinations(destinations)

        # Shortcut if `per_replica_value` only contains one value.
        if self._num_between_graph_workers == 1 and len(
                per_replica_value.values) == 1 and _devices_match(
                    per_replica_value, destinations):
            return value_lib.Mirrored(per_replica_value.device_map,
                                      per_replica_value.values)

        return self.reduce_implementation(reduce_op, per_replica_value,
                                          destinations)
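The `reduce` wrapper above (destination validation, the single-value shortcut, then dispatch to `reduce_implementation`) roughly corresponds to what the public `tf.distribute.CrossDeviceOps.reduce` entry point does. A short sketch, reusing the `strategy` and `per_replica` values from the previous snippet (an assumption for brevity):

cross_ops = tf.distribute.ReductionToOneDevice()
# MEAN of 1.0 and 2.0 across the two replicas -> 1.5 on the destination device.
mean_val = cross_ops.reduce(
    tf.distribute.ReduceOp.MEAN, per_replica, destinations="/cpu:0")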
Example #3
def _simple_broadcast(value, destinations):
    index = {}
    devices = get_devices_from(destinations)
    for d in devices:
        index[d] = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
            value, d)
    return value_lib.Mirrored(index)
Example #4
def _reduce_non_distributed_value(extended, reduce_op, value, destinations):
  """Reduce a non-DistributedValue `value` to `destinations`."""
  if isinstance(value, values.DistributedValues):
    raise ValueError("You are passing a `DistributedValue` to "
                     "`_reduce_non_distributed_value`, which is not allowed.")

  # If the same value is present on all replicas then the PerReplica value will
  # be a single value. We also handle the case when `value` is a single value
  # and equal to 0.
  if value == 0:
    return 0
  # If there is only a single value and the reduce op is MEAN,
  # that value should be on all destinations.
  if reduce_op == reduce_util.ReduceOp.MEAN:
    return value

  cross_device_ops_lib.validate_destinations(destinations)
  # We do not support a reduce op of SUM if the value is the same across
  # all replicas. We call this as part of assign functions for MirroredVariables
  # and summing up identical values across replicas is not clearly defined.
  if (len(extended.worker_devices) != 1 or
      not cross_device_ops_lib.check_destinations(destinations)):
    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
                     "the given reduce op %s." % (value, reduce_op))
  # TODO(anjalisridhar): Move these methods to a device utility file?
  devices = cross_device_ops_lib.get_devices_from(destinations)
  if len(devices) == 1:
    with ops.device(devices[0]):
      return array_ops.identity(value)
  else:
    value_updates = {}
    for d in devices:
      with ops.device(d):
        value_updates[d] = array_ops.identity(value)
    return values.Mirrored(value_updates)
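The helper above handles the case where a plain (non-distributed) value reaches a reduction: a MEAN can simply return the value, while a SUM of identical per-replica values is rejected. A hedged illustration with the public API, reusing `strategy` from the earlier snippet; whether the call routes through this exact helper is an internal detail assumed here for illustration.

# A plain tensor is treated as if every replica held the same value.
same_everywhere = tf.constant(7.0)
mean_val = strategy.extended.reduce_to(
    tf.distribute.ReduceOp.MEAN, same_everywhere, destinations="/cpu:0")
# MEAN just returns the value; in the version shown above, SUM of such a
# value with multiple worker devices raises a ValueError instead.
print(mean_val)  # 7.0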
Example #5
def _ungroup_and_make_mirrored(grouped_reduced,
                               destinations,
                               reduce_op,
                               num_between_graph_workers=1):
    """Ungroup results from all-reduce and make Mirrored objects.

    Each all-reduce result will be divided by the number of replicas before
    Mirrored objects are created if reduce_op is "mean".

    Args:
      grouped_reduced: a list of lists, each sublist has components for each
        device, paired with a None. It is the result from
        cross_device_utils.aggregate_gradients_using*.
      destinations: a value to colocate the result with.
      reduce_op: Indicates how values will be aggregated. Accepted values
        are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
      num_between_graph_workers: number of workers in the between-graph
        replication.

    Returns:
      a list of Mirrored objects.
    """
    device_map, logical_device = get_device_map_from(destinations)
    num_replicas = device_map.num_replicas_in_graph * num_between_graph_workers
    index = [[] for _ in range(len(grouped_reduced[0]))]
    for per_replica_reduced in grouped_reduced:
        for i, (v, _) in enumerate(per_replica_reduced):
            if reduce_op == reduce_util.ReduceOp.MEAN:
                index[i].append(v / num_replicas)
            else:
                index[i].append(v)
    return [value_lib.Mirrored(device_map, v, logical_device) for v in index]
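As the docstring notes, a "mean" reduction is implemented as a sum followed by division by the replica count. A small numeric check with the public API, reusing `strategy` and `per_replica` (1.0 and 2.0 on the two replicas) from the earlier snippets:

total = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None)
mean = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica, axis=None)
# total == 3.0, mean == 1.5 == total / strategy.num_replicas_in_sync
print(total.numpy(), mean.numpy())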
Example #6
    def _reduce(self, reduce_op, per_replica_value, destinations):
        if cross_device_utils.contains_indexed_slices(per_replica_value):
            raise ValueError(
                "`IndexedSlices` is not supported for Collective All-Reduce.")
        if context.executing_eagerly():
            raise ValueError(
                "Eager execution is not supported for Collective All-Reduce")

        all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
        device_map, logical_device = get_device_map_from(destinations)
        if (all_reduced.device_map is device_map
                and all_reduced.logical_device == logical_device):
            return all_reduced
        devices = device_map.logical_to_actual_devices(logical_device)
        index = []
        for d in devices:
            if d in all_reduced.devices:
                index.append(all_reduced.get(d))
            else:
                # TODO(josh11b): Once we add support for model parallelism, get the
                # copy from the corresponding replica instead of the primary.
                with ops.control_dependencies(
                        all_reduced.values), ops.device(d):
                    index.append(array_ops.identity(all_reduced.primary))

        return value_lib.Mirrored(device_map, index, logical_device)
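The guard at the top of `_reduce` rejects `IndexedSlices`, which typically appear as gradients of sparse lookups. A standalone sketch showing where such values come from (the variable name and shapes here are illustrative):

import tensorflow as tf

table = tf.Variable(tf.random.normal([10, 4]))
with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.gather(table, [1, 3]))
grad = tape.gradient(loss, table)
print(type(grad))  # tf.IndexedSlices, not a dense tensor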
Example #7
    def reduce_implementation(self, reduce_op, per_replica_value,
                              destinations):
        all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
        device_map, logical_device = get_device_map_from(destinations)
        devices = device_map.logical_to_actual_devices(logical_device)

        if (isinstance(all_reduced, value_lib.Mirrored)
                and all_reduced.device_map is device_map
                and all_reduced.logical_device == logical_device):
            return all_reduced

        # Convert `all_reduced` to a `Mirrored` object, as a simple and uniform
        # utility to access the component for a particular device.
        if not isinstance(all_reduced, value_lib.Mirrored):
            all_reduced = value_lib.Mirrored(
                value_lib.SingleDeviceMap(all_reduced.device), [all_reduced])

        index = []
        with ops.control_dependencies(all_reduced.values):
            for d in devices:
                with ops.device(d):
                    if d in all_reduced.devices:
                        index.append(array_ops.identity(all_reduced.get(d)))
                    else:
                        # TODO(josh11b): Once we add support for model parallelism, get the
                        # copy from the corresponding replica instead of the primary.
                        index.append(array_ops.identity(all_reduced.primary))

        return value_lib.regroup(device_map,
                                 index,
                                 wrap_class=value_lib.Mirrored)
Example #8
def _make_mirrored_val(init_val=5.0):
  v = []
  devices = ["/device:GPU:0", "/device:CPU:0"]
  for d, _ in zip(devices, ["v", "v/replica"]):
    with ops.device(d):
      v.append(constant_op.constant(init_val))
  return values_lib.Mirrored(v)
Example #9
def _fake_mirrored(value, devices):
    """Create a faked Mirrored object for testing.

    All components of the returned Mirrored have the same objects, which is not
    true in reality.
    """
    devices = _get_devices(devices)
    return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices),
                              [value] * len(devices))
Example #10
def _fake_mirrored(value, devices):
  """Create a faked Mirrored object for testing.

  All components of the returned Mirrored have the same objects, which is not
  true in reality.
  """
  devices = cross_device_ops_lib.get_devices_from(devices)
  return value_lib.Mirrored(
      {d: v for d, v in zip(devices, [value] * len(devices))})
Example #11
    def _make_grouped_mirrored(values):
        """Convert per-replica list `values` into Mirrored type with grouping."""
        if len(values) == 1:
            return values_lib.Mirrored(values)

        # Make sure we run all updates. Without this, something like
        # session.run(extended.update(...)) may only update one replica.
        g = control_flow_ops.group(values)

        # If values is just ops, the grouping is enough. Everything in values
        # should have the same type, since we expect every replica to be performing
        # the same computation.
        if not all(tensor_util.is_tf_type(v) for v in values):
            return g

        # Otherwise we need tensors with the same values as `values`, but
        # that have a dependency on `g`.
        with_dep = []
        for v in values:
            with ops.device(v.device), ops.control_dependencies([g]):
                with_dep.append(array_ops.identity(v))

        return values_lib.Mirrored(with_dep)
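The comment above explains why the per-replica updates are tied together with a group: fetching any single result must still run every update. A minimal standalone sketch of the same grouping pattern; the variables and function name here are illustrative, not taken from the original code.

import tensorflow as tf

v1 = tf.Variable(1.0)
v2 = tf.Variable(2.0)

@tf.function
def grouped_update():
    updates = [v1.assign_add(1.0), v2.assign_add(1.0)]
    g = tf.group(updates)
    # Each returned tensor depends on the group, so evaluating any one of
    # them runs both updates.
    with tf.control_dependencies([g]):
        return [tf.identity(u) for u in updates]

print(grouped_update())  # both variables were incremented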
Example #12
def simple_broadcast(value, destinations, always_mirrored=False):
    """Broadcast `value` to `destinations` using simple copies."""
    device_map, logical_device = get_device_map_from(destinations)
    devices = device_map.logical_to_actual_devices(logical_device)
    if len(devices) == 1 and not always_mirrored:
        return cross_device_utils.copy_tensor_or_indexed_slices_to_device(
            value, devices[0])
    else:
        value_updates = []
        for d in devices:
            value_updates.append(
                cross_device_utils.copy_tensor_or_indexed_slices_to_device(
                    value, d))
        return value_lib.Mirrored(device_map, value_updates, logical_device)
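`simple_broadcast` above is the copy-per-device loop behind a broadcast; the public `tf.distribute.CrossDeviceOps.broadcast` is its user-facing counterpart. A short sketch, reusing `cross_ops` and `per_replica` from the earlier snippets (an assumption for brevity):

# Copy a single tensor to every device that `per_replica` lives on.
mirrored = cross_ops.broadcast(tf.constant(42.0), destinations=per_replica)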
Example #13
  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
    all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
    device_map, logical_device = get_device_map_from(destinations)
    if (all_reduced.device_map is device_map and
        all_reduced.logical_device == logical_device):
      return all_reduced
    devices = device_map.logical_to_actual_devices(logical_device)
    index = []
    for d in devices:
      if d in all_reduced.devices:
        index.append(all_reduced.get(d))
      else:
        # TODO(josh11b): Once we add support for model parallelism, get the
        # copy from the corresponding replica instead of the primary.
        with ops.control_dependencies(all_reduced.values), ops.device(d):
          index.append(array_ops.identity(all_reduced.primary))

    return value_lib.Mirrored(device_map, index, logical_device)
Example #14
    def batch_reduce(self, reduce_op, value_destination_pairs):
        """Reduce PerReplica objects in a batch.

        Reduce each first element in `value_destination_pairs` to each second
        element which indicates the destinations.

        Args:
          reduce_op: Indicates how per_replica_value will be reduced. Accepted
            values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
          value_destination_pairs: a list or a tuple of tuples of PerReplica objects
            (or tensors with device set if there is one device) and destinations.

        Returns:
          a list of Mirrored objects.

        Raises:
          ValueError: if `value_destination_pairs` is not a list or a tuple of
            tuples of PerReplica objects and destinations.
        """
        # TODO(yuefengz): if destinations are different, split into several
        # `_batch_reduce` invocations.
        if not _validate_value_destination_pairs(value_destination_pairs):
            # If the first element of each pair is a tensor, we try to turn it into a
            # PerReplica object.
            value_destination_pairs = _normalize_value_destination_pairs(
                value_destination_pairs)

        for _, d in value_destination_pairs:
            validate_destinations(d)

        # Shortcut if all PerReplica objects only contain one value.
        if self._num_between_graph_workers == 1 and _all_devices_match(
                value_destination_pairs) and len(
                    value_destination_pairs[0][0].values) == 1:
            return [
                value_lib.Mirrored(v.device_map, v.values)
                for v, _ in value_destination_pairs
            ]

        return self.batch_reduce_implementation(reduce_op,
                                                value_destination_pairs)
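`batch_reduce` above applies one reduction per `(value, destinations)` pair. Its public counterpart is `tf.distribute.StrategyExtended.batch_reduce_to`; a sketch reusing `strategy` and `per_replica` from the earlier snippets (an assumption for brevity):

pairs = [(per_replica, "/cpu:0"), (per_replica, per_replica)]
# One reduced value comes back per input pair, each on its own destinations.
summed_list = strategy.extended.batch_reduce_to(
    tf.distribute.ReduceOp.SUM, pairs)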
Example #15
  def _reduce(self, reduce_op, per_replica_value, destinations):
    if cross_device_utils.contains_indexed_slices(per_replica_value):
      raise ValueError(
          "`IndexedSlices` is not supported for Collective All-Reduce.")
    if context.executing_eagerly():
      raise ValueError(
          "Eager execution is not supported for Collective All-Reduce")

    all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
    if _devices_match(per_replica_value, destinations):
      return all_reduced
    else:
      index = {}
      for d in get_devices_from(destinations):
        # pylint: disable=protected-access
        if d in all_reduced._index:
          index[d] = all_reduced._index[d]
        else:
          with ops.control_dependencies(list(
              all_reduced._index.values())), ops.device(d):
            index[d] = array_ops.identity(list(all_reduced._index.values())[0])

      return value_lib.Mirrored(index)
Example #16
def _assume_mirrored(grad):
    if isinstance(grad, ds_values.PerReplica):
        return ds_values.Mirrored(grad.values)
    return grad
Example #17
def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
  return value_lib.Mirrored({
      d: _make_indexed_slices(values, indices, dense_shape, d) for d in devices
  })
Example #18
def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
    values = [
        _make_indexed_slices(values, indices, dense_shape, d) for d in devices
    ]
    return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices), values)