def reduce_implementation(self, reduce_op, per_replica_value, destinations,
                          experimental_hints):
  all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value],
                                       experimental_hints)[0]
  devices = get_devices_from(destinations)

  if _devices_match(per_replica_value, destinations):
    return all_reduced

  # Convert `all_reduced` to a `Mirrored` object, as a simple and uniform
  # utility to access the component for a particular device.
  if not isinstance(all_reduced, value_lib.Mirrored):
    all_reduced = value_lib.Mirrored([all_reduced])

  # If we got this far, the destination devices do not match the all-reduce
  # devices, so we must map from one to the other.
  index = []
  # We must add these control dependencies, otherwise we can get deadlock.
  with ops.control_dependencies(all_reduced.values):
    for d in devices:
      with ops.device(d):
        for v in all_reduced.values:
          if v.device == d:
            index.append(array_ops.identity(v))
            break
        else:
          # TODO(josh11b): Once we add support for model parallelism, get the
          # copy from the corresponding replica instead of the primary.
          index.append(array_ops.identity(all_reduced._primary))  # pylint: disable=protected-access

  return distribute_utils.regroup(index, wrap_class=value_lib.Mirrored)
def reduce(self, reduce_op, per_replica_value, destinations):
  """Reduce `per_replica_value` to `destinations`.

  It runs the reduction operation defined by `reduce_op` and puts the result
  on `destinations`.

  Args:
    reduce_op: Indicates how per_replica_value will be reduced. Accepted
      values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
    per_replica_value: a PerReplica object or a tensor with device set.
    destinations: the reduction destinations.

  Returns:
    a Mirrored object.

  Raises:
    ValueError: if per_replica_value can't be converted to a PerReplica
      object.
  """
  if not isinstance(per_replica_value, value_lib.PerReplica):
    per_replica_value = _make_tensor_into_per_replica(per_replica_value)

  validate_destinations(destinations)

  # Shortcut if `per_replica_value` only contains one value.
  if self._num_between_graph_workers == 1 and len(
      per_replica_value.values) == 1 and _devices_match(
          per_replica_value, destinations):
    return value_lib.Mirrored(per_replica_value.device_map,
                              per_replica_value.values)

  return self.reduce_implementation(reduce_op, per_replica_value,
                                    destinations)
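A minimal usage sketch of this method, assuming a cross-device-ops instance `cross_ops` that exposes the `reduce` above and a `PerReplica` built with the device_map-era constructor; the tensor and variable names here are placeholders, not part of the API:

  # Hypothetical usage: `grad_gpu0`/`grad_gpu1` are placeholder tensors, one
  # per device, and `device_map` is the replica device map they belong to.
  per_replica = value_lib.PerReplica(device_map, [grad_gpu0, grad_gpu1])
  # Sum the per-device components and mirror the result on the destination.
  mirrored = cross_ops.reduce(reduce_util.ReduceOp.SUM, per_replica,
                              destinations="/device:CPU:0")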
def _simple_broadcast(value, destinations):
  index = {}
  devices = get_devices_from(destinations)
  for d in devices:
    index[d] = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
        value, d)
  return value_lib.Mirrored(index)
def _reduce_non_distributed_value(extended, reduce_op, value, destinations):
  """Reduce a non-DistributedValue `value` to `destinations`."""
  if isinstance(value, values.DistributedValues):
    raise ValueError("You are passing a `DistributedValue` to "
                     "`_reduce_non_distributed_value`, which is not allowed.")

  # If the same value is present on all replicas then the PerReplica value
  # will be a single value. We also handle the case when `value` is a single
  # value and equal to 0.
  if value == 0:
    return 0
  # If there is only a single value and the reduce op is MEAN,
  # that value should be on all destinations.
  if reduce_op == reduce_util.ReduceOp.MEAN:
    return value

  cross_device_ops_lib.validate_destinations(destinations)
  # We do not support a reduce op of SUM if the value is the same across
  # all replicas. We call this as part of assign functions for
  # MirroredVariables, and summing up identical values across replicas is
  # not clearly defined.
  if (len(extended.worker_devices) != 1 or
      not cross_device_ops_lib.check_destinations(destinations)):
    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
                     "the given reduce op %s." % (value, reduce_op))
  # TODO(anjalisridhar): Move these methods to a device utility file?
  devices = cross_device_ops_lib.get_devices_from(destinations)
  if len(devices) == 1:
    with ops.device(devices[0]):
      return array_ops.identity(value)
  else:
    value_updates = {}
    for d in devices:
      with ops.device(d):
        value_updates[d] = array_ops.identity(value)
    return values.Mirrored(value_updates)
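To make the special-casing concrete, an illustrative sketch (not from the source) of how the helper behaves for plain Python scalars, assuming a single-worker `extended` and valid `destinations`:

  # Hypothetical calls; `extended` and `destinations` come from the enclosing
  # strategy and are assumed valid here.
  # A zero value short-circuits before any device placement happens.
  result = _reduce_non_distributed_value(
      extended, reduce_util.ReduceOp.SUM, 0, destinations)  # returns 0
  # Under MEAN, a single non-distributed value is assumed identical on every
  # replica and is returned unchanged.
  result = _reduce_non_distributed_value(
      extended, reduce_util.ReduceOp.MEAN, 3.0, destinations)  # returns 3.0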
def _ungroup_and_make_mirrored(grouped_reduced,
                               destinations,
                               reduce_op,
                               num_between_graph_workers=1):
  """Ungroup results from all-reduce and make Mirrored objects.

  Each all-reduce result will be divided by the number of replicas (across
  all workers) before Mirrored objects are created, if reduce_op is "mean".

  Args:
    grouped_reduced: a list of lists, each sublist has components for each
      device, paired with a None. It is the result from
      cross_device_utils.aggregate_gradients_using*.
    destinations: a value to colocate the result with.
    reduce_op: Indicates how values will be aggregated. Accepted values
      are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
    num_between_graph_workers: number of workers in the between-graph
      replication.

  Returns:
    a list of Mirrored objects.
  """
  device_map, logical_device = get_device_map_from(destinations)
  num_replicas = device_map.num_replicas_in_graph * num_between_graph_workers
  index = [[] for _ in range(len(grouped_reduced[0]))]
  for per_replica_reduced in grouped_reduced:
    for i, (v, _) in enumerate(per_replica_reduced):
      if reduce_op == reduce_util.ReduceOp.MEAN:
        index[i].append(v / num_replicas)
      else:
        index[i].append(v)
  return [value_lib.Mirrored(device_map, v, logical_device) for v in index]
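For orientation, a sketch of the input layout this function expects, assuming two devices and two reduced tensors; the tensor names are placeholders, and the `(value, None)` pairing matches the output format of the aggregate_gradients_using* helpers named in the docstring:

  # Outer list: one sublist per device; inner pairs: (reduced tensor, None).
  # g0_dev0, g1_dev0, ... are placeholder tensors.
  grouped_reduced = [
      [(g0_dev0, None), (g1_dev0, None)],  # components on device 0
      [(g0_dev1, None), (g1_dev1, None)],  # components on device 1
  ]
  # Produces two Mirrored objects: one pairing g0_dev0/g0_dev1, one pairing
  # g1_dev0/g1_dev1, each component divided by the replica count under MEAN.
  mirrored = _ungroup_and_make_mirrored(
      grouped_reduced, destinations, reduce_util.ReduceOp.MEAN)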
def _reduce(self, reduce_op, per_replica_value, destinations):
  if cross_device_utils.contains_indexed_slices(per_replica_value):
    raise ValueError(
        "`IndexedSlices` is not supported for Collective All-Reduce.")
  if context.executing_eagerly():
    raise ValueError(
        "Eager execution is not supported for Collective All-Reduce.")

  all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
  device_map, logical_device = get_device_map_from(destinations)
  if (all_reduced.device_map is device_map and
      all_reduced.logical_device == logical_device):
    return all_reduced
  devices = device_map.logical_to_actual_devices(logical_device)
  index = []
  for d in devices:
    if d in all_reduced.devices:
      index.append(all_reduced.get(d))
    else:
      # TODO(josh11b): Once we add support for model parallelism, get the
      # copy from the corresponding replica instead of the primary.
      with ops.control_dependencies(all_reduced.values), ops.device(d):
        index.append(array_ops.identity(all_reduced.primary))
  return value_lib.Mirrored(device_map, index, logical_device)
def reduce_implementation(self, reduce_op, per_replica_value, destinations):
  all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
  device_map, logical_device = get_device_map_from(destinations)
  devices = device_map.logical_to_actual_devices(logical_device)

  if (isinstance(all_reduced, value_lib.Mirrored) and
      all_reduced.device_map is device_map and
      all_reduced.logical_device == logical_device):
    return all_reduced

  # Convert `all_reduced` to a `Mirrored` object, as a simple and uniform
  # utility to access the component for a particular device.
  if not isinstance(all_reduced, value_lib.Mirrored):
    all_reduced = value_lib.Mirrored(
        value_lib.SingleDeviceMap(all_reduced.device), [all_reduced])

  index = []
  with ops.control_dependencies(all_reduced.values):
    for d in devices:
      with ops.device(d):
        if d in all_reduced.devices:
          index.append(array_ops.identity(all_reduced.get(d)))
        else:
          # TODO(josh11b): Once we add support for model parallelism, get the
          # copy from the corresponding replica instead of the primary.
          index.append(array_ops.identity(all_reduced.primary))

  return value_lib.regroup(device_map, index, wrap_class=value_lib.Mirrored)
def _make_mirrored_val(init_val=5.0):
  v = []
  devices = ["/device:GPU:0", "/device:CPU:0"]
  for d, _ in zip(devices, ["v", "v/replica"]):
    with ops.device(d):
      v.append(constant_op.constant(init_val))
  return values_lib.Mirrored(v)
def _fake_mirrored(value, devices):
  """Create a faked Mirrored object for testing.

  All components of the returned Mirrored are the same object, which is not
  true in reality.
  """
  devices = _get_devices(devices)
  return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices),
                            [value] * len(devices))
def _fake_mirrored(value, devices):
  """Create a faked Mirrored object for testing.

  All components of the returned Mirrored are the same object, which is not
  true in reality.
  """
  devices = cross_device_ops_lib.get_devices_from(devices)
  return value_lib.Mirrored(
      {d: v for d, v in zip(devices, [value] * len(devices))})
def _make_grouped_mirrored(values):
  """Convert per-replica list `values` into Mirrored type with grouping."""
  if len(values) == 1:
    return values_lib.Mirrored(values)

  # Make sure we run all updates. Without this, something like
  # session.run(extended.update(...)) may only update one replica.
  g = control_flow_ops.group(values)

  # If values is just ops, the grouping is enough. Everything in values
  # should have the same type, since we expect every replica to be performing
  # the same computation.
  if not all(tensor_util.is_tf_type(v) for v in values):
    return g

  # Otherwise we need tensors with the same values as `values`, but
  # that have a dependency on `g`.
  with_dep = []
  for v in values:
    with ops.device(v.device), ops.control_dependencies([g]):
      with_dep.append(array_ops.identity(v))

  return values_lib.Mirrored(with_dep)
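A minimal usage sketch, assuming graph mode and two placeholder update tensors on different devices:

  # `update_gpu0` and `update_gpu1` are placeholder per-replica update
  # tensors, one per device.
  mirrored_updates = _make_grouped_mirrored([update_gpu0, update_gpu1])
  # Each component is an identity that depends on the group op, so fetching
  # any single component in session.run still executes all updates.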
def simple_broadcast(value, destinations, always_mirrored=False):
  """Broadcast `value` to `destinations` using simple copies."""
  device_map, logical_device = get_device_map_from(destinations)
  devices = device_map.logical_to_actual_devices(logical_device)
  if len(devices) == 1 and not always_mirrored:
    return cross_device_utils.copy_tensor_or_indexed_slices_to_device(
        value, devices[0])
  else:
    value_updates = []
    for d in devices:
      value_updates.append(
          cross_device_utils.copy_tensor_or_indexed_slices_to_device(
              value, d))
    return value_lib.Mirrored(device_map, value_updates, logical_device)
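An illustrative call, assuming `loss` is a tensor and `destinations` resolves to more than one device; note that with a single destination and `always_mirrored=False` the helper returns a bare copied tensor rather than a Mirrored:

  # Copy `loss` onto every destination device and wrap the copies.
  mirrored_loss = simple_broadcast(loss, destinations, always_mirrored=True)
  components = mirrored_loss.values  # one copy per destination device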
def reduce_implementation(self, reduce_op, per_replica_value, destinations):
  all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
  device_map, logical_device = get_device_map_from(destinations)
  if (all_reduced.device_map is device_map and
      all_reduced.logical_device == logical_device):
    return all_reduced
  devices = device_map.logical_to_actual_devices(logical_device)
  index = []
  for d in devices:
    if d in all_reduced.devices:
      index.append(all_reduced.get(d))
    else:
      # TODO(josh11b): Once we add support for model parallelism, get the
      # copy from the corresponding replica instead of the primary.
      with ops.control_dependencies(all_reduced.values), ops.device(d):
        index.append(array_ops.identity(all_reduced.primary))
  return value_lib.Mirrored(device_map, index, logical_device)
def batch_reduce(self, reduce_op, value_destination_pairs):
  """Reduce PerReplica objects in a batch.

  Reduces the first element of each pair in `value_destination_pairs` to the
  corresponding second element, which indicates the destinations.

  Args:
    reduce_op: Indicates how per_replica_value will be reduced. Accepted
      values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
    value_destination_pairs: a list or a tuple of tuples of PerReplica
      objects (or tensors with device set if there is one device) and
      destinations.

  Returns:
    a list of Mirrored objects.

  Raises:
    ValueError: if `value_destination_pairs` is not a list or a tuple of
      tuples of PerReplica objects and destinations.
  """
  # TODO(yuefengz): if destinations are different, split into several
  # `_batch_reduce` invocations.
  if not _validate_value_destination_pairs(value_destination_pairs):
    # If the first element of each pair is a tensor, we try to turn it into
    # a PerReplica object.
    value_destination_pairs = _normalize_value_destination_pairs(
        value_destination_pairs)

  for _, d in value_destination_pairs:
    validate_destinations(d)

  # Shortcut if all PerReplica objects only contain one value.
  if self._num_between_graph_workers == 1 and _all_devices_match(
      value_destination_pairs) and len(
          value_destination_pairs[0][0].values) == 1:
    return [
        value_lib.Mirrored(v.device_map, v.values)
        for v, _ in value_destination_pairs
    ]

  return self.batch_reduce_implementation(reduce_op, value_destination_pairs)
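A sketch of typical usage, assuming `cross_ops` exposes this method and each gradient should land on the devices of its corresponding variable; all names here are placeholders:

  # Reduce each PerReplica gradient onto the devices of the variable it
  # will update.
  reduced = cross_ops.batch_reduce(
      reduce_util.ReduceOp.MEAN,
      [(grad0_per_replica, var0), (grad1_per_replica, var1)])
  # `reduced` is a list of Mirrored objects aligned with the input pairs.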
def _reduce(self, reduce_op, per_replica_value, destinations):
  if cross_device_utils.contains_indexed_slices(per_replica_value):
    raise ValueError(
        "`IndexedSlices` is not supported for Collective All-Reduce.")
  if context.executing_eagerly():
    raise ValueError(
        "Eager execution is not supported for Collective All-Reduce.")

  all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
  if _devices_match(per_replica_value, destinations):
    return all_reduced
  else:
    index = {}
    for d in get_devices_from(destinations):
      # pylint: disable=protected-access
      if d in all_reduced._index:
        index[d] = all_reduced._index[d]
      else:
        with ops.control_dependencies(list(
            all_reduced._index.values())), ops.device(d):
          index[d] = array_ops.identity(
              list(all_reduced._index.values())[0])
    return value_lib.Mirrored(index)
def _assume_mirrored(grad):
  if isinstance(grad, ds_values.PerReplica):
    return ds_values.Mirrored(grad.values)
  return grad
def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
  return value_lib.Mirrored({
      d: _make_indexed_slices(values, indices, dense_shape, d)
      for d in devices
  })
def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
  values = [
      _make_indexed_slices(values, indices, dense_shape, d) for d in devices
  ]
  return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices), values)