def push_pull(tensor, scope='', average=True, device_dense='', device_sparse='',
              compression=Compression.none, enable_async=False):
    """Perform a push_pull on a tf.Tensor or tf.IndexedSlices.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
                The shape of the input must be identical across all ranks.
        scope: the graph name scope.
        average: If True, computes the average over all ranks.
                 Otherwise, computes the sum over all ranks.
        device_dense: Device to be used for dense tensors. Uses GPU by default.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node. Defaults to not
                     using compression.
        enable_async: If True, skip the final averaging step, since asynchronous
                      training does not aggregate across ranks.

    Returns:
        A tensor of the same shape and type as `tensor`, summed (or averaged)
        across all processes.
    """
    with tf.device(device_dense):
        byteps_size = tf.cast(size(), dtype=tensor.dtype)
        tensor_compressed, ctx = compression.compress(tensor)
        summed_tensor_compressed = _push_pull(tensor_compressed, scope)
        summed_tensor = compression.decompress(summed_tensor_compressed, ctx)
        if not enable_async:
            new_tensor = (tf.div(summed_tensor, byteps_size)
                          if average else summed_tensor)
        else:  # no need to average for async training
            new_tensor = summed_tensor
    return new_tensor
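A minimal usage sketch for the function above, assuming a TensorFlow 2 eager setup and a BytePS build that supports it, launched with one process per GPU; the model, data, and optimizer here are illustrative placeholders, not part of the original module.

import tensorflow as tf
import byteps.tensorflow as bps  # assumption: BytePS is installed and launched normally

bps.init()

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
opt = tf.keras.optimizers.SGD(0.01)
x = tf.random.normal([32, 4])
y = tf.random.normal([32, 1])

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))
grads = tape.gradient(loss, model.trainable_variables)

# Average each gradient across all ranks with push_pull, then apply locally.
avg_grads = [bps.push_pull(g, average=True) for g in grads]
opt.apply_gradients(zip(avg_grads, model.trainable_variables))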
def reduce_implementation(self, reduce_op, per_replica_value, destinations):
    # Pick the devices to reduce to: prefer the requested destinations,
    # falling back to the devices that hold the per-replica values.
    if tf_cross_device_ops.check_destinations(destinations):
        devices = tf_cross_device_ops.get_devices_from(destinations)
    else:
        devices = tf_cross_device_ops.get_devices_from(per_replica_value)
    reduce_to_device = devices[0]
    logging.log_first_n(
        logging.INFO, "Using byteps push pull to aggregate values", 1)
    # First reduce across local replicas onto a single device, then push_pull
    # across workers when more than one BytePS process is running.
    reduced = _simple_reduce(per_replica_value, reduce_to_device,
                             self.accumulation_fn, reduce_op)
    if size() > 1:
        reduced = _push_pull(reduced)
    return reduced
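The method above appears to follow the tf.distribute CrossDeviceOps hook of the same name: reduce local replica values onto one device, then aggregate across workers. Below is a hedged sketch of that same two-stage flow in isolation, assuming BytePS is installed and initialized; the example values and the device string are illustrative only.

import tensorflow as tf
import byteps.tensorflow as bps

bps.init()

# Illustrative per-replica values; in the method above these come from a
# tf.distribute PerReplica container.
local_values = [tf.constant([1.0, 2.0]), tf.constant([3.0, 4.0])]

# Stage 1: reduce across local replicas onto one device
# (plays the role of _simple_reduce).
with tf.device("/gpu:0"):
    local_sum = tf.add_n(local_values)

# Stage 2: aggregate across workers only when more than one process runs.
if bps.size() > 1:
    local_sum = bps.push_pull(local_sum, average=False)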
def push_pull(tensor, scope='', average=None, device_dense='', device_sparse='',
              compression=Compression.none, op=None, enable_async=False):
    """Perform a push_pull on a tf.Tensor or tf.IndexedSlices.

    Arguments:
        tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
                The shape of the input must be identical across all ranks.
        scope: the graph name scope.
        average:
            .. warning:: .. deprecated::

                Use `op` instead. Will be removed.

            If True, computes the average over all ranks.
            Otherwise, computes the sum over all ranks.
        device_dense: Device to be used for dense tensors. Uses GPU by default.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node. Defaults to not
                     using compression.
        op: The reduction operation to combine tensors across different ranks.
            Defaults to Average if None is given.
        enable_async: If True, skip the final averaging step, since asynchronous
                      training does not aggregate across ranks.

    Returns:
        A tensor of the same shape and type as `tensor`, summed (or averaged)
        across all processes.
    """
    op = handle_average_backwards_compatibility(op, average).value
    # Averaging happens in framework code, so translate that to Sum for the actual call
    true_op = Sum if op == Average else op

    with tf.device(device_dense):
        byteps_size = tf.cast(size(), dtype=tensor.dtype)
        tensor_compressed, ctx = compression.compress(tensor)
        summed_tensor_compressed = _push_pull(tensor_compressed, scope)
        summed_tensor = compression.decompress(summed_tensor_compressed, ctx)
        if not enable_async:
            _div = tf.div if hasattr(tf, 'div') else tf.math.divide
            new_tensor = (_div(summed_tensor, byteps_size)
                          if op == Average else summed_tensor)
        else:  # no need to average for async training
            new_tensor = summed_tensor
    return new_tensor
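A sketch of the updated call style, assuming the `Average` value referenced unqualified in the snippet is also exposed on the `byteps.tensorflow` module; if not, the deprecated `average=True` path still works through `handle_average_backwards_compatibility`. The `_div` shim above exists because `tf.div` was removed in TensorFlow 2, where `tf.math.divide` is the replacement.

import tensorflow as tf
import byteps.tensorflow as bps

bps.init()

t = tf.ones([4]) * float(bps.rank() + 1)

# Preferred: pass the reduction op explicitly (assumption: bps.Average is
# exported alongside push_pull, as the unqualified reference above suggests).
avg = bps.push_pull(t, op=bps.Average)

# Deprecated but still accepted: the boolean flag from the older signature.
avg_legacy = bps.push_pull(t, average=True)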