Example 1
def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
  """Construct a subgraph for NCCL hybrid all-reduce.

  Args:
    input_tensors: list of T `tf.Tensor` of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator.
    upper_level_f: function for reducing one value per worker, across
      workers.

  Returns:
    list of T `tf.Tensor` of reduced values.

  Raises:
    ValueError: inputs not well-formed.
  """
  input_tensors, shape = _flatten_tensors(input_tensors)
  devices = [t.device for t in input_tensors]
  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
  num_workers = len(per_worker_devices)
  up_values = [None for w in range(0, num_workers)]
  up_devices = up_values[:]
  down_values = up_values[:]
  # First stage: reduce within each worker using NCCL
  for w in range(0, num_workers):
    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
    # NOTE: these reductions will not run to completion unless
    # every output value is used.  Since we only need one, we
    # need to put control dependencies on the rest.
    with ops.control_dependencies(worker_values):
      with ops.device(worker_values[0].device):
        up_values[w] = array_ops.identity(worker_values[0])
      up_devices[w] = per_worker_devices[w][0]
  # Second stage: Apply upper_level_f to reduce across first device at
  # each worker
  level_2_output = upper_level_f(up_values)
  # Third stage: propagate within each worker using NCCL Broadcast
  for w in range(0, num_workers):
    dst_tensors = []
    with ops.device(per_worker_devices[w][0]):
      broadcast_src = nccl_ops.broadcast(array_ops.identity(level_2_output[w]))
    for d in per_worker_devices[w]:
      with ops.device(d):
        dst_tensors.append(array_ops.identity(broadcast_src))
    down_values[w] = dst_tensors
  output_tensors = [v for sublist in down_values for v in sublist]
  if len(shape) != 1:
    output_tensors = _reshape_tensors(output_tensors, shape)
  return output_tensors
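
The `upper_level_f` callback receives one partially reduced tensor per worker (the `up_values` built in the first stage, each pinned to that worker's first device) and must return a list of the same length holding the fully reduced value for each worker. Below is a minimal sketch of that contract, assuming a TF1-style graph; `naive_upper_level_f` is a hypothetical name, and a production setup would plug in a ring, shuffle, or recursive halving-doubling reduction here rather than a single `add_n`.

import tensorflow.compat.v1 as tf

def naive_upper_level_f(up_values):
  """Hypothetical upper_level_f: sum the per-worker partial results on the
  first worker's device, then hand one copy of the total back per worker."""
  with tf.device(up_values[0].device):
    total = tf.add_n(up_values)          # reduce across workers
  outputs = []
  for v in up_values:
    with tf.device(v.device):            # one fully reduced copy per worker
      outputs.append(tf.identity(total))
  return outputs

In the TensorFlow all-reduce module this builder is normally reached through wrappers along the lines of build_nccl_then_ring or build_nccl_then_shuffle, which supply exactly this kind of cross-worker second stage.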
Example 2
def all_sync_params(tower_params, devices):
    """Returns an op that synchronizes all towers' parameters to tower 0's values."""
    if len(devices) == 1:
        return tf.no_op()
    sync_ops = []
    # The NCCL broadcast path is disabled by the leading `False`; the else
    # branch below performs the sync with plain cross-device assigns.
    if False and have_nccl and FLAGS.nccl:
        for param_on_devices in zip(*tower_params):
            param0 = param_on_devices[0]
            received = nccl_ops.broadcast(param0)
            for device, param in zip(devices[1:], param_on_devices[1:]):
                with tf.device(device):
                    sync_op = param.assign(received)
                    sync_ops.append(sync_op)
    else:
        params0 = tower_params[0]
        for device, params in zip(devices, tower_params):
            with tf.device(device):
                for param, param0 in zip(params, params0):
                    sync_op = param.assign(param0.read_value())
                    sync_ops.append(sync_op)
    return tf.group(*sync_ops)
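
A minimal usage sketch, assuming a TF1-style graph with eager execution disabled and two visible GPUs; the device list, variable names, and shapes are illustrative only. Because the leading `False` short-circuits the NCCL branch, neither `have_nccl` nor `FLAGS` needs to be defined for this path.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

devices = ['/gpu:0', '/gpu:1']                       # assumed device list
tower_params = []
for i, dev in enumerate(devices):
    with tf.device(dev):
        # One toy parameter per tower, for illustration only.
        w = tf.get_variable('tower_%d_w' % i, shape=[4],
                            initializer=tf.random_normal_initializer())
        tower_params.append([w])

sync = all_sync_params(tower_params, devices)        # function defined above

config = tf.ConfigProto(allow_soft_placement=True)   # fall back to CPU if needed
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(sync)   # every tower now holds tower 0's parameter values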
Example 3
def _NcclBroadcast(tensors, devices):
  # Broadcast tensors[0] from a randomly chosen sender device; the broadcast
  # output is then consumed once per device via _DeviceTensors.
  sender = np.random.randint(0, len(devices))
  with ops.device(devices[sender]):
    tensor = array_ops.identity(tensors[0])
    broadcast = nccl_ops.broadcast(tensor)
  return _DeviceTensors([broadcast] * len(devices), devices)
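
The pattern relies on two properties of `nccl_ops.broadcast`: the send side is built under the sender's device, and each receiver obtains the value by consuming the broadcast output inside its own device scope. Below is a standalone, hypothetical sketch of the same pattern without the test helpers, assuming a TF1 graph, a CUDA build of TensorFlow with NCCL support, and at least two visible GPUs; the device strings and values are placeholders.

import tensorflow.compat.v1 as tf
from tensorflow.python.ops import nccl_ops

tf.disable_eager_execution()

devices = ['/gpu:0', '/gpu:1']               # assumed device list

with tf.device(devices[0]):                  # sender
  src = tf.constant([1.0, 2.0, 3.0])
  sent = nccl_ops.broadcast(tf.identity(src))

received = []
for d in devices:                            # receivers (sender included)
  with tf.device(d):
    received.append(tf.identity(sent))

with tf.Session() as sess:
  print(sess.run(received))                  # the same values on every GPU

For sum reductions, `nccl_ops.all_sum` follows the same graph-mode pattern: it takes one input tensor per GPU and returns one reduced tensor per GPU, which is what `build_nccl_all_reduce` in Example 1 builds on.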