Example #1
def all_sync_params(tower_params, devices, usenccl=True):
    """Assigns the params from the first tower to all others"""
    if len(devices) == 1:
        return tf.no_op()
    sync_ops = []
    if have_nccl and usenccl:
        for param_on_devices in zip(*tower_params):
            # print('PARAM_ON_DEVICES: {}'.format(param_on_devices))  # DEBUG
            # Note: param_on_devices is [paramX_gpu0, paramX_gpu1, ...]
            param0 = param_on_devices[0]
            send_op, received_tensors = nccl.broadcast(param0, devices[1:])
            sync_ops.append(send_op)
            for device, param, received in zip(devices[1:],
                                               param_on_devices[1:],
                                               received_tensors):
                with tf.device(device):
                    sync_op = param.assign(received)
                    sync_ops.append(sync_op)
    else:
        params0 = tower_params[0]
        for device, params in zip(devices, tower_params):
            with tf.device(device):
                for param, param0 in zip(params, params0):
                    sync_op = param.assign(param0.read_value())
                    sync_ops.append(sync_op)

    return tf.group(*sync_ops)
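
A minimal usage sketch (not from the original project) showing how all_sync_params might be wired up at startup, assuming TF 1.x, two visible GPUs, and that the module above (with its have_nccl flag and nccl import) is available; NUM_GPUS and build_toy_tower are illustrative placeholders.

import tensorflow as tf

NUM_GPUS = 2  # assumption: two visible GPUs

def build_toy_tower():
    # Stand-in model so the sketch is self-contained; real code would build
    # the actual per-GPU replica here.
    x = tf.placeholder(tf.float32, [None, 4])
    return tf.layers.dense(x, 8)

devices = ['/gpu:%d' % i for i in range(NUM_GPUS)]
tower_params = []
for i, dev in enumerate(devices):
    with tf.device(dev), tf.variable_scope('tower_%d' % i):
        build_toy_tower()
        tower_params.append(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                              scope='tower_%d' % i))

sync = all_sync_params(tower_params, devices, usenccl=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(sync)  # every tower now starts from tower 0's weights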
Example #3
    def testBroadcast(self):
        if not test.is_gpu_available():
            return  # Test requires access to a GPU

        for dtype in [np.float32, np.int32, np.int64, np.float64]:
            # Create session inside outer loop to test use of
            # same communicator across multiple sessions.
            with self.test_session(use_gpu=True) as sess:
                for devices in [[
                        '/device:GPU:0', '/device:GPU:0', '/device:GPU:0'
                ], ['/device:GPU:0', '/device:GPU:0']]:
                    shape = (3, 4)
                    sender = np.random.randint(0, len(devices) - 1)
                    with ops.device(devices[sender]):
                        np_ans = (((np.random.random_sample(shape) - .5) *
                                   1024).astype(dtype))
                        t = array_ops.identity(np_ans)
                    other_devices = devices[:sender] + devices[sender + 1:]
                    send_op, received_tensors = nccl.broadcast(
                        t, other_devices)

                    # Verify shape inference.
                    for r in received_tensors:
                        self.assertEqual(shape, r.get_shape())

                    # Run and verify results.
                    nccl_results = sess.run(received_tensors + [send_op])
                    for r in nccl_results[:-1]:
                        self.assertAllClose(r, np_ans)
Example #4
  def testBroadcast(self):
    if not test.is_gpu_available():
      return  # Test requires access to a GPU

    for dtype in [np.float32, np.int32, np.int64, np.float64]:
      # Create session inside outer loop to test use of
      # same communicator across multiple sessions.
      with self.test_session(use_gpu=True) as sess:
        for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
          shape = (3, 4)
          sender = np.random.randint(0, len(devices) - 1)
          with ops.device(devices[sender]):
            np_ans = ((
                (np.random.random_sample(shape) - .5) * 1024).astype(dtype))
            t = array_ops.identity(np_ans)
          other_devices = devices[:sender] + devices[sender + 1:]
          send_op, received_tensors = nccl.broadcast(t, other_devices)

          # Verify shape inference.
          for r in received_tensors:
            self.assertEqual(shape, r.get_shape())

          # Run and verify results.
          nccl_results = sess.run(received_tensors + [send_op])
          for r in nccl_results[:-1]:
            self.assertAllClose(r, np_ans)
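
For quick reference, a stripped-down sketch of the broadcast pattern the two tests above exercise (hedged: assumes TF 1.x with tf.contrib.nccl, the two-output broadcast variant shown here, and at least one visible GPU; the constant and device strings are placeholders).

import tensorflow as tf
from tensorflow.contrib import nccl

with tf.device('/device:GPU:0'):
    src = tf.constant([[1.0, 2.0], [3.0, 4.0]])
# Returns the send op plus one received tensor per destination device.
send_op, received = nccl.broadcast(src, ['/device:GPU:0'])

with tf.Session() as sess:
    # The send op must run in the same call as the receives, or NCCL stalls.
    outputs = sess.run(received + [send_op])
    print(outputs[0])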
Example #5
def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
    """Construct a subgraph for NCCL hybrid all-reduce.

  Args:
    input_tensors: list of T @{tf.Tensor} of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator.
    upper_level_f: function for reducing one value per worker, across
      workers.

  Returns:
    list of T @{tf.Tensor} of reduced values.

  Raises:
    ValueError: inputs not well-formed.
  """
    input_tensors, shape = _flatten_tensors(input_tensors)
    devices = [t.device for t in input_tensors]
    per_worker_devices, per_worker_values = _split_by_task(
        devices, input_tensors)
    num_workers = len(per_worker_devices)
    up_values = [None for w in range(0, num_workers)]
    up_devices = up_values[:]
    down_values = up_values[:]
    # First stage: reduce within each worker using NCCL
    for w in range(0, num_workers):
        worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
        # NOTE: these reductions will not run to completion unless
        # every output value is used.  Since we only need one, we
        # need to put control dependencies on the rest.
        with ops.control_dependencies(worker_values):
            with ops.device(worker_values[0].device):
                up_values[w] = array_ops.identity(worker_values[0])
            up_devices[w] = per_worker_devices[w][0]
    # Second stage: Apply upper_level_f to reduce across first device at
    # each worker
    level_2_output = upper_level_f(up_values)
    # Third stage: propagate within each worker using NCCL Broadcast
    for w in range(0, num_workers):
        dst_tensors = []
        with ops.device(per_worker_devices[w][0]):
            broadcast_src = nccl.broadcast(
                array_ops.identity(level_2_output[w]))
        for d in per_worker_devices[w]:
            with ops.device(d):
                dst_tensors.append(array_ops.identity(broadcast_src))
        down_values[w] = dst_tensors
    output_tensors = [v for sublist in down_values for v in sublist]
    if len(shape) != 1:
        output_tensors = _reshape_tensors(output_tensors, shape)
    return output_tensors
Example #6
def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
  """Construct a subgraph for NCCL hybrid all-reduce.

  Args:
    input_tensors: list of T @{tf.Tensor} of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator.
    upper_level_f: function for reducing one value per worker, across
      workers.

  Returns:
    list of T @{tf.Tensor} of reduced values.

  Raises:
    ValueError: inputs not well-formed.
  """
  input_tensors, shape = _flatten_tensors(input_tensors)
  devices = [t.device for t in input_tensors]
  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
  num_workers = len(per_worker_devices)
  up_values = [None for w in range(0, num_workers)]
  up_devices = up_values[:]
  down_values = up_values[:]
  # First stage: reduce within each worker using NCCL
  for w in range(0, num_workers):
    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
    # NOTE: these reductions will not run to completion unless
    # every output value is used.  Since we only need one, we
    # need to put control dependencies on the rest.
    with ops.control_dependencies(worker_values):
      with ops.device(worker_values[0].device):
        up_values[w] = array_ops.identity(worker_values[0])
      up_devices[w] = per_worker_devices[w][0]
  # Second stage: Apply upper_level_f to reduce across first device at
  # each worker
  level_2_output = upper_level_f(up_values)
  # Third stage: propagate within each worker using NCCL Broadcast
  for w in range(0, num_workers):
    dst_tensors = []
    with ops.device(per_worker_devices[w][0]):
      broadcast_src = nccl.broadcast(array_ops.identity(level_2_output[w]))
    for d in per_worker_devices[w]:
      with ops.device(d):
        dst_tensors.append(array_ops.identity(broadcast_src))
    down_values[w] = dst_tensors
  output_tensors = [v for sublist in down_values for v in sublist]
  if len(shape) != 1:
    output_tensors = _reshape_tensors(output_tensors, shape)
  return output_tensors
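
The upper_level_f argument is only described in the docstring above; the following is an illustrative stand-in (an assumption, not code from the source) that satisfies the stated contract of taking one value per worker and returning one reduced value per worker. Real callers would pass a cross-worker all-reduce builder such as a ring or shuffle reduce.

import tensorflow as tf

def naive_upper_level_sum(per_worker_tensors):
    # Reduce everything on the first worker's device, then copy the result
    # back to each worker's device so the NCCL broadcast stage stays local.
    with tf.device(per_worker_tensors[0].device):
        total = tf.add_n(per_worker_tensors)
    outputs = []
    for t in per_worker_tensors:
        with tf.device(t.device):
            outputs.append(tf.identity(total))
    return outputs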
Example #7
    def testCombined(self):
        if not test.is_gpu_available():
            return  # Test requires access to a GPU

        for dtype in [np.float32, np.int32, np.int64, np.float64]:
            # Create session inside outer loop to test use of
            # same communicator across multiple sessions.
            with self.test_session(use_gpu=True) as sess:
                for devices in [[
                        '/device:GPU:0', '/device:GPU:0', '/device:GPU:0'
                ], ['/device:GPU:0', '/device:GPU:0']]:
                    shape = (3, 4)

                    # all-reduce
                    np_ans = np.zeros(shape=shape, dtype=dtype)
                    tensors = []
                    for d in devices:
                        with ops.device(d):
                            t = ((np.random.random_sample(shape) - .5) *
                                 1024).astype(dtype)
                            np_ans += t
                            tensors.append(array_ops.identity(t))
                    all_reduce_tensors = nccl.all_sum(tensors)

                    sender = np.random.randint(0, len(devices) - 1)
                    other_devices = devices[:sender] + devices[sender + 1:]
                    send_op, received_tensors = nccl.broadcast(
                        all_reduce_tensors[sender], other_devices)

                    # sender doesn't need to be fetched as part of outputs of session.run.
                    del all_reduce_tensors[sender]

                    # Verify shape inference.
                    for r in received_tensors:
                        self.assertEqual(shape, r.get_shape())

                    # Run and verify results.
                    nccl_results = sess.run(received_tensors + [send_op] +
                                            all_reduce_tensors)
                    for r in nccl_results[:len(received_tensors)]:
                        self.assertAllClose(r, np_ans)
Example #8
  def testCombined(self):
    if not test.is_gpu_available():
      return  # Test requires access to a GPU

    for dtype in [np.float32, np.int32, np.int64, np.float64]:
      # Create session inside outer loop to test use of
      # same communicator across multiple sessions.
      with self.test_session(use_gpu=True) as sess:
        for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
          shape = (3, 4)

          # all-reduce
          np_ans = np.zeros(shape=shape, dtype=dtype)
          tensors = []
          for d in devices:
            with ops.device(d):
              t = ((np.random.random_sample(shape) - .5) * 1024).astype(dtype)
              np_ans += t
              tensors.append(array_ops.identity(t))
          all_reduce_tensors = nccl.all_sum(tensors)

          sender = np.random.randint(0, len(devices) - 1)
          other_devices = devices[:sender] + devices[sender + 1:]
          send_op, received_tensors = nccl.broadcast(all_reduce_tensors[sender],
                                                     other_devices)

          # sender doesn't need to be fetched as part of outputs of session.run.
          del all_reduce_tensors[sender]

          # Verify shape inference.
          for r in received_tensors:
            self.assertEqual(shape, r.get_shape())

          # Run and verify results.
          nccl_results = sess.run(
              received_tensors + [send_op] + all_reduce_tensors)
          for r in nccl_results[:len(received_tensors)]:
            self.assertAllClose(r, np_ans)
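
A bare-bones all_sum sketch matching the combined test above (hedged: assumes TF 1.x tf.contrib.nccl and one visible GPU; the devices and values are placeholders). As the hybrid all-reduce example earlier notes, every NCCL output must be fetched or depended on for the reduction to complete.

import tensorflow as tf
from tensorflow.contrib import nccl

tensors = []
for d in ['/device:GPU:0', '/device:GPU:0']:
    with tf.device(d):
        tensors.append(tf.constant([1.0, 2.0]))
summed = nccl.all_sum(tensors)  # one output tensor per input device

with tf.Session() as sess:
    print(sess.run(summed))  # fetch all outputs so NCCL can complete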
Example #9
def _NcclBroadcast(tensors, devices):
  sender = np.random.randint(0, len(devices))
  with ops.device(devices[sender]):
    tensor = array_ops.identity(tensors[0])
    broadcast = nccl.broadcast(tensor)
  return _DeviceTensors([broadcast] * len(devices), devices)
Example #10
def _NcclBroadcast(tensors, devices):
    sender = np.random.randint(0, len(devices))
    d_tensor = _DeviceTensors(tensors[0:1], devices[sender:sender + 1])[0]
    other_devices = devices[:sender] + devices[sender + 1:]
    send_op, received_tensors = nccl.broadcast(d_tensor, other_devices)
    return received_tensors, [send_op]
Example #11
def _NcclBroadcast(tensors, devices):
    sender = np.random.randint(0, len(devices))
    with ops.device(devices[sender]):
        tensor = array_ops.identity(tensors[0])
        broadcast = nccl.broadcast(tensor)
    return _DeviceTensors([broadcast] * len(devices), devices)
Example #12
def _NcclBroadcast(tensors, devices):
  sender = np.random.randint(0, len(devices))
  d_tensor = _DeviceTensors(tensors[0:1], devices[sender:sender + 1])[0]
  other_devices = devices[:sender] + devices[sender + 1:]
  send_op, received_tensors = nccl.broadcast(d_tensor, other_devices)
  return received_tensors, [send_op]
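
The _NcclBroadcast variants call a _DeviceTensors helper that is not shown on this page; a plausible minimal reconstruction, consistent with how it is used above (and offered only as an assumption), pins an identity of each tensor on its corresponding device.

from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops

def _DeviceTensors(tensors, devices):
    result = []
    for t, d in zip(tensors, devices):
        with ops.device(d):
            result.append(array_ops.identity(t))
    return result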