def all_sync_params(tower_params, devices, usenccl=True):
  """Assigns the params from the first tower to all others"""
  if len(devices) == 1:
    return tf.no_op()
  sync_ops = []
  if have_nccl and usenccl:
    for param_on_devices in zip(*tower_params):
      # print('PARAM_ON_DEVICES: {}'.format(param_on_devices))  # DEBUG
      # Note: param_on_devices is [paramX_gpu0, paramX_gpu1, ...]
      param0 = param_on_devices[0]
      send_op, received_tensors = nccl.broadcast(param0, devices[1:])
      sync_ops.append(send_op)
      for device, param, received in zip(devices[1:],
                                         param_on_devices[1:],
                                         received_tensors):
        with tf.device(device):
          sync_op = param.assign(received)
          sync_ops.append(sync_op)
  else:
    params0 = tower_params[0]
    for device, params in zip(devices, tower_params):
      with tf.device(device):
        for param, param0 in zip(params, params0):
          sync_op = param.assign(param0.read_value())
          sync_ops.append(sync_op)
  return tf.group(*sync_ops)
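A minimal sketch of how `all_sync_params` might be invoked in a two-tower setup. Everything below (the device list, the per-tower variable construction, and the session boilerplate) is an assumption for illustration using the TF 1.x graph/session API implied by the snippet; it only shows that the helper expects one parameter list per device and returns a single op to run once after initialization.

import tensorflow as tf

devices = ['/gpu:0', '/gpu:1']  # hypothetical device list
tower_params = []
for i, device in enumerate(devices):
  with tf.device(device), tf.variable_scope('tower_%d' % i):
    # Stand-in for a real model: a single weight matrix per tower.
    w = tf.get_variable('w', shape=[128, 10])
    tower_params.append([w])

# usenccl=False exercises the plain assign path of the helper above.
sync_op = all_sync_params(tower_params, devices, usenccl=False)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(sync_op)  # copy tower 0's parameters onto tower 1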
def testBroadcast(self):
  if not test.is_gpu_available():
    return  # Test requires access to a GPU

  for dtype in [np.float32, np.int32, np.int64, np.float64]:
    # Create session inside outer loop to test use of
    # same communicator across multiple sessions.
    with self.test_session(use_gpu=True) as sess:
      for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'],
                      ['/device:GPU:0', '/device:GPU:0']]:
        shape = (3, 4)
        sender = np.random.randint(0, len(devices) - 1)
        with ops.device(devices[sender]):
          np_ans = ((np.random.random_sample(shape) - .5) * 1024).astype(dtype)
          t = array_ops.identity(np_ans)
        other_devices = devices[:sender] + devices[sender + 1:]
        send_op, received_tensors = nccl.broadcast(t, other_devices)

        # Verify shape inference.
        for r in received_tensors:
          self.assertEqual(shape, r.get_shape())

        # Run and verify results.
        nccl_results = sess.run(received_tensors + [send_op])
        for r in nccl_results[:-1]:
          self.assertAllClose(r, np_ans)
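A minimal sketch of the same broadcast call outside the test harness. The import path and device names are assumptions (the tests above only reference a module named `nccl`), and the two-result `nccl.broadcast(tensor, devices)` signature is the one exercised by the test; the key point is that the send op is fetched together with the received tensors, exactly as the test's `sess.run(received_tensors + [send_op])` does.

import numpy as np
import tensorflow as tf
from tensorflow.contrib import nccl  # assumed import path for the nccl module

devices = ['/device:GPU:0', '/device:GPU:1']  # hypothetical two-GPU box
with tf.device(devices[0]):
  src = tf.identity(np.ones((3, 4), dtype=np.float32))

send_op, received = nccl.broadcast(src, devices[1:])

with tf.Session() as sess:
  # The broadcast only completes once send_op runs on the sender,
  # so the receives and the send op are fetched in the same call.
  outputs = sess.run(list(received) + [send_op])
  print(outputs[0])  # the broadcast copy on /device:GPU:1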
def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
  """Construct a subgraph for NCCL hybrid all-reduce.

  Args:
    input_tensors: list of T @{tf.Tensor} of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator.
    upper_level_f: function for reducing one value per worker, across
      workers.

  Returns:
    list of T @{tf.Tensor} of reduced values.

  Raises:
    ValueError: inputs not well-formed.
  """
  input_tensors, shape = _flatten_tensors(input_tensors)
  devices = [t.device for t in input_tensors]
  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
  num_workers = len(per_worker_devices)
  up_values = [None for w in range(0, num_workers)]
  up_devices = up_values[:]
  down_values = up_values[:]
  # First stage: reduce within each worker using NCCL
  for w in range(0, num_workers):
    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
    # NOTE: these reductions will not run to completion unless
    # every output value is used.  Since we only need one, we
    # need to put control dependencies on the rest.
    with ops.control_dependencies(worker_values):
      with ops.device(worker_values[0].device):
        up_values[w] = array_ops.identity(worker_values[0])
      up_devices[w] = per_worker_devices[w][0]
  # Second stage: Apply upper_level_f to reduce across first device at
  # each worker
  level_2_output = upper_level_f(up_values)
  # Third stage: propagate within each worker using NCCL Broadcast
  for w in range(0, num_workers):
    dst_tensors = []
    with ops.device(per_worker_devices[w][0]):
      broadcast_src = nccl.broadcast(array_ops.identity(level_2_output[w]))
    for d in per_worker_devices[w]:
      with ops.device(d):
        dst_tensors.append(array_ops.identity(broadcast_src))
    down_values[w] = dst_tensors
  output_tensors = [v for sublist in down_values for v in sublist]
  if len(shape) != 1:
    output_tensors = _reshape_tensors(output_tensors, shape)
  return output_tensors
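A sketch of what the `upper_level_f` argument is expected to look like, based only on the docstring above ("function for reducing one value per worker, across workers"): it receives one already-reduced tensor per worker and must return one tensor per worker holding the cross-worker reduction. The naive version below gathers everything onto the first tensor's device and sums it there; the names and placement strategy are illustrative assumptions, not the reduction actually used with this function.

import tensorflow as tf

def naive_upper_level_sum(per_worker_tensors):
  """Hypothetical upper_level_f: reduce one tensor per worker with a plain add.

  Takes [t_worker0, t_worker1, ...] and returns a list of the same length in
  which every entry holds the sum over workers, computed on the first
  tensor's device and then copied back onto each worker's device.
  """
  with tf.device(per_worker_tensors[0].device):
    total = tf.add_n(per_worker_tensors)
  outputs = []
  for t in per_worker_tensors:
    with tf.device(t.device):
      # identity forces a copy back onto each worker's device
      outputs.append(tf.identity(total))
  return outputs

# e.g. _build_nccl_hybrid(input_tensors, tf.add, naive_upper_level_sum)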
def testCombined(self):
  if not test.is_gpu_available():
    return  # Test requires access to a GPU

  for dtype in [np.float32, np.int32, np.int64, np.float64]:
    # Create session inside outer loop to test use of
    # same communicator across multiple sessions.
    with self.test_session(use_gpu=True) as sess:
      for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'],
                      ['/device:GPU:0', '/device:GPU:0']]:
        shape = (3, 4)

        # all-reduce
        np_ans = np.zeros(shape=shape, dtype=dtype)
        tensors = []
        for d in devices:
          with ops.device(d):
            t = ((np.random.random_sample(shape) - .5) * 1024).astype(dtype)
            np_ans += t
            tensors.append(array_ops.identity(t))
        all_reduce_tensors = nccl.all_sum(tensors)

        sender = np.random.randint(0, len(devices) - 1)
        other_devices = devices[:sender] + devices[sender + 1:]
        send_op, received_tensors = nccl.broadcast(all_reduce_tensors[sender],
                                                   other_devices)

        # sender doesn't need to be fetched as part of outputs of session.run.
        del all_reduce_tensors[sender]

        # Verify shape inference.
        for r in received_tensors:
          self.assertEqual(shape, r.get_shape())

        # Run and verify results.
        nccl_results = sess.run(
            received_tensors + [send_op] + all_reduce_tensors)
        for r in nccl_results[:len(received_tensors)]:
          self.assertAllClose(r, np_ans)
def _NcclBroadcast(tensors, devices):
  # This variant uses the single-argument form of nccl.broadcast, which
  # returns one tensor that is then mirrored onto every device.
  sender = np.random.randint(0, len(devices))
  with ops.device(devices[sender]):
    tensor = array_ops.identity(tensors[0])
    broadcast = nccl.broadcast(tensor)
  return _DeviceTensors([broadcast] * len(devices), devices)
def _NcclBroadcast(tensors, devices):
  # This variant uses the two-result form, nccl.broadcast(tensor, devices),
  # and returns the received tensors plus the send op for the caller to run.
  sender = np.random.randint(0, len(devices))
  d_tensor = _DeviceTensors(tensors[0:1], devices[sender:sender + 1])[0]
  other_devices = devices[:sender] + devices[sender + 1:]
  send_op, received_tensors = nccl.broadcast(d_tensor, other_devices)
  return received_tensors, [send_op]