def __init__(self,
             worker_index,
             model_creator,
             all_reduce_alg="simple",
             num_devices=1,
             gpu=False,
             max_bytes=10000000,
             plasma_op=False):
    self.worker_index = worker_index
    assert num_devices > 0

    # TODO(ekl) support custom session
    tf_session_args = {
        "device_count": {
            "CPU": num_devices
        },
        "log_device_placement": False,
        "gpu_options": tf.GPUOptions(force_gpu_compatible=True),
        "inter_op_parallelism_threads": 128,
    }
    config_proto = tf.ConfigProto(**tf_session_args)
    self.sess = tf.Session(config=config_proto)
    self.models = []
    grad_ops = []

    if gpu:
        device_tmpl = "/gpu:%d"
    else:
        device_tmpl = "/cpu:%d"

    with self.sess.as_default():
        for device_idx in range(num_devices):
            device = device_tmpl % device_idx
            with tf.device(device):
                with tf.variable_scope("device_%d" % device_idx):
                    model = model_creator(worker_index, device_idx)
                    self.models.append(model)
                    grads = [
                        t for t in model.optimizer.compute_gradients(
                            model.loss) if t[0] is not None
                    ]
                    grad_ops.append(grads)

        if num_devices == 1:
            if max_bytes:
                raise ValueError(
                    "Implementation limitation: grad_shard_bytes > 0 "
                    "({}) currently requires > 1 device".format(max_bytes))
            self.packed_grads_and_vars = grad_ops
        else:
            if max_bytes:
                self.packed_grads_and_vars, packing_vals = (
                    sum_gradients_all_reduce(
                        "",
                        grad_ops,
                        1,
                        all_reduce_alg,
                        1,
                        list(range(num_devices)),
                        agg_small_grads_max_bytes=max_bytes))
            else:
                self.packed_grads_and_vars, _ = (sum_gradients_all_reduce(
                    "",
                    grad_ops,
                    1,
                    all_reduce_alg,
                    1,
                    list(range(num_devices)),
                    agg_small_grads_max_bytes=0))
        self.per_device_grads = [
            list(zip(*dev_gv))[0] for dev_gv in self.packed_grads_and_vars
        ]
        assert (len(self.per_device_grads) == num_devices)
        self.num_grads = num_grads = len(self.packed_grads_and_vars[0])
        if max_bytes:
            logger.info("Packed grads => {} tensors".format(num_grads))

        # Ops for reading grads with the right control deps
        nccl_noops = []
        for j in range(num_grads)[::-1]:
            deps = nccl_noops + [
                dev_grad[j] for dev_grad in self.per_device_grads
            ]
            with tf.control_dependencies(deps):
                nccl_noops = [tf.no_op()]

        # You must fetch this otherwise the NCCL allreduce will hang
        self.nccl_control_out = tf.group(*nccl_noops)

        if plasma_op:
            store_socket = (
                ray.worker.global_worker.plasma_client.store_socket_name)
            manager_socket = (
                ray.worker.global_worker.plasma_client.manager_socket_name)
            if not plasma.tf_plasma_op:
                plasma.build_plasma_tensorflow_op()

            # For fetching grads -> plasma
            self.plasma_in_grads = []
            self.plasma_in_grads_oids = [
                tf.placeholder(shape=[], dtype=tf.string, name="in_grad_oids")
                for _ in range(num_grads)
            ]
            for j in range(num_grads):
                grad = self.per_device_grads[0][j]
                with tf.device(self.models[0].loss.device):
                    plasma_grad = plasma.tf_plasma_op.tensor_to_plasma(
                        [grad],
                        self.plasma_in_grads_oids[j],
                        plasma_store_socket_name=store_socket,
                        plasma_manager_socket_name=manager_socket)
                self.plasma_in_grads.append(plasma_grad)

            # For applying grads <- plasma
            unpacked_gv = []
            self.plasma_out_grads_oids = [
                tf.placeholder(shape=[], dtype=tf.string, name="grad_out_oids")
                for _ in range(num_grads)
            ]
            packed_plasma_grads = []
            for j in range(num_grads):
                with tf.device(self.plasma_in_grads[j].device):
                    with tf.control_dependencies([self.plasma_in_grads[j]]):
                        grad_ph = plasma.tf_plasma_op.plasma_to_tensor(
                            self.plasma_out_grads_oids[j],
                            dtype=tf.float32,
                            plasma_store_socket_name=store_socket,
                            plasma_manager_socket_name=manager_socket)
                    grad_ph = tf.reshape(
                        grad_ph, self.packed_grads_and_vars[0][j][0].shape)
                    logger.debug("Packed tensor {}".format(grad_ph))
                packed_plasma_grads.append(grad_ph)
            for i in range(num_devices):
                per_device = []
                for j, (g, v) in enumerate(self.packed_grads_and_vars[i]):
                    grad_ph = packed_plasma_grads[j]
                    per_device.append((grad_ph, v))
                unpacked_gv.append(per_device)

            if max_bytes:
                unpacked_gv = unpack_small_tensors(unpacked_gv, packing_vals)

        elif max_bytes:
            unpacked_gv = unpack_small_tensors(self.packed_grads_and_vars,
                                               packing_vals)
        else:
            unpacked_gv = self.packed_grads_and_vars

        # Same shape as packed_grads_and_vars
        assert len(unpacked_gv) == num_devices
        assert len(unpacked_gv[0][0]) == 2

        apply_ops = []
        to_apply = unpacked_gv[0]
        for ix, m in enumerate(self.models):
            apply_ops.append(
                m.optimizer.apply_gradients([
                    (g, v)
                    for ((g, _), (_, v)) in zip(to_apply, unpacked_gv[ix])
                ]))
        self.apply_op = tf.group(*apply_ops)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        self.sess.run(init_op)
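
# ---------------------------------------------------------------------------
# Usage sketch (not from the original source): a hypothetical `model_creator`
# matching the interface the constructor above assumes, namely that each model
# exposes an `optimizer` attribute and a `loss` attribute, and that the
# creator is called as model_creator(worker_index, device_idx). The tiny
# classifier graph and all names below are invented for illustration only.
# ---------------------------------------------------------------------------
import tensorflow as tf


class ExampleModel(object):
    def __init__(self):
        # Toy MNIST-shaped classifier; any graph with a scalar loss works.
        self.inputs = tf.placeholder(tf.float32, [None, 784])
        self.labels = tf.placeholder(tf.int64, [None])
        logits = tf.layers.dense(self.inputs, 10)
        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=logits))
        self.optimizer = tf.train.GradientDescentOptimizer(1e-3)


def example_model_creator(worker_index, device_idx):
    # The indices could seed per-replica randomness; unused in this sketch.
    return ExampleModel()
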
def __init__(self,
             worker_index,
             model_creator,
             all_reduce_alg="simple",
             num_devices=1,
             gpu=False,
             max_bytes=10000000,
             plasma_op=False):
    self.worker_index = worker_index
    assert num_devices > 0

    # TODO(ekl) support custom session
    tf_session_args = {
        "device_count": {
            "CPU": num_devices
        },
        "log_device_placement": False,
        "gpu_options": tf.GPUOptions(force_gpu_compatible=True),
        "inter_op_parallelism_threads": 128,
    }
    config_proto = tf.ConfigProto(**tf_session_args)
    self.sess = tf.Session(config=config_proto)
    self.models = []
    grad_ops = []

    if gpu:
        device_tmpl = "/gpu:%d"
    else:
        device_tmpl = "/cpu:%d"

    with self.sess.as_default():
        for device_idx in range(num_devices):
            device = device_tmpl % device_idx
            with tf.device(device):
                with tf.variable_scope("device_%d" % device_idx):
                    model = model_creator(worker_index, device_idx)
                    self.models.append(model)
                    optimizer = model.get_optimizer()
                    loss = model.get_loss()
                    grads = [
                        t for t in optimizer.compute_gradients(loss)
                        if t[0] is not None
                    ]
                    grad_ops.append(grads)

        if num_devices == 1:
            if max_bytes:
                raise ValueError(
                    "Implementation limitation: grad_shard_bytes > 0 "
                    "({}) currently requires > 1 device".format(max_bytes))
            self.packed_grads_and_vars = grad_ops
        else:
            if max_bytes:
                self.packed_grads_and_vars, packing_vals = (
                    sum_gradients_all_reduce(
                        "",
                        grad_ops,
                        1,
                        all_reduce_alg,
                        1,
                        list(range(num_devices)),
                        agg_small_grads_max_bytes=max_bytes))
            else:
                self.packed_grads_and_vars, _ = (sum_gradients_all_reduce(
                    "",
                    grad_ops,
                    1,
                    all_reduce_alg,
                    1,
                    list(range(num_devices)),
                    agg_small_grads_max_bytes=0))
        self.per_device_grads = [
            list(zip(*dev_gv))[0] for dev_gv in self.packed_grads_and_vars
        ]
        assert (len(self.per_device_grads) == num_devices)
        self.num_grads = num_grads = len(self.packed_grads_and_vars[0])
        if max_bytes:
            logger.info("Packed grads => {} tensors".format(num_grads))

        # Ops for reading grads with the right control deps
        nccl_noops = []
        for j in range(num_grads)[::-1]:
            deps = nccl_noops + [
                dev_grad[j] for dev_grad in self.per_device_grads
            ]
            with tf.control_dependencies(deps):
                nccl_noops = [tf.no_op()]

        # You must fetch this otherwise the NCCL allreduce will hang
        self.nccl_control_out = tf.group(*nccl_noops)

        if plasma_op:
            store_socket = (
                ray.worker.global_worker.plasma_client.store_socket_name)
            ensure_plasma_tensorflow_op()

            # For fetching grads -> plasma
            self.plasma_in_grads = []
            self.plasma_in_grads_oids = [
                tf.placeholder(shape=[], dtype=tf.string, name="in_grad_oids")
                for _ in range(num_grads)
            ]
            for j in range(num_grads):
                grad = self.per_device_grads[0][j]
                with tf.device(self.models[0].get_loss().device):
                    plasma_grad = plasma.tf_plasma_op.tensor_to_plasma(
                        [grad],
                        self.plasma_in_grads_oids[j],
                        plasma_store_socket_name=store_socket)
                self.plasma_in_grads.append(plasma_grad)

            # For applying grads <- plasma
            unpacked_gv = []
            self.plasma_out_grads_oids = [
                tf.placeholder(
                    shape=[], dtype=tf.string, name="grad_out_oids")
                for _ in range(num_grads)
            ]
            packed_plasma_grads = []
            for j in range(num_grads):
                with tf.device(self.plasma_in_grads[j].device):
                    with tf.control_dependencies([self.plasma_in_grads[j]]):
                        grad_ph = plasma.tf_plasma_op.plasma_to_tensor(
                            self.plasma_out_grads_oids[j],
                            dtype=tf.float32,
                            plasma_store_socket_name=store_socket)
                    grad_ph = tf.reshape(
                        grad_ph, self.packed_grads_and_vars[0][j][0].shape)
                    logger.debug("Packed tensor {}".format(grad_ph))
                packed_plasma_grads.append(grad_ph)
            for i in range(num_devices):
                per_device = []
                for j, (g, v) in enumerate(self.packed_grads_and_vars[i]):
                    grad_ph = packed_plasma_grads[j]
                    per_device.append((grad_ph, v))
                unpacked_gv.append(per_device)

            if max_bytes:
                unpacked_gv = unpack_small_tensors(unpacked_gv, packing_vals)

        elif max_bytes:
            unpacked_gv = unpack_small_tensors(self.packed_grads_and_vars,
                                               packing_vals)
        else:
            unpacked_gv = self.packed_grads_and_vars

        # Same shape as packed_grads_and_vars
        assert len(unpacked_gv) == num_devices
        assert len(unpacked_gv[0][0]) == 2

        apply_ops = []
        to_apply = unpacked_gv[0]
        for ix, m in enumerate(self.models):
            apply_ops.append(m.get_optimizer().apply_gradients([
                (g, v)
                for ((g, _), (_, v)) in zip(to_apply, unpacked_gv[ix])
            ]))
        self.apply_op = tf.group(*apply_ops)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        self.sess.run(init_op)
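
# ---------------------------------------------------------------------------
# Usage sketch (not from the original source): a hypothetical `model_creator`
# for this variant, which calls accessor methods get_optimizer() and
# get_loss() on the model instead of reading bare attributes. The tiny
# regression graph and all names below are invented for illustration only.
# ---------------------------------------------------------------------------
import tensorflow as tf


class ExampleModel(object):
    def __init__(self):
        # Toy linear-regression graph; any model with a scalar loss works.
        self.x = tf.placeholder(tf.float32, [None, 1])
        self.y = tf.placeholder(tf.float32, [None, 1])
        pred = tf.layers.dense(self.x, 1)
        self._loss = tf.reduce_mean(tf.square(pred - self.y))
        self._optimizer = tf.train.GradientDescentOptimizer(1e-3)

    def get_loss(self):
        return self._loss

    def get_optimizer(self):
        return self._optimizer


def example_model_creator(worker_index, device_idx):
    # The indices could seed per-replica randomness; unused in this sketch.
    return ExampleModel()
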