def group_hierarchical_nccl_all_reduce(ts):
    """Build hierarchical NCCL all-reduce operators (a reduce phase followed by
    a broadcast phase) for the given tensor list.

    ``ts`` may contain ``None`` entries; only non-None tensors contribute op
    names to the scheduler. Returns the result of mapping the scheduled
    all-reduce over ``enumerate(ts)`` via ``map_maybe``.
    """
    present_names = [tensor.name for tensor in ts if tensor is not None]

    def _reduce_name(name):
        return 'reduce_' + name

    def _bcast_name(name):
        return 'bcast_' + name

    # Per-tensor op names, indexed by the tensor's position in ``ts``.
    reduce_names = map_maybe(lambda tensor: _reduce_name(tensor.name), ts)
    bcast_names = map_maybe(lambda tensor: _bcast_name(tensor.name), ts)

    def _all_reduce(indexed):
        idx, tensor = indexed
        return _scheduled_hierarchical_nccl_all_reduce(
            tensor, op_names=[reduce_names[idx], bcast_names[idx]])

    # The scheduler receives all op names in sorted order: reduces first,
    # then broadcasts.
    ordered = sorted(present_names)
    scheduled_ops = ([_reduce_name(name) for name in ordered] +
                     [_bcast_name(name) for name in ordered])
    with tf.control_dependencies([
            _start_nccl_scheduler(scheduled_ops, scope='local'),
    ]):
        return map_maybe(_all_reduce, enumerate(ts))
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    """Sum gradients across workers, average them, and apply the result.

    Args:
        apply_grads_func: callable that applies the averaged
            ``(grad, var)`` pairs (e.g. the wrapped optimizer's
            ``apply_gradients``).
        grads_and_vars: iterable of ``(gradient, variable)`` pairs.
        **kwargs: forwarded to ``apply_grads_func``.

    Returns:
        The op returned by ``apply_grads_func``.
    """
    gradients, variables = list(zip(*grads_and_vars))
    if self._nccl:
        # FIXME: KungFu schedules NCCL operations in the order of the given
        # gradients. This order is sub-optimal compared to the topological
        # sorting order of the dataflow graph. We work around this by fusing
        # all gradients; we still need to figure out how to obtain the
        # optimal topological order from TensorFlow.
        if self._nccl_fusion:
            fused_grad = fuse(gradients)
            summed_fused_gradients = self._group_all_reduce_fn(
                [fused_grad])
            summed_gradients = defuse(summed_fused_gradients[0],
                                      [g.shape for g in gradients])
        else:
            summed_gradients = self._group_all_reduce_fn(gradients)
    else:
        if self._monitor:
            summed_gradients = map_maybe(
                lambda g: monitored_all_reduce(g, []), gradients)
        else:
            summed_gradients = self._group_all_reduce_fn(gradients)
    # Renamed from ``np`` to avoid shadowing the conventional numpy alias.
    num_workers = tf.cast(self._num_workers, tf.float32)
    reduced_grads = map_maybe(lambda g: g / num_workers, summed_gradients)
    # Re-zip gradients and variables: grads_and_vars can only be unzipped once.
    reduced_grads_and_vars = zip(reduced_grads, variables)
    return apply_grads_func(reduced_grads_and_vars, **kwargs)
def group_nccl_all_reduce(ts):
    """Create a list of all_reduce operators for given tensor list, using NCCL."""
    present = [t.name for t in ts if t is not None]
    if len(present) == 1:
        # Exactly one of ts is not None: no scheduler is needed.
        return map_maybe(_nccl_all_reduce, ts)
    # Start the global NCCL scheduler over the sorted tensor names before
    # emitting the scheduled all-reduce ops.
    with tf.control_dependencies([
            _start_nccl_scheduler(sorted(present), scope='global'),
    ]):
        return map_maybe(_scheduled_nccl_all_reduce, ts)
def group_nccl_all_reduce(ts):
    """Create a list of all_reduce operators for given tensor list, using NCCL."""
    present = [t.name for t in ts if t is not None]
    if len(present) == 1:
        # Exactly one of ts is not None: no scheduler is needed.
        return map_maybe(_nccl_all_reduce, ts)
    print("WARNING: Please fuse %d tensors before using NCCL." % len(present))
    ordered = sorted(present)  # FIXME: use topsort
    import tensorflow as tf
    with tf.control_dependencies([
            _start_nccl_scheduler(ordered),
    ]):
        return map_maybe(_scheduled_nccl_all_reduce, ts)
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    """Sum gradients across workers, average them, and apply the result.

    Also toggles the reshape strategy (1 when enabled, 0 otherwise) before
    the gradient exchange.

    Args:
        apply_grads_func: callable that applies the averaged
            ``(grad, var)`` pairs.
        grads_and_vars: iterable of ``(gradient, variable)`` pairs.
        **kwargs: forwarded to ``apply_grads_func``.

    Returns:
        The op returned by ``apply_grads_func``.
    """
    gradients, variables = list(zip(*grads_and_vars))
    # Signal the reshape strategy: 1 = enabled, 0 = disabled.
    reshape_strategy(1 if self._reshape_strategy else 0)
    if self._nccl:
        # FIXME: KungFu schedules NCCL operations in the order of the given
        # gradients. This order is sub-optimal compared to the topological
        # sorting order of the dataflow graph. We work around this by fusing
        # all gradients; we still need to figure out how to obtain the
        # optimal topological order from TensorFlow.
        if self._nccl_fusion:
            fused_grad = fuse(gradients)
            summed_fused_gradients = group_nccl_all_reduce([fused_grad])
            summed_gradients = defuse(summed_fused_gradients[0],
                                      [g.shape for g in gradients])
        else:
            summed_gradients = group_nccl_all_reduce(gradients)
    else:
        summed_gradients = group_all_reduce(gradients)
    reduced_grads = map_maybe(lambda g: g / self._num_workers,
                              summed_gradients)
    # Re-zip gradients and variables: grads_and_vars can only be unzipped once.
    reduced_grads_and_vars = zip(reduced_grads, variables)
    return apply_grads_func(reduced_grads_and_vars, **kwargs)
def _ssgd(self, apply_grads_func, gradients, variables, **kwargs):
    """Synchronous SGD step: all-reduce gradients, average, then apply."""
    total_grads = group_all_reduce(gradients)
    averaged_grads = map_maybe(lambda grad: grad / self._num_workers,
                               total_grads)
    # Re-zip gradients and variables: grads_and_vars can only be unzipped once.
    return apply_grads_func(zip(averaged_grads, variables), **kwargs)
def _monitor(self, grads, reduced_grads):
    """Print the summed norm of the cross-worker gradient variance.

    Per gradient, the variance is estimated as E[g^2] - E[g]^2 where the
    expectation is the all-reduce average over workers; ``reduced_grads``
    is assumed to already hold the averaged gradients.
    """
    summed_square = group_all_reduce([tf.square(g) for g in grads])
    mean_square = map_maybe(lambda g: g / self._num_workers, summed_square)
    variance_norms = [
        tf.norm(sq - tf.square(grad))
        for sq, grad in zip(mean_square, reduced_grads)
    ]
    total_variance = tf.reduce_sum(variance_norms)
    return tf.print('Variance:', total_variance)
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    """Average gradients across workers and apply them, periodically
    running the variance monitor (every ``self._interval`` steps)."""
    grads, variables = list(zip(*grads_and_vars))
    # Synchronization logic
    summed = group_all_reduce(grads)
    reduced_grads = map_maybe(lambda g: g / self._num_workers, summed)
    # Monitoring logic: only fires when the step is a multiple of interval.
    is_monitor_step = tf.equal(tf.mod(self._step, self._interval), 0)
    monitor_grads_op = tf.cond(is_monitor_step,
                               lambda: self._monitor(grads, reduced_grads),
                               lambda: tf.no_op())
    with tf.control_dependencies([monitor_grads_op]):
        return apply_grads_func(zip(reduced_grads, variables), **kwargs)
def apply_gradients(self, grads_and_vars, **kwargs):
    """All-reduce and average gradients, then delegate to the wrapped
    optimizer's ``apply_gradients``."""
    gradients, variables = list(zip(*grads_and_vars))
    if not self._nccl:
        summed_gradients = group_all_reduce(gradients)
    elif self._nccl_fusion:
        # Fuse all gradients into one tensor so a single NCCL op suffices.
        summed_fused = group_nccl_all_reduce([fuse(gradients)])
        summed_gradients = defuse(summed_fused[0],
                                  [g.shape for g in gradients])
    else:
        summed_gradients = group_nccl_all_reduce(gradients)
    averaged_grads = map_maybe(lambda g: g / self._num_workers,
                               summed_gradients)
    # Re-zip gradients and variables: grads_and_vars can only be unzipped once.
    return self._optimizer.apply_gradients(zip(averaged_grads, variables),
                                           **kwargs)
def group_all_reduce(ts):
    """Create a list of all_reduce operators for given tensor list."""
    # map_maybe presumably preserves None entries in ts and applies
    # all_reduce to the rest — TODO confirm against map_maybe's definition.
    return map_maybe(all_reduce, ts)