def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    """All-reduce the gradients, divide them by the worker count, and apply.

    Args:
        apply_grads_func: Callable invoked as
            ``apply_grads_func(grads_and_vars, **kwargs)`` with the reduced
            gradients; its return value is returned unchanged.
        grads_and_vars: Iterable of ``(gradient, variable)`` pairs.
        **kwargs: Forwarded verbatim to ``apply_grads_func``.

    Returns:
        Whatever ``apply_grads_func`` returns.
    """
    gradients, variables = list(zip(*grads_and_vars))

    if not self._nccl:
        summed_gradients = group_all_reduce(gradients)
    elif self._nccl_fusion:
        # FIXME: We have a limitation that KungFu schedules NCCL operations
        # in the order of the given gradients. This order is sub-optimal
        # compared to the topological sorting order of the dataflow. We work
        # around this issue by fusing all gradients. We need to figure out
        # how to get the optimal topological sorting order from TensorFlow.
        fused = fuse(gradients)
        reduced_fused = group_nccl_all_reduce([fused])
        summed_gradients = defuse(reduced_fused[0],
                                  [g.shape for g in gradients])
    else:
        summed_gradients = group_nccl_all_reduce(gradients)

    def _average(grad):
        return grad / self._num_workers

    reduced_grads = map_maybe(_average, summed_gradients)

    # grads_and_vars can only be unzipped once, so build a fresh pairing of
    # the reduced gradients with their variables.
    return apply_grads_func(zip(reduced_grads, variables), **kwargs)
def test_group_all_reduce():
    """Run group_all_reduce over ten inputs, some of which are None.

    Input ``i`` is an int32 vector of ones with length ``i % 5``; a length
    of zero is represented by ``None`` instead of a variable. Only the
    non-None outputs are evaluated in the session.
    """
    inputs = []
    for i in range(10):
        length = i % 5
        if length:
            inputs.append(tf.Variable(tf.ones([length], tf.int32)))
        else:
            inputs.append(None)

    outputs = group_all_reduce(inputs)
    fetches = [out for out in outputs if out is not None]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(fetches)
def _ssgd(self, apply_grads_func, gradients, variables, **kwargs):
    """All-reduce the gradients, divide by the worker count, and apply them.

    Args:
        apply_grads_func: Callable invoked as
            ``apply_grads_func(grads_and_vars, **kwargs)``.
        gradients: Gradients to pass through ``group_all_reduce``.
        variables: Variables paired positionally with ``gradients``.
        **kwargs: Forwarded verbatim to ``apply_grads_func``.

    Returns:
        Whatever ``apply_grads_func`` returns.
    """
    def _average(grad):
        return grad / self._num_workers

    reduced = map_maybe(_average, group_all_reduce(gradients))

    # Pair the averaged gradients with their variables again; the caller's
    # original grads_and_vars can only be unzipped once.
    return apply_grads_func(zip(reduced, variables), **kwargs)
def build_fake_train_op(use_nccl):
    """Build an all-reduce over a single (2, 5) variable of ones.

    Args:
        use_nccl: When truthy, use ``group_nccl_all_reduce``; otherwise use
            ``group_all_reduce``.

    Returns:
        The result of the selected group all-reduce function.
    """
    tensors = [tf.Variable(tf.ones((2, 5)))]

    # Each op is imported lazily so only the selected backend is loaded.
    if not use_nccl:
        from kungfu.tensorflow.ops import group_all_reduce
        return group_all_reduce(tensors)

    from kungfu.tensorflow.ops import group_nccl_all_reduce
    return group_nccl_all_reduce(tensors)
def gen_fake_train_op(sizes):
    """Build a grouped op that overwrites fake gradients with their all-reduce.

    Args:
        sizes: Iterable of vector lengths; one float32 variable of ones is
            created per entry.

    Returns:
        A ``tf.group`` of the assignments that write each all-reduced tensor
        back into its source variable.
    """
    fake_grads = [
        tf.Variable(tf.ones(shape=(size, ), dtype=tf.float32))
        for size in sizes
    ]
    reduced = group_all_reduce(fake_grads)
    assignments = [
        tf.assign(target, value) for target, value in zip(fake_grads, reduced)
    ]
    return tf.group(assignments)
def _sync_ma_sgd(self, grads_and_vars, **kwargs):
    """Average the variables across workers, then apply the local gradients.

    Each variable is passed through ``group_all_reduce``, divided by
    ``self._num_workers`` and assigned back; the wrapped optimizer's
    ``apply_gradients`` is built under a control dependency on those
    assignments.

    Args:
        grads_and_vars: Iterable of ``(gradient, variable)`` pairs. It may be
            a one-shot iterator such as a ``zip`` object.
        **kwargs: Forwarded verbatim to ``self._optimizer.apply_gradients``.

    Returns:
        Whatever ``self._optimizer.apply_gradients`` returns.
    """
    # grads_and_vars can only be unzipped once: if it is an iterator, the
    # unzip below exhausts it. Keep the gradients so the pairs can be rebuilt
    # for the wrapped optimizer instead of handing it the spent iterator.
    gradients, variables = list(zip(*grads_and_vars))

    sum_vars = group_all_reduce(variables)
    avg_vars = [v / self._num_workers for v in sum_vars]
    assign_ops = [
        tf.assign(v, avg_v) for v, avg_v in zip(variables, avg_vars)
    ]

    new_grads_and_vars = list(zip(gradients, variables))
    with tf.control_dependencies(assign_ops):
        return self._optimizer.apply_gradients(new_grads_and_vars, **kwargs)
def _monitor(self, grads, reduced_grads):
    """Build an op that prints the summed norm of the gradient variances.

    For each gradient the variance estimate is the worker-average of the
    squared gradient minus the square of the averaged gradient; the norms of
    these per-gradient tensors are summed and printed.

    Args:
        grads: Local gradients, positionally aligned with ``reduced_grads``.
            Entries may be ``None``.
        reduced_grads: Worker-averaged gradients. Entries may be ``None``.

    Returns:
        The ``tf.print`` op that emits ``'Variance:'`` and the summed value.
    """
    # A missing gradient is represented as None, and tf.square(None) would
    # raise while the graph is being built. Skip those pairs up front so only
    # real tensors are squared and all-reduced.
    pairs = [(g, r) for g, r in zip(grads, reduced_grads)
             if g is not None and r is not None]

    square_grads = [tf.square(g) for g, _ in pairs]
    summed_square_grads = group_all_reduce(square_grads)
    reduced_square_grads = map_maybe(lambda g: g / self._num_workers,
                                     summed_square_grads)

    grad_variances = [
        square_grad - tf.square(reduced)
        for square_grad, (_, reduced) in zip(reduced_square_grads, pairs)
    ]
    variances = [
        tf.norm(grad_variance) for grad_variance in grad_variances
    ]
    summed_variance = tf.reduce_sum(variances)
    return tf.print('Variance:', summed_variance)
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    """Average gradients across workers and periodically monitor them.

    The monitoring op from ``self._monitor`` is selected whenever
    ``self._step`` modulo ``self._interval`` equals zero; otherwise a no-op
    is used. The gradients are applied under a control dependency on the
    selected op.

    Args:
        apply_grads_func: Callable invoked as
            ``apply_grads_func(grads_and_vars, **kwargs)``.
        grads_and_vars: Iterable of ``(gradient, variable)`` pairs.
        **kwargs: Forwarded verbatim to ``apply_grads_func``.

    Returns:
        Whatever ``apply_grads_func`` returns.
    """
    grads, variables = list(zip(*grads_and_vars))

    # Synchronization: all-reduce, then divide by the number of workers.
    def _average(grad):
        return grad / self._num_workers

    reduced_grads = map_maybe(_average, group_all_reduce(grads))

    # Monitoring: only on steps that are a multiple of the interval.
    def _run_monitor():
        return self._monitor(grads, reduced_grads)

    def _skip_monitor():
        return tf.no_op()

    is_monitor_step = tf.equal(tf.mod(self._step, self._interval), 0)
    monitor_grads_op = tf.cond(is_monitor_step, _run_monitor, _skip_monitor)

    with tf.control_dependencies([monitor_grads_op]):
        return apply_grads_func(zip(reduced_grads, variables), **kwargs)
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    """Average the model variables across workers, then apply local gradients.

    Args:
        apply_grads_func: Callable invoked as
            ``apply_grads_func(grads_and_vars, **kwargs)``.
        grads_and_vars: Iterable of ``(gradient, variable)`` pairs.
        **kwargs: Forwarded verbatim to ``apply_grads_func``.

    Returns:
        Whatever ``apply_grads_func`` returns.
    """
    # It is important to apply model averaging every iteration [2]
    gradients, variables = list(zip(*grads_and_vars))

    summed = group_all_reduce(variables)
    averaged = [total / self._num_workers for total in summed]

    # TODO: Apply momentum to the averaged model [2]
    averaging_ops = []
    for variable, mean_value in zip(variables, averaged):
        averaging_ops.append(_tf_assign(variable, mean_value))

    # grads_and_vars can only be unzipped once, so pair the gradients with
    # their variables again before handing them on.
    local_grads_and_vars = zip(gradients, variables)

    # We can overlap model averaging and local SGD [2].
    with tf.control_dependencies(averaging_ops):
        return apply_grads_func(local_grads_and_vars, **kwargs)
def _monitor(self, grads, reduced_grads):
    """Build an op that prints the summed norm of the gradient variances.

    Side effects: stores the per-gradient norms in ``self._variances`` and
    their sum in ``self._summed_variance``.

    Args:
        grads: Local gradients, positionally aligned with ``reduced_grads``.
        reduced_grads: Worker-averaged gradients.

    Returns:
        A ``tf.no_op`` that depends on the print op.
    """
    squared = [tf.square(g) for g in grads]
    summed_squared = group_all_reduce(squared)

    mean_squared = []
    for total in summed_squared:
        mean_squared.append(total / self._num_workers)

    # Variance estimate per gradient: mean of squares minus square of mean.
    variance_tensors = []
    for mean_sq, mean in zip(mean_squared, reduced_grads):
        variance_tensors.append(mean_sq - tf.square(mean))

    norms = []
    for variance in variance_tensors:
        norms.append(tf.norm(variance))
    self._variances = norms
    self._summed_variance = tf.reduce_sum(self._variances)

    print_op = tf.print('Sum of gradient variance:', self._summed_variance)
    with tf.control_dependencies([print_op]):
        return tf.no_op()
def all_reduce_benchmark(sizes, dtype=tf.float32, warmup_steps=5,
                         bench_steps=10):
    """Time repeated runs of group_all_reduce over vectors of ones.

    Prints the total size and peer count once, then one line per timed step
    with the elapsed wall-clock time and the equivalent data rate.

    Args:
        sizes: Iterable of vector lengths; one variable of ones is created
            per entry.
        dtype: Element type of the variables.
        warmup_steps: Number of untimed runs executed before measuring.
        bench_steps: Number of timed runs.
    """
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    cluster_size = current_cluster_size()
    # Scale factor applied to the total size when reporting the data rate.
    multiplier = 4 * (cluster_size - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), cluster_size))

    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        for _ in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))