def test_group_all_reduce():
    sizes = [i % 5 for i in range(10)]
    xs = [tf.Variable(tf.ones([n], tf.int32)) if n else None for n in sizes]
    ys = group_all_reduce(xs)
    op = [y for y in ys if y is not None]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(op)
def gen_fake_train_op(sizes):
    grads = []
    for size in sizes:
        grads.append(tf.Variable(tf.ones(shape=(size, ), dtype=tf.float32)))
    new_grads = group_all_reduce(grads)
    ops = []
    for g, new_g in zip(grads, new_grads):
        ops.append(tf.assign(g, new_g))
    return tf.group(ops)
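The fake training op above can be exercised on its own. The following is a minimal usage sketch, assuming `gen_fake_train_op` and its `group_all_reduce` dependency are already in scope; the buffer sizes and step count are illustrative only:

# Illustrative usage of gen_fake_train_op (sizes and step count are arbitrary).
fake_train_op = gen_fake_train_op([1 << 10, 1 << 20])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(3):
        sess.run(fake_train_op)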
def _sync_ma_sgd(self, grads_and_vars, **kwargs):
    _, variables = list(zip(*grads_and_vars))
    sum_vars = group_all_reduce(variables)
    avg_vars = [g / self._num_workers for g in sum_vars]
    assign_ops = [
        tf.assign(v, avg_v) for v, avg_v in zip(variables, avg_vars)
    ]
    with tf.control_dependencies(assign_ops):
        return self._optimizer.apply_gradients(grads_and_vars, **kwargs)
def apply_gradients(self, grads_and_vars, **kwargs):
    # It is important to apply model averaging every iteration [2]
    _, variables = list(zip(*grads_and_vars))
    sum_vars = group_all_reduce(variables)
    avg_vars = [g / self._num_workers for g in sum_vars]
    # TODO: Apply momentum to the averaged model [2]
    assign_ops = [
        tf.assign(v, avg_v) for v, avg_v in zip(variables, avg_vars)
    ]
    # We can overlap model averaging and local SGD [2].
    with tf.control_dependencies(assign_ops):
        return self._optimizer.apply_gradients(grads_and_vars, **kwargs)
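The method above belongs to an optimizer wrapper that holds a local optimizer in `self._optimizer` and the worker count in `self._num_workers`. A minimal usage sketch follows; `ModelAveragingOptimizer` is a purely hypothetical name for that wrapper class, and `loss` stands in for a model defined elsewhere:

# Hypothetical wrapper name; the listing only implies self._optimizer and
# self._num_workers. loss and tf.trainable_variables() stand in for a model.
opt = ModelAveragingOptimizer(tf.train.MomentumOptimizer(0.1, 0.9))
variables = tf.trainable_variables()
grads = tf.gradients(loss, variables)
train_op = opt.apply_gradients(list(zip(grads, variables)))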
def apply_gradients(self, grads_and_vars, **kwargs):
    grads, variables = list(zip(*grads_and_vars))
    # Synchronization logic
    summed_grads = group_all_reduce(grads)
    reduced_grads = [g / self._num_workers for g in summed_grads]
    # Monitoring logic
    monitor_grads_op = tf.cond(
        tf.equal(tf.mod(self._step, self._interval), 0),
        lambda: self._monitor(grads, reduced_grads),
        lambda: tf.no_op())
    with tf.control_dependencies([monitor_grads_op]):
        return self._optimizer.apply_gradients(
            zip(reduced_grads, variables), **kwargs)
def _monitor(self, grads, reduced_grads):
    square_grads = [tf.square(g) for g in grads]
    summed_square_grads = group_all_reduce(square_grads)
    reduced_square_grads = [
        g / self._num_workers for g in summed_square_grads
    ]
    grad_variances = [
        square_grad - tf.square(grad)
        for square_grad, grad in zip(reduced_square_grads, reduced_grads)
    ]
    self._variances = [
        tf.norm(grad_variance) for grad_variance in grad_variances
    ]
    self._summed_variance = tf.reduce_sum(self._variances)
    print_op = tf.print('Sum of gradient variance:', self._summed_variance)
    with tf.control_dependencies([print_op]):
        return tf.no_op()
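For reference, `_monitor` is an application of the usual variance identity, written here with E[.] denoting the average over the `self._num_workers` peers:

Var[g] = E[g^2] - (E[g])^2

`reduced_square_grads` estimates E[g^2], `reduced_grads` estimates E[g], and their elementwise difference is the gradient-variance estimate whose norm is taken per tensor and summed before printing.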
def apply_gradients(self, grads_and_vars, **kwargs):
    gradients, variables = list(zip(*grads_and_vars))
    if self._nccl:
        if self._nccl_fusion:
            fused_grad = fuse(gradients)
            summed_fused_gradients = group_nccl_all_reduce([fused_grad])
            summed_gradients = defuse(summed_fused_gradients[0],
                                      [g.shape for g in gradients])
        else:
            summed_gradients = group_nccl_all_reduce(gradients)
    else:
        summed_gradients = group_all_reduce(gradients)
    reduced_grads = [g / self._num_workers for g in summed_gradients]
    reduced_grads_and_vars = zip(reduced_grads, variables)
    return self._optimizer.apply_gradients(reduced_grads_and_vars, **kwargs)
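`fuse` and `defuse` are not shown in the listing. A minimal sketch of the behavior they are assumed to have, namely concatenating flattened tensors into one buffer before the NCCL all-reduce and splitting it back afterwards; this is not the library's actual implementation:

# Sketch only: assumed semantics of fuse/defuse, not the library's code.
def fuse(tensors):
    # Flatten each tensor and concatenate into a single 1-D buffer.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def defuse(fused, shapes):
    # Split the buffer back into per-tensor pieces and restore their shapes.
    sizes = [shape.num_elements() for shape in shapes]
    return [
        tf.reshape(part, shape)
        for part, shape in zip(tf.split(fused, sizes), shapes)
    ]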
def all_reduce_benchmark(sizes, dtype=tf.float32):
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    np = current_cluster_size()
    multiplier = 4 * (np - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), np))
    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()

    warmup_steps = 5
    bench_steps = 10
    with tf.Session() as sess:
        sess.run(init)
        # Warm-up runs are excluded from the measurement.
        for step in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))