def __init__(self, optimizer, interval, name=None, use_locking=False):
    """Wrap ``optimizer`` and record cluster info, a step variable and interval.

    Args:
        optimizer: Optimizer to wrap; forwarded to the parent constructor.
        interval: Stored unchanged as ``self._interval``.
        name: Optional name forwarded to the parent constructor.
        use_locking: Forwarded to the parent constructor.
    """
    parent = super(AdaptiveSGDOptimizer, self)
    parent.__init__(optimizer, name, use_locking=use_locking)

    # Snapshot of the cluster size and this peer's rank at construction time.
    cluster_size = current_cluster_size()
    peer_rank = current_rank()
    self._num_workers = cluster_size
    self._rank = peer_rank

    # Non-trainable int32 variable starting at zero.
    step_var = tf.Variable(0, trainable=False, dtype=tf.int32)
    self._step = step_var
    self._interval = interval
def __init__(self,
             optimizer,
             nccl=False,
             nccl_fusion=True,
             name=None,
             use_locking=False):
    """Wrap ``optimizer`` and remember the cluster size and NCCL settings.

    Args:
        optimizer: Optimizer to wrap; forwarded to the parent constructor.
        nccl: Stored unchanged as ``self._nccl``.
        nccl_fusion: Stored unchanged as ``self._nccl_fusion``.
        name: Optional name forwarded to the parent constructor.
        use_locking: Forwarded to the parent constructor.
    """
    parent = super(SynchronousSGDOptimizer, self)
    parent.__init__(optimizer, name, use_locking=use_locking)

    # Cluster size as reported when the optimizer is constructed.
    cluster_size = current_cluster_size()
    self._num_workers = cluster_size

    # Flags are only recorded here; they are consumed elsewhere in the class.
    self._nccl_fusion = nccl_fusion
    self._nccl = nccl
def __init__(self,
             optimizer,
             name=None,
             monitor_interval=1,
             use_locking=False):
    """Wrap ``optimizer`` and set up the state used for variance monitoring.

    Args:
        optimizer: Optimizer to wrap; forwarded to the parent constructor.
        name: Optional name forwarded to the parent constructor.
        monitor_interval: Stored unchanged as ``self._interval``.
        use_locking: Forwarded to the parent constructor.
    """
    parent = super(SyncSGDWithGradVarianceOptimizer, self)
    parent.__init__(optimizer, name, use_locking=use_locking)

    # Cluster size first, then the step counter, matching the build order
    # callers have always observed.
    cluster_size = current_cluster_size()
    step_counter = counter()
    self._num_workers = cluster_size
    self._step = step_counter
    self._interval = monitor_interval

    # Placeholders; both start out unset.
    self._summed_variance = None
    self._variances = None
def __init__(self,
             optimizer,
             device_batch_size,
             name=None,
             monitor_interval=1,
             use_locking=False):
    """Wrap ``optimizer`` and derive the batch sizes used for noise-scale state.

    Args:
        optimizer: Optimizer to wrap; forwarded to the parent constructor.
        device_batch_size: Cast to ``tf.float32`` and stored as
            ``self._device_batch_size``.
        name: Optional name forwarded to the parent constructor.
        monitor_interval: Stored unchanged as ``self._interval``.
        use_locking: Forwarded to the parent constructor.
    """
    parent = super(SyncSGDWithGradNoiseScaleOptimizer, self)
    parent.__init__(optimizer, name, use_locking=use_locking)

    cluster_size = current_cluster_size()
    step_counter = counter()
    self._num_workers = cluster_size
    self._step = step_counter
    self._interval = monitor_interval

    # Per-device batch size as a float32 tensor; the global batch size is
    # that value scaled by the number of workers.
    per_device = tf.cast(device_batch_size, dtype=tf.float32)
    self._device_batch_size = per_device
    self._global_batch_size = per_device * cluster_size

    # Starts unset.
    self._noise_op = None
def apply_gradients(self, grads_and_vars, **kwargs):
    """Average variables with a peer's copy and apply the gradients.

    Each variable is assigned the mean of its own value and the value
    obtained for the peer returned by ``get_random_peer``. The gradients
    are applied through the wrapped optimizer.

    Args:
        grads_and_vars: Iterable of ``(gradient, variable)`` pairs. Any
            iterable is accepted, including one-shot iterators such as
            ``zip(grads, variables)``.
        **kwargs: Forwarded unchanged to the wrapped optimizer's
            ``apply_gradients``.

    Returns:
        A grouped op wrapping the wrapped optimizer's apply op, created
        under control dependencies on the averaging assignments, the apply
        op and the save-model op.
    """
    # The pairs are walked twice below (once to collect the variables, once
    # by the wrapped optimizer). Materialise them so a generator or zip
    # iterator is not exhausted by the first pass.
    grads_and_vars = list(grads_and_vars)

    cluster_size, rank = current_cluster_size(), current_rank()
    target = get_random_peer(cluster_size, rank)
    variables = [var for _grad, var in grads_and_vars]
    other_peer_vars, save_model_op = self._build_request_and_save_ops(
        target, variables)

    # v <- (v + peer_v) / 2 for every variable.
    assign_ops = [
        tf.assign(var, 0.5 * (var + other_var))
        for var, other_var in zip(variables, other_peer_vars)
    ]

    apply_op = self._optimizer.apply_gradients(grads_and_vars, **kwargs)

    # NOTE(review): apply_op is built outside the assign_ops dependency
    # scope, so only the returned group is ordered after the averaging
    # assignments, not apply_op itself. Left as in the original; confirm
    # this ordering is intended.
    with tf.control_dependencies(assign_ops):
        with tf.control_dependencies([apply_op]):
            with tf.control_dependencies([save_model_op]):
                return tf.group(apply_op)
def all_reduce_benchmark(sizes, dtype=tf.float32):
    """Time a grouped all-reduce over tensors of the given sizes.

    Builds one variable of ones per entry in ``sizes``, runs a few untimed
    warm-up steps, then prints the wall-clock time and the equivalent data
    rate for each timed step.

    Args:
        sizes: Iterable of element counts; one 1-D variable is created per
            entry.
        dtype: Element type of the created variables.
    """
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    cluster_size = current_cluster_size()
    # NOTE(review): the derivation of the 4 * (peers - 1) traffic factor is
    # not visible in this block -- confirm against the all-reduce strategy.
    multiplier = 4 * (cluster_size - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), cluster_size))

    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()

    warmup_steps = 5
    bench_steps = 10
    with tf.Session() as sess:
        sess.run(init)
        # Untimed runs before measurement starts.
        for _ in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))
def fake_get_shard_info(use_kungfu):
    """Return ``(shard_index, shard_count)`` for the current process.

    Args:
        use_kungfu: When falsy, a single-shard layout ``(0, 1)`` is
            returned without importing kungfu. When truthy, the values come
            from kungfu's ``current_rank()`` and ``current_cluster_size()``.

    Returns:
        A 2-tuple ``(rank, cluster_size)``.
    """
    if not use_kungfu:
        return 0, 1

    # Imported lazily so the non-kungfu path has no dependency on kungfu.
    from kungfu.tensorflow.v1.ops import current_cluster_size, current_rank
    rank = current_rank()
    size = current_cluster_size()
    return rank, size
def show_info_example():
    """Print this peer's rank and the current cluster size on one line."""
    # Tuple elements evaluate left to right: rank first, then cluster size.
    info = (current_rank(), current_cluster_size())
    print('rank=%d, np=%d' % info)
return gs ckpt = tf.placeholder(tf.string) new_size = tf.placeholder(tf.int32) resize_op = resize_cluster(ckpt, new_size) init = tf.global_variables_initializer() # barrier_op = barrier() with tf.Session() as sess: sess.run(init) init_gs = restore(get_init_checkpoint()) np = current_cluster_size() init_np = get_cluster_size(init_gs, cluster_size_schedule, np) if np != init_np: print( '[W] init cluster size (np=%d) is not consistent with schedule (np=%d)' % (np, init_np)) print('restored from %d, np=%d, init_np=%d, start took %s' % (init_gs, np, init_np, show_duration(time.time() - t0))) for gs in range(init_gs, max_step): t0 = time.time() v = sess.run(y) print('step %d, result: %d, np=%d, took %s' % (gs, v, np, show_duration(time.time() - t0)))
def __init__(self, optimizer, name=None, use_locking=False):
    """Wrap ``optimizer`` and record the cluster size and this peer's rank.

    Args:
        optimizer: Optimizer to wrap; forwarded to the parent constructor.
        name: Optional name forwarded to the parent constructor.
        use_locking: Forwarded to the parent constructor.
    """
    parent = super(SynchronousAveragingOptimizer, self)
    parent.__init__(optimizer, name, use_locking=use_locking)

    # Both values are read once, at construction time.
    cluster_size = current_cluster_size()
    peer_rank = current_rank()
    self._num_workers = cluster_size
    self._rank = peer_rank