def __init__(self, device_batch_size, monitor_interval=1):
    """Record the cluster size, a step counter and the batch-size tensors.

    Args:
        device_batch_size: value cast to a float32 tensor and kept as the
            per-device batch size.
        monitor_interval: stored unchanged as ``self._interval``.
    """
    worker_count = current_cluster_size()
    self._num_workers = worker_count
    self._step = counter()
    self._interval = monitor_interval

    per_device = tf.cast(device_batch_size, dtype=tf.float32)
    self._device_batch_size = per_device
    # Global batch size is the per-device size scaled by the worker count.
    self._global_batch_size = per_device * worker_count
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    """Build the op that averages variables with one peer and applies gradients.

    Args:
        apply_grads_func: callable invoked as
            ``apply_grads_func(grads_and_vars, **kwargs)`` to build the
            gradient-application op.
        grads_and_vars: iterable of ``(gradient, variable)`` pairs; it is
            unzipped once here and re-zipped before being handed on.
        **kwargs: forwarded unchanged to ``apply_grads_func``.

    Returns:
        ``tf.group(apply_op)`` created under control dependencies on the
        averaging assignments, ``apply_op`` and the model-save op.
    """
    np, rank = current_cluster_size(), current_rank()
    # Peer whose variables are requested below; presumably picked at random
    # (going by the helper's name) -- TODO confirm.
    target = get_random_peer(np, rank)
    gradients, variables = list(zip(*grads_and_vars))

    # init_store runs only while the step counter equals 0; otherwise no-op.
    init_store_op = tf.cond(tf.equal(self._step, 0),
                            lambda: self.init_store(variables), tf.no_op)
    # The request ops are created after (and depend on) the store init.
    with tf.control_dependencies([init_store_op]):
        other_peer_vars = self._build_request_ops(target, variables)

    save_model_op = self._build_save_op(variables)

    # Each variable is assigned the mean of its own value and the value
    # obtained for the same position from the peer.
    assign_ops = [
        _tf_assign(v, 0.5 * (v + other_v))
        for v, other_v in zip(variables, other_peer_vars)
    ]

    # We need to re-zip gradients and variables as grads_and_vars can be only unzipped once.
    new_grads_and_vars = zip(gradients, variables)
    apply_op = apply_grads_func(new_grads_and_vars, **kwargs)

    # NOTE(review): apply_op is built before these scopes are entered, so
    # the dependencies below attach to the returned group only, not to
    # apply_op itself -- confirm this ordering is intended.
    with tf.control_dependencies(assign_ops):
        with tf.control_dependencies([apply_op]):
            with tf.control_dependencies([save_model_op]):
                return tf.group(apply_op)
def __init__(self, monitor_interval=1):
    """Record the cluster size, a step counter and empty variance slots.

    Args:
        monitor_interval: stored unchanged as ``self._interval``.
    """
    self._num_workers = current_cluster_size()
    self._step = counter()
    self._interval = monitor_interval
    # Both variance holders start unset.
    self._summed_variance = self._variances = None
def __init__(self, optimizer, interval, name=None, use_locking=False):
    """Initialise the base optimizer wrapper and record cluster information.

    Args:
        optimizer: passed to the parent constructor.
        interval: stored unchanged as ``self._interval``.
        name: passed to the parent constructor.
        use_locking: passed to the parent constructor by keyword.
    """
    parent = super(AdaptiveSGDOptimizer, self)
    parent.__init__(optimizer, name, use_locking=use_locking)

    self._num_workers, self._rank = current_cluster_size(), current_rank()
    # Non-trainable int32 step variable starting at 0.
    step_var = tf.Variable(0, trainable=False, dtype=tf.int32)
    self._step = step_var
    self._interval = interval
def __init__(self, reshape_strategy, nccl=False, nccl_fusion=True):
    """Store the configuration flags and the current cluster size.

    Args:
        reshape_strategy: stored unchanged as ``self._reshape_strategy``.
        nccl: stored unchanged as ``self._nccl``.
        nccl_fusion: stored unchanged as ``self._nccl_fusion``.
    """
    self._reshape_strategy = reshape_strategy
    self._nccl, self._nccl_fusion = nccl, nccl_fusion
    worker_count = current_cluster_size()
    self._num_workers = worker_count
def log_final_result(value, error):
    """Log the final result with run attributes; only rank 0 logs.

    Args:
        value: forwarded to ``log_detailed_result``.
        error: forwarded to ``log_detailed_result``.
    """
    from kungfu.tensorflow.ops import current_cluster_size, current_rank

    if current_rank() == 0:
        run_attrs = {
            'np': current_cluster_size(),
            'strategy': os.environ.get('KUNGFU_ALLREDUCE_STRATEGY'),
            'bs': args.batch_size,
            'model': args.model,
            'kf-opt': args.kf_optimizer,
        }
        log_detailed_result(value, error, run_attrs)
def test_group_all_gather():
    """Check all_gather sums for five vectors of lengths 1..5.

    Every peer contributes ``(rank + 1) * ones([n])``, so the gathered
    result must sum to ``(np + 1) * np / 2 * n``.
    """
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_gather

    my_rank = current_rank()
    peer_count = current_cluster_size()

    lengths = list(range(1, 6))
    inputs = []
    for length in lengths:
        inputs.append(
            (my_rank + 1) * tf.Variable(tf.ones([length], tf.int32)))
    gathered = list(map(all_gather, inputs))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for length, op in zip(lengths, gathered):
            result = sess.run(op)
            expected = (peer_count + 1) * peer_count / 2 * length
            assert (result.sum() == expected)
def after_run(self, run_context, run_values):
    """Update progress after a step and stop the run when appropriate.

    Increments the step, adds ``local_batch_size * cluster_size`` to the
    trained-sample count, then runs the resize op.  The run is stopped
    either when the resize op reports this peer should not continue
    (exit reason ``'change cluster'``) or when the sample target is
    reached (exit reason ``'finished'``).

    Args:
        run_context: provides ``session`` and ``request_stop()``.
        run_values: unused.
    """
    self._step += 1
    cluster_size = current_cluster_size()
    self._trained_samples += self._local_batch_size * cluster_size

    changed, keep = run_context.session.run(self._resize_op)
    if keep:
        if changed:
            # Cluster membership changed; flag that a sync is needed.
            self._need_sync = True
        if self._trained_samples >= self._total_samples:
            self._exit_reason = 'finished'
            run_context.request_stop()
    else:
        run_context.request_stop()
        self._exit_reason = 'change cluster'
def log_final_result(value, error):
    """Log the final result with run attributes; only rank 0 logs.

    Args:
        value: forwarded to ``log_detailed_result``.
        error: forwarded to ``log_detailed_result``.
    """
    if current_rank() == 0:
        attrs = {}
        attrs['framework'] = 'kungfu'
        attrs['np'] = current_cluster_size()
        attrs['strategy'] = os.getenv('KUNGFU_ALLREDUCE_STRATEGY')
        attrs['bs'] = args.batch_size
        attrs['model'] = args.model
        attrs['xla'] = args.xla
        attrs['kf-opt'] = args.kf_optimizer
        attrs['fuse'] = args.fuse
        attrs['nvlink'] = os.getenv('KUNGFU_ALLOW_NVLINK')
        # 'disk' when a data directory was given, 'memory' otherwise.
        if args.data_dir:
            attrs['data'] = 'disk'
        else:
            attrs['data'] = 'memory'
        log_detailed_result(value, error, attrs)
def test_consensus():
    """Check that consensus over per-rank values holds only for one peer.

    Each peer feeds its own rank, so the values agree exactly when the
    cluster has a single member.
    """
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import consensus

    peer_count = current_cluster_size()
    my_rank = current_rank()

    rank_var = tf.Variable(my_rank, dtype=tf.int32)
    check_op = consensus(rank_var)
    init_op = tf.global_variables_initializer()

    expected = peer_count == 1
    with tf.Session() as sess:
        sess.run(init_op)
        agreed = sess.run(check_op)
        assert agreed == expected
def test_monitored_all_reduce():
    """Check monitored_all_reduce of an all-ones tensor over a star tree.

    The reduced tensor must sum to ``np * 16 * 1024 * 1024``.
    """
    from kungfu.tensorflow.ops import current_cluster_size, monitored_all_reduce

    def gen_tree(n, r):
        # Every entry other than index r holds r; index r keeps its own index.
        return [i if i == r else r for i in range(n)]

    peer_count = current_cluster_size()
    tree_var = tf.Variable(gen_tree(peer_count, 0), dtype=tf.int32)
    ones = tf.Variable(tf.ones([16, 1024, 1024], dtype=tf.int64))
    reduced = monitored_all_reduce(ones, tree_var)
    init_op = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init_op)
        result = sess.run(reduced)
        assert (result.sum() == peer_count * 16 * 1024 * 1024)
def all_reduce_benchmark(sizes, dtype=tf.float32, warmup_steps=5,
                         bench_steps=10):
    """Time ``group_all_reduce`` over all-ones vectors of the given sizes.

    Args:
        sizes: iterable of vector lengths; one variable is created per entry.
        dtype: element type of the variables.
        warmup_steps: number of untimed runs executed first (default 5,
            the previously hard-coded value).
        bench_steps: number of timed runs (default 10, the previously
            hard-coded value).

    Prints the total size once, then one line per timed step with the
    elapsed time and an equivalent data rate.
    """
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    np = current_cluster_size()
    # NOTE(review): the 4 * (np - 1) factor presumably models the traffic
    # of one all-reduce -- confirm.  With a single peer it is 0, so the
    # reported rate is computed from 0 bytes.
    multiplier = 4 * (np - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), np))

    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        # Untimed runs before measuring.
        for _ in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))
def __init__(self):
    """Cache the value returned by ``current_cluster_size()``."""
    worker_count = current_cluster_size()
    self._num_workers = worker_count
def get_cluster_size(method):
    """Return the cluster size for the selected backend.

    Args:
        method: ``'HOROVOD'`` selects ``horovod.tensorflow.size()``; any
            other value falls back to ``current_cluster_size()``.
    """
    if method != 'HOROVOD':
        return current_cluster_size()

    # Imported lazily so Horovod is only needed when actually selected.
    import horovod.tensorflow as hvd
    return hvd.size()
def __init__(self, change_step, alpha):
    """Store the hyper-parameters, cluster size and global step tensor.

    Args:
        change_step: stored unchanged as ``self._change_step``.
        alpha: stored unchanged as ``self._alpha``.
    """
    self._num_workers = current_cluster_size()
    self._alpha, self._change_step = alpha, change_step
    global_step = tf.train.get_or_create_global_step()
    self._global_step = global_step
def __init__(self, alpha):
    """Store ``alpha`` and the current cluster size.

    Args:
        alpha: stored unchanged as ``self._alpha``.
    """
    worker_count = current_cluster_size()
    self._num_workers = worker_count
    self._alpha = alpha
def show_info_example():
    """Print this peer's rank and the cluster size on one line."""
    # Tuple elements are evaluated left to right: rank first, then size.
    rank_and_size = (current_rank(), current_cluster_size())
    print('rank=%d, np=%d' % rank_and_size)
def restore(checkpoint):
    """Convert ``checkpoint`` to an int and return it as the global step."""
    gs = int(checkpoint)
    return gs


new_size = tf.placeholder(tf.int32)
resize_op = resize_cluster_from_url()
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    np = current_cluster_size()
    # Scalar int64 placeholder fed with the local step value.
    gs_place = tf.placeholder(dtype=tf.int64, shape=())
    # All-reduce of the fed step with op='max'.
    sync_step_op = all_reduce(gs_place, op='max')
    shoud_sync = True
    gs = 0
    # NOTE(review): nothing in this visible span advances gs or sets
    # shoud_sync back to True; the loop body presumably continues beyond
    # this excerpt -- confirm against the full file.
    while gs < max_step:
        if shoud_sync:
            # Replace the local step with the all-reduced value.
            new_gs = sess.run(sync_step_op, feed_dict={gs_place: gs})
            print('sync step: %d -> %d' % (gs, new_gs))
            gs = new_gs
            shoud_sync = False
        t0 = time.time()
        v = sess.run(y)
        print('step %d, result: %d, np=%d, took %s' %
              (gs, v, np, show_duration(time.time() - t0)))
def fake_get_shard_info(use_kungfu):
    """Return a ``(rank, cluster_size)`` pair.

    Args:
        use_kungfu: when truthy the pair comes from KungFu's
            ``current_rank()`` and ``current_cluster_size()``; otherwise
            the fixed pair ``(0, 1)`` is returned.
    """
    if not use_kungfu:
        return 0, 1

    # Imported lazily so KungFu is only required when it is actually used.
    from kungfu.tensorflow.ops import current_cluster_size, current_rank
    return current_rank(), current_cluster_size()
def __init__(self, nccl=False, nccl_fusion=True):
    """Store the NCCL flags and the current cluster size.

    Args:
        nccl: stored unchanged as ``self._nccl``.
        nccl_fusion: stored unchanged as ``self._nccl_fusion``.
    """
    self._nccl, self._nccl_fusion = nccl, nccl_fusion
    worker_count = current_cluster_size()
    self._num_workers = worker_count