def begin(self):
    self._sync_op = BroadcastGlobalVariablesOp()
    self._step = 0
    self._step_place = tf.placeholder(dtype=tf.int32, shape=())
    self._sync_step_op = all_reduce(self._step_place, op='max')
    self._resize_op, self._new_size_op = self._build_resize_op(
        self._schedule, self._step_place)

def begin(self):
    self._step = 0
    self._trained_samples = 0
    self._trained_samples_place = tf.placeholder(dtype=tf.int32, shape=())
    self._sync_offset_op = all_reduce(self._trained_samples_place, op='max')
    self._sync_state_op = BroadcastGlobalVariablesOp()
    self._resize_op = resize_cluster_from_url()

def all_reduce_example():
    x = tf.Variable(tf.ones([], tf.int32))
    y = all_reduce(x)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for step in range(5):
            v = sess.run(y)
            print('step %d, result: %d' % (step, v))

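# A usage sketch (the script name is hypothetical; -np is the standard
# kungfu-run flag for the number of local workers):
#
#   kungfu-run -np 4 python3 all_reduce_example.py
#
# all_reduce sums `x` over all peers, so with 4 peers every peer prints
# `result: 4` at each of the 5 steps.
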
def worker(rank):
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_reduce
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' % (current_rank(), current_cluster_size()))
    x = tf.Variable(tf.ones(shape=(), dtype=tf.int32))
    y = all_reduce(x * rank)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(y)
        print('v=%s' % (v))

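# Each peer contributes x * rank, so the reduced value is the same on every
# peer: with 4 workers, v = 0 + 1 + 2 + 3 = 6.
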
def main():
    # step -> new_size
    fake_schedule = {
        10: 2,
        20: 3,
        40: 4,
        50: 1,
    }
    args = parse_args()
    gs = tf.train.get_or_create_global_step()
    sync_step_op = tf.assign(gs, all_reduce(gs, op='max'))
    inc_gs = tf.assign_add(gs, 1)
    new_size = tf.placeholder(dtype=tf.uint32)
    resize_op = resize(new_size)
    train_op = build_fake_train_op(args.use_nccl)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        need_sync = True
        while True:
            if need_sync:
                sess.run(sync_step_op)
                need_sync = False
            step = sess.run(gs)

            # BEGIN train
            vs = sess.run(train_op)
            print('step %d, result: %d' % (step, vs[0].sum()))
            # END train

            if step in fake_schedule:
                changed = sess.run(resize_op,
                                   feed_dict={new_size: fake_schedule[step]})
                if changed:
                    need_sync = True
                    if detached():
                        break
                else:
                    print('cluster not changed')
                # every entry in fake_schedule changes the cluster size,
                # so a no-op resize indicates a bug
                assert changed

            next_gs = sess.run(inc_gs)
            print('finished %s' % (next_gs - 1))
            if next_gs >= args.max_step:
                break
    print('stopped')

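# This loop is the core elastic-training pattern: whenever resize() reports a
# change, every peer re-negotiates the global step via the 'max' all-reduce
# before training on, and a peer that is no longer part of the new cluster
# (detached() above) breaks out of the loop instead of continuing to train.
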
def test_set_tree(steps, warmup_steps=10):
    from kungfu.python import current_cluster_size
    from kungfu.tensorflow.ops import all_reduce, broadcast
    from kungfu.tensorflow.ops.adapt import set_tree
    n = current_cluster_size()
    tree_place = tf.placeholder(dtype=tf.int32, shape=(n, ))
    set_tree_op = set_tree(broadcast(tree_place))
    magic = 32
    x = tf.Variable(list(range(magic)), dtype=tf.int32)
    y = all_reduce(x)
    init = tf.global_variables_initializer()
    durations = []
    with tf.Session() as sess:
        sess.run(init)
        from kungfu._utils import one_based_range
        for step in one_based_range(steps + warmup_steps):
            v = sess.run(y)
            assert (v.sum() == n * magic * (magic - 1) / 2)
            # print(v)
            tree = gen_tree(n)
            t0 = time.time()
            sess.run(set_tree_op, feed_dict={
                tree_place: tree,
            })
            dur = time.time() - t0
            if step > warmup_steps:
                durations.append(dur)
    ds = np.array([d * 1000 for d in durations])
    from kungfu._utils import show_duration
    print(
        'test set_tree OK for %d times among %d peers, took ~ %f <- [%f, %f] (ms)'
        % (len(ds), n, ds.mean(), ds.min(), ds.max()))

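# `gen_tree` is not defined in this excerpt. A minimal sketch, assuming the
# tree is encoded as a length-n parent array (tree[i] is the parent of peer i,
# the root points to itself) to match the shape of `tree_place`; the encoding
# itself is an assumption:
import random


def gen_tree(n):
    # Build a random spanning tree over peers 0..n-1, rooted at peer 0.
    tree = [0] * n
    attached = [0]
    rest = list(range(1, n))
    random.shuffle(rest)
    for i in rest:
        tree[i] = random.choice(attached)  # attach i under an attached peer
        attached.append(i)
    return tree
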
def run(sess, train_op, bcast_op):
    if args.num_batches_per_iter > 1:
        print('--num-batches-per-iter == 1 is highly recommended, using %d' %
              (args.num_batches_per_iter))

    from kungfu.tensorflow.ops import all_reduce, resize_cluster_from_url
    step_place = tf.placeholder(dtype=tf.int32, shape=())
    sync_step_op = all_reduce(step_place, op='max')
    resize_op = resize_cluster_from_url()

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    need_sync = True
    step = 0
    while step < args.num_iters:
        if need_sync:
            new_step = sess.run(sync_step_op, feed_dict={step_place: step})
            if new_step != step:
                print('sync step : %d -> %d' % (step, new_step))
            step = new_step
            if bcast_op:
                duration, _ = measure(lambda: sess.run(bcast_op))
                log('bcast_op took %.3fs' % (duration))
            need_sync = False

        step += 1
        batch_time = timeit.timeit(lambda: sess.run(train_op),
                                   number=args.num_batches_per_iter)
        img_sec = args.batch_size / batch_time
        log('Iter #%d: %.1f img/sec per %s' % (step, img_sec, device))
        img_secs.append(img_sec)

        changed, keep = sess.run(resize_op)
        if not keep:
            return
        if changed:
            need_sync = True

    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log_final_result(img_sec_mean, img_sec_conf)

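# `measure` is not defined in this excerpt; a minimal sketch consistent with
# how it is called above (returning the elapsed wall-clock time and the
# wrapped function's result):
import time


def measure(f):
    t0 = time.time()
    result = f()
    return time.time() - t0, result
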
def get_config():
    # (the start of this function is not shown in this excerpt; it defines
    #  stage_sizes, step_per_stage and the schedule string `config`)
    max_step = step_per_stage * len(stage_sizes)
    return config, max_step


config, max_step = get_config()


def build_ops():
    step_place = tf.placeholder(dtype=tf.int32, shape=())
    new_step_op = step_based_schedule(config, step_place)
    resize_op = resize_cluster_from_url()
    return step_place, resize_op, new_step_op


step_place, resize_op, new_step_op = build_ops()
sync_step_op = all_reduce(step_place, op='max')
x = tf.Variable(1, dtype=tf.int32)
y = all_reduce(x)
sync_state_op = tf.assign(x, broadcast(x))
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    need_sync = True
    i = 0
    while i < max_step:
        if need_sync:
            new_step = sess.run(sync_step_op, feed_dict={step_place: i})
            print('sync step: %d -> %d' % (i, new_step))
            i = new_step
        # (the rest of the loop body is not shown in this excerpt)

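# `config` is a step-based schedule string of the form 'size:steps,...'
# (the values below are hypothetical): e.g. '1:3,2:3,4:3' would run 1 peer
# for 3 steps, then 2 peers for 3 steps, then 4, giving max_step = 9.
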
cluster_size_schedule, max_step = parse_schedule(args.schedule)
# print(cluster_size_schedule)
# print(max_step)


def get_cluster_size(i, sch, old):
    # sch is a list of (start, end, size) triples.
    for s, e, n in sch:
        if s <= i < e:
            return n
    print('[W] not scheduled for %d' % (i))
    return old


x = tf.Variable(tf.ones([], dtype=tf.int32))
y = all_reduce(x)


def restore(checkpoint):
    gs = int(checkpoint)
    return gs


new_size = tf.placeholder(tf.int32)
resize_op = resize_cluster_from_url()
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    # (the elastic training loop is not shown in this excerpt)

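# `parse_schedule` is not shown in this excerpt; a minimal sketch, assuming
# the same 'size:steps,...' string format as above and producing the
# (start, end, size) triples that get_cluster_size expects:
def parse_schedule(schedule):
    sch = []
    offset = 0
    for part in schedule.split(','):
        size, steps = (int(p) for p in part.split(':'))
        sch.append((offset, offset + steps, size))
        offset += steps
    return sch, offset  # offset ends up as the total number of steps
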
def show_duration(duration):
    # (the start of this helper is not shown in this excerpt; the guard
    #  below is implied by the minute/hour branches that follow)
    if duration < 60:
        return '%.2fs' % duration
    sec = int(duration)
    mm, ss = sec // 60, sec % 60
    if duration < 3600:
        return '%dm%ds' % (mm, ss)
    return '%dh%dm%ds' % (mm // 60, mm % 60, ss)


# x = tf.Variable(tf.ones([], dtype=tf.int32))
x = tf.ones((10, 1), dtype=tf.int32)
print(x.numpy())

steps = 10
mean_time = []
for i in range(steps):
    # reshape strategy before AllReduce to bypass straggler node
    t1 = time.time()
    keep = reshape_strategy(debug=False)
    iteration_time = time.time() - t1
    print('reshape took %s' % (show_duration(iteration_time)))

    t0 = time.time()
    v = all_reduce(x)
    print('all reduce step %d, took %s' % (i, show_duration(time.time() - t0)))
    mean_time.append(iteration_time)
    if not keep:
        break

print(np.mean(mean_time))

def begin(self):
    global_step = tf.train.get_or_create_global_step()
    new_global_step = all_reduce(global_step, op='max')
    self._sync_step_op = tf.assign(global_step, new_global_step)
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    self._sync_state_op = BroadcastGlobalVariablesOp()
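
# The ops are created in begin() because a tf.train.SessionRunHook may only
# add graph nodes before the session is finalized; the hook is then expected
# to run _sync_step_op and _sync_state_op later (e.g. after a resize) to
# bring newly joined peers up to date.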