def all_reduce_benchmark(sizes,
                         dtype=tf.float32,
                         method='CPU',
                         warmup_steps=5,
                         bench_steps=10):
    """Time a grouped all-reduce over freshly created tensors.

    One ``tf.Variable`` filled with ones is created per entry of ``sizes``.
    The whole group is reduced with the implementation registered under
    ``method`` in ``_group_all_reduce_func``, and every step's duration and
    equivalent data rate is printed. Only the peer whose ``_rank(method)`` is
    0 prints anything.

    Args:
        sizes: sequence of element counts; one 1-D tensor is built per entry.
        dtype: element type of the tensors.
        method: key used to look up the rank, cluster size, session config
            and all-reduce implementation.
        warmup_steps: number of steps logged with the ``warmup`` prefix before
            the benchmark steps. Defaults to the previously hard-coded 5.
        bench_steps: number of benchmark steps. Defaults to the previously
            hard-coded 10.

    Returns:
        None. Results are reported through ``print`` only.
    """
    rank = _rank(method)

    def log(msg):
        # Keep the output readable: a single peer reports for the cluster.
        if rank == 0:
            print(msg)

    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    # Named `cluster_size` rather than `np` so the conventional numpy alias
    # is not shadowed inside this function.
    cluster_size = get_cluster_size(method)
    multiplier = 4 * (cluster_size - 1)
    log('all reduce %d tensors of total size: %s among %d peers, using %s' %
        (len(sizes), show_size(tot_size), cluster_size, method))

    ys = _group_all_reduce_func[method](xs)
    init = tf.global_variables_initializer()

    with tf.Session(config=_config(method)) as sess:
        duration, _ = measure(lambda: sess.run(init))
        log('tensorflow init took %.fs' % (duration))

        def run_steps(count, message_format):
            # Warm-up and benchmark phases differ only in their log prefix.
            for step in one_based_range(count):
                duration, _ = measure(lambda: sess.run(ys))
                log(message_format %
                    (step, duration,
                     show_rate(tot_size * multiplier, duration)))

        run_steps(warmup_steps,
                  'warmup step %d, took %.2fs, equivalent data rate: %s')
        run_steps(bench_steps,
                  'step %d, took %.2fs, equivalent data rate: %s')
def all_reduce_benchmark(sizes, dtype, args):
    """Benchmark a grouped all-reduce and hand the per-step rates to the reporter.

    A ones-filled ``tf.Variable`` is created for each entry of ``sizes`` and
    the group is reduced with ``_group_all_reduce_func[args.method]``.
    ``args.warmup_steps`` warm-up steps are followed by ``args.steps``
    benchmark steps. For each benchmark step the value
    ``tot_size * multiplier / 2**30 / duration`` is collected, and the list is
    passed to ``log_final_result`` when ``get_rank(args.method)`` is 0.

    Args:
        sizes: sequence of element counts, one 1-D tensor per entry.
        dtype: element type of the tensors.
        args: object providing ``method``, ``warmup_steps`` and ``steps``;
            it is also forwarded unchanged to ``log_final_result``.
    """
    method = args.method
    rank = _rank(method)

    def log(msg):
        # Only the rank-0 peer prints.
        if rank != 0:
            return
        print(msg)

    tensors = [tf.Variable(tf.ones([count], dtype)) for count in sizes]
    total_size = sum(_tensor_size(t) for t in tensors)
    peers = get_cluster_size(method)
    multiplier = 4 * (peers - 1)
    log('all reduce %d tensors of total size: %s among %d peers, using %s' %
        (len(sizes), show_size(total_size), peers, method))

    reduced = _group_all_reduce_func[method](tensors)
    init_op = tf.global_variables_initializer()

    gi = 1024 * 1024 * 1024
    payload = total_size * multiplier
    rates = []

    with tf.Session(config=_config(method)) as sess:
        init_time, _ = measure(lambda: sess.run(init_op))
        log('tensorflow init took %.fs' % (init_time))

        def timed_all_reduce():
            # Run one grouped all-reduce and return how long it took.
            elapsed, _ = measure(lambda: sess.run(reduced))
            return elapsed

        for step in one_based_range(args.warmup_steps):
            elapsed = timed_all_reduce()
            log('warmup step %d, took %.2fs, equivalent data rate: %s' %
                (step, elapsed, show_rate(payload, elapsed)))

        for step in one_based_range(args.steps):
            elapsed = timed_all_reduce()
            rates.append(payload / gi / elapsed)
            log('step %d, took %.2fs, equivalent data rate: %s' %
                (step, elapsed, show_rate(payload, elapsed)))

        if get_rank(args.method) == 0:
            log_final_result(rates, args)
def run(sess, train_op, bcast_op):
    """Run the training benchmark loop, resizing the cluster between steps.

    Each iteration times ``args.num_batches_per_iter`` executions of
    ``train_op`` and logs the resulting images/sec figure. After every
    iteration ``resize_cluster_from_url`` is evaluated: if it reports that
    this peer should not be kept, the function returns immediately, without
    logging the summary. If it reports a change, the step counter is
    re-synchronised via a max all-reduce and ``bcast_op``, when given, is run
    again before the next iteration.

    Args:
        sess: session used to run every op in the loop.
        train_op: op executed for each training batch.
        bcast_op: optional op run after each step re-synchronisation; skipped
            when falsy.

    Returns:
        None. The mean and the 1.96 * std spread of the per-iteration
        images/sec are logged and passed to ``log_final_result`` when the
        loop runs to completion.
    """
    if args.num_batches_per_iter > 1:
        print('--num-batches-per-iter == 1 is highly recommended, using %d' %
              (args.num_batches_per_iter))

    from kungfu.tensorflow.ops import all_reduce, resize_cluster_from_url

    step_place = tf.placeholder(dtype=tf.int32, shape=())
    sync_step_op = all_reduce(step_place, op='max')
    resize_op = resize_cluster_from_url()

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    need_sync = True
    step = 0
    while step < args.num_iters:
        if need_sync:
            # Adopt the largest step seen by any peer.
            new_step = sess.run(sync_step_op, feed_dict={step_place: step})
            if new_step != step:
                print('sync step : %d -> %d' % (step, new_step))
            step = new_step
            if bcast_op:
                # Use the session passed in by the caller; this previously
                # referred to a `session` name that is not a parameter here.
                duration, _ = measure(lambda: sess.run(bcast_op))
                log('bcast_op took %.3fs' % (duration))
            need_sync = False

        step += 1
        # `elapsed` covers all `num_batches_per_iter` runs of train_op.
        elapsed = timeit.timeit(lambda: sess.run(train_op),
                                number=args.num_batches_per_iter)
        # NOTE(review): the rate divides a single batch_size by the total time
        # of all batches, so it looks under-reported when
        # num_batches_per_iter > 1 - confirm whether this is intended.
        img_sec = args.batch_size / elapsed
        log('Iter #%d: %.1f img/sec per %s' % (step, img_sec, device))
        img_secs.append(img_sec)

        changed, keep = sess.run(resize_op)
        if not keep:
            return
        if changed:
            need_sync = True

    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log_final_result(img_sec_mean, img_sec_conf)
# Results img_sec_mean = np.mean(img_secs) img_sec_conf = 1.96 * np.std(img_secs) log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) log_final_result(img_sec_mean, img_sec_conf) loss = loss_function() train_opt = opt.minimize(loss) if tf.executing_eagerly(): with tf.device(device): run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables)) else: init = tf.global_variables_initializer() bcast_op = None if args.kf_optimizer: from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp bcast_op = BroadcastGlobalVariablesOp() with tf.Session(config=config) as session: from kungfu._utils import measure duration, _ = measure(lambda: session.run(init)) log('init took %.3fs' % (duration)) if bcast_op: duration, _ = measure(lambda: session.run(bcast_op)) log('bcast_op took %.3fs' % (duration)) run(lambda: session.run(train_opt)) if barrier_op is not None: session.run(barrier_op)
img_sec = args.batch_size / time log('Iter #%d: %.1f img/sec per %s' % (step, img_sec, device)) img_secs.append(img_sec) changed, keep = sess.run(resize_op) if not keep: return if changed: need_sync = True img_sec_mean = np.mean(img_secs) img_sec_conf = 1.96 * np.std(img_secs) log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) log_final_result(img_sec_mean, img_sec_conf) loss = loss_function() train_op = opt.minimize(loss) bcast_op = None if args.kf_optimizer: from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp bcast_op = BroadcastGlobalVariablesOp() init = tf.global_variables_initializer() with tf.Session(config=config) as session: from kungfu._utils import measure duration, _ = measure(lambda: session.run(init)) log('init took %.3fs' % (duration)) run(session, train_op, bcast_op) print('stopped')