Beispiel #1
0
def all_reduce_benchmark(sizes, dtype=tf.float32, method='CPU',
                         warmup_steps=5, bench_steps=10):
    """Time a grouped all-reduce over one tensor per entry of ``sizes``.

    One ``tf.Variable`` filled with ones is created per entry of ``sizes``.
    The variables are reduced together by the implementation registered for
    ``method`` in ``_group_all_reduce_func``; every step is timed with
    ``measure`` and reported through ``show_rate``. Messages are printed
    only by the peer whose ``_rank(method)`` is 0.

    Args:
        sizes: sized sequence of element counts; each entry ``n`` yields a
            tensor of shape ``[n]``.
        dtype: element type of the created tensors.
        method: key used to look up the rank, the cluster size, the session
            config and the all-reduce implementation.
        warmup_steps: number of steps run first and logged with the
            ``warmup step`` label (previously hard-coded to 5).
        bench_steps: number of steps run afterwards and logged with the
            ``step`` label (previously hard-coded to 10).
    """
    rank = _rank(method)

    def log(msg):
        if rank == 0:
            print(msg)

    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    # Named `cluster_size` rather than `np` so the conventional numpy alias
    # is not shadowed inside this function.
    cluster_size = get_cluster_size(method)
    # Factor applied to the total size when reporting the equivalent data
    # rate. NOTE(review): presumably models the traffic generated per peer
    # by the reduce algorithm -- confirm against the implementation.
    multiplier = 4 * (cluster_size - 1)
    log('all reduce %d tensors of total size: %s among %d peers, using %s' %
        (len(sizes), show_size(tot_size), cluster_size, method))

    ys = _group_all_reduce_func[method](xs)

    init = tf.global_variables_initializer()

    with tf.Session(config=_config(method)) as sess:
        duration, _ = measure(lambda: sess.run(init))
        log('tensorflow init took %.fs' % (duration))

        def timed_steps(count, fmt):
            # Run `count` all-reduce steps, logging each with `fmt`, which
            # receives (step, duration, formatted rate).
            for step in one_based_range(count):
                duration, _ = measure(lambda: sess.run(ys))
                log(fmt % (step, duration,
                           show_rate(tot_size * multiplier, duration)))

        timed_steps(warmup_steps,
                    'warmup step %d, took %.2fs, equivalent data rate: %s')
        timed_steps(bench_steps,
                    'step %d, took %.2fs, equivalent data rate: %s')
Beispiel #2
0
def all_reduce_benchmark(sizes, dtype, args):
    """Time a grouped all-reduce and report the per-step rates.

    Creates one ``tf.Variable`` of ones per entry of ``sizes``, reduces them
    with the implementation selected by ``args.method``, runs
    ``args.warmup_steps`` warmup steps followed by ``args.steps`` recorded
    steps, and finally passes the recorded rates to ``log_final_result`` when
    ``get_rank(args.method)`` is 0.

    Args:
        sizes: sized sequence of element counts, one tensor of shape ``[n]``
            per entry.
        dtype: element type of the created tensors.
        args: object providing ``method``, ``warmup_steps`` and ``steps``;
            it is also forwarded unchanged to ``log_final_result``.
    """
    method = args.method
    rank = _rank(method)

    def log(msg):
        # Every peer except rank 0 stays silent.
        if rank != 0:
            return
        print(msg)

    tensors = [tf.Variable(tf.ones([count], dtype)) for count in sizes]
    total_size = sum(map(_tensor_size, tensors))
    peers = get_cluster_size(method)
    # Total size scaled by 4 * (peers - 1); this is the quantity handed to
    # show_rate and used for the recorded rates.
    payload = total_size * (4 * (peers - 1))
    log('all reduce %d tensors of total size: %s among %d peers, using %s' %
        (len(sizes), show_size(total_size), peers, method))

    reduced = _group_all_reduce_func[method](tensors)

    init_op = tf.global_variables_initializer()

    # Divisor used to express the recorded rates in units of 2**30.
    gi = 1 << 30
    rates = []
    with tf.Session(config=_config(method)) as sess:

        def timed(fetches):
            # Run `fetches` once and return only the measured duration.
            elapsed, _ = measure(lambda: sess.run(fetches))
            return elapsed

        elapsed = timed(init_op)
        log('tensorflow init took %.fs' % (elapsed))

        for step in one_based_range(args.warmup_steps):
            elapsed = timed(reduced)
            rate_text = show_rate(payload, elapsed)
            log('warmup step %d, took %.2fs, equivalent data rate: %s' %
                (step, elapsed, rate_text))

        for step in one_based_range(args.steps):
            elapsed = timed(reduced)
            rates.append(payload / gi / elapsed)
            rate_text = show_rate(payload, elapsed)
            log('step %d, took %.2fs, equivalent data rate: %s' %
                (step, elapsed, rate_text))

    if get_rank(args.method) == 0:
        log_final_result(rates, args)
def run(sess, train_op, bcast_op):
    """Run the benchmark loop in ``sess``, re-syncing after cluster resizes.

    Each iteration executes ``train_op`` ``args.num_batches_per_iter`` times
    under ``timeit`` and logs the resulting images/second. After every
    iteration ``resize_cluster_from_url()`` is evaluated; it yields
    ``(changed, keep)``. When ``keep`` is false the function returns
    immediately without reporting final results. When ``changed`` is true,
    the next iteration first all-reduces the step counter with ``op='max'``
    and, if ``bcast_op`` is given, runs it again.

    Args:
        sess: session used for every ``run`` call in this function.
        train_op: fetch executed for each benchmarked batch.
        bcast_op: op run after each step sync, or ``None`` to skip it.

    Returns:
        None. Results are emitted through ``log`` and ``log_final_result``.
    """
    if args.num_batches_per_iter > 1:
        print('--num-batches-per-iter == 1 is highly recommended, using %d' %
              (args.num_batches_per_iter))
    # Imported locally so the function does not rely on a module-level
    # import that happens later in the script.
    from kungfu._utils import measure
    from kungfu.tensorflow.ops import all_reduce, resize_cluster_from_url
    step_place = tf.placeholder(dtype=tf.int32, shape=())
    sync_step_op = all_reduce(step_place, op='max')
    resize_op = resize_cluster_from_url()
    # Benchmark
    log('Running benchmark...')
    img_secs = []
    need_sync = True
    step = 0
    while step < args.num_iters:
        if need_sync:
            # Adopt the result of the 'max' all-reduce over the step counter
            # as the local step.
            new_step = sess.run(sync_step_op, feed_dict={step_place: step})
            if new_step != step:
                print('sync step : %d -> %d' % (step, new_step))
            step = new_step
            if bcast_op is not None:
                # Use the session passed in, not a module-level `session`.
                duration, _ = measure(lambda: sess.run(bcast_op))
                log('bcast_op took %.3fs' % (duration))
            need_sync = False
        step += 1
        # timeit.timeit returns the TOTAL time for `number` executions, so
        # the image count must cover all batches run in this iteration.
        elapsed = timeit.timeit(lambda: sess.run(train_op),
                                number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / elapsed
        log('Iter #%d: %.1f img/sec per %s' % (step, img_sec, device))
        img_secs.append(img_sec)

        changed, keep = sess.run(resize_op)
        if not keep:
            # Stop right away; final results are intentionally not reported
            # on this path (same as the original control flow).
            return
        if changed:
            need_sync = True

    img_sec_mean = np.mean(img_secs)
    # 1.96 * standard deviation is reported as the +- figure.
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log_final_result(img_sec_mean, img_sec_conf)
Beispiel #4
0
    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log_final_result(img_sec_mean, img_sec_conf)


# Driver: build the training op, then run the benchmark either eagerly or
# inside a tf.Session.
# NOTE(review): `loss` and `train_opt` are built before the eager/graph
# branch, while the eager branch passes the `loss_function` callable to
# `opt.minimize` instead -- confirm these two lines are valid when
# executing eagerly.
loss = loss_function()
train_opt = opt.minimize(loss)

if tf.executing_eagerly():
    # Eager path: each benchmark step minimizes `loss_function` over the
    # model's trainable variables on `device`.
    with tf.device(device):
        run(lambda: opt.minimize(loss_function,
                                 var_list=model.trainable_variables))
else:
    # Graph path: create the initializer (and optionally the broadcast op)
    # before opening the session.
    init = tf.global_variables_initializer()
    bcast_op = None
    if args.kf_optimizer:
        # The broadcast op is only built when args.kf_optimizer is set.
        from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
        bcast_op = BroadcastGlobalVariablesOp()
    with tf.Session(config=config) as session:
        from kungfu._utils import measure
        # measure(...) is unpacked as (duration, _); only the duration is
        # logged.
        duration, _ = measure(lambda: session.run(init))
        log('init took %.3fs' % (duration))
        if bcast_op:
            duration, _ = measure(lambda: session.run(bcast_op))
            log('bcast_op took %.3fs' % (duration))
        # NOTE(review): `run` is called here with a single callable; the
        # definition matching this call is not visible in this snippet.
        run(lambda: session.run(train_opt))
        # NOTE(review): `barrier_op` is not defined in the visible code --
        # presumably created earlier in the original script; verify.
        if barrier_op is not None:
            session.run(barrier_op)
        img_sec = args.batch_size / time
        log('Iter #%d: %.1f img/sec per %s' % (step, img_sec, device))
        img_secs.append(img_sec)

        changed, keep = sess.run(resize_op)
        if not keep:
            return
        if changed:
            need_sync = True

    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log_final_result(img_sec_mean, img_sec_conf)


# Driver: build the graph, initialize it in a session, and hand the session
# to run().
loss = loss_function()
train_op = opt.minimize(loss)

# The broadcast op is only built when args.kf_optimizer is set; otherwise
# None is passed to run().
bcast_op = None
if args.kf_optimizer:
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    bcast_op = BroadcastGlobalVariablesOp()
# Created after the optional broadcast op and before the session is opened.
init = tf.global_variables_initializer()
with tf.Session(config=config) as session:
    # Imported at module scope (inside the `with`), so `measure` is a
    # module-level name by the time run() is called below.
    from kungfu._utils import measure
    # measure(...) is unpacked as (duration, _); only the duration is logged.
    duration, _ = measure(lambda: session.run(init))
    log('init took %.3fs' % (duration))
    # NOTE(review): presumably binds to the run(sess, train_op, bcast_op)
    # defined earlier in this file -- the argument list matches.
    run(session, train_op, bcast_op)
# Printed once the session has been closed.
print('stopped')