Example #1
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))
        # logging.info("apply gradients is called here------------")
        if self._reshape_strategy:
            # logging.info("reshape on")
            reshape_strategy(1)
        else:
            # logging.info("reshape called with int 0")
            reshape_strategy(0)

        if self._nccl:
            # FIXME: We have a limitation that KungFu schedules NCCL operations
            # in the order of the given gradients. This order is sub-optimal
            # compared to the topological order of the dataflow. We work around
            # this issue by fusing all gradients. We still need to figure out
            # how to get the optimal topological order from TensorFlow.
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = group_nccl_all_reduce([fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = group_nccl_all_reduce(gradients)
        else:
            summed_gradients = group_all_reduce(gradients)

        reduced_grads = map_maybe(lambda g: g / self._num_workers,
                                  summed_gradients)

        # Re-zip gradients and variables: grads_and_vars is an iterator and can only be unzipped once.
        reduced_grads_and_vars = zip(reduced_grads, variables)

        return apply_grads_func(reduced_grads_and_vars, **kwargs)
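
The `fuse` and `defuse` helpers used above are imported from KungFu rather than defined in this example. A minimal sketch of what such helpers might look like, assuming they concatenate flattened gradients into one buffer and split it back afterwards (the actual KungFu implementation may differ):

import tensorflow as tf

def fuse(tensors):
    # Flatten each tensor and concatenate into a single 1-D buffer so that
    # one NCCL all-reduce covers every gradient at once.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def defuse(fused, shapes):
    # Split the reduced buffer back into tensors of the original shapes.
    sizes = [s.num_elements() for s in shapes]
    return [tf.reshape(f, s) for f, s in zip(tf.split(fused, sizes), shapes)]
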
def benchmark_step(first_batch):
    # Trigger the reshape strategy before the gradient exchange.
    reshape_strategy(reshape)
    # gradient calculation and updates
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.categorical_crossentropy(target, probs)

    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    if first_batch:
        from kungfu.tensorflow.initializer import broadcast_variables
        broadcast_variables(model.variables)
        broadcast_variables(opt.variables())
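
`benchmark_step` relies on module-level `model`, `data`, `target`, `opt` and `reshape`. A minimal sketch of how those globals could be set up with synthetic data; the shapes, the ResNet50 choice and `SynchronousSGDOptimizer` are illustrative assumptions, not part of the original example, and `reshape_strategy` (the author's custom helper) is assumed to be in scope:

import tensorflow as tf
from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer

batch_size = 32
data = tf.random.uniform([batch_size, 224, 224, 3])  # synthetic images
target = tf.one_hot(
    tf.random.uniform([batch_size], maxval=1000, dtype=tf.int32), 1000)
model = tf.keras.applications.ResNet50(weights=None)
opt = SynchronousSGDOptimizer(tf.optimizers.SGD(0.01))
reshape = False  # whether to trigger the custom reshape strategy (assumption)

benchmark_step(first_batch=True)   # first call also broadcasts initial state
benchmark_step(first_batch=False)  # regular step
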
def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    iteration_time = []
    for x in range(args.num_iters):
        # Run the reshape_strategy op before timing this iteration.
        session.run(reshape_strategy(reshape))

        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        log('iteration time: %.3fs' % time)
        img_secs.append(img_sec)
        iteration_time.append(time)

    # Results
    log('mean iteration time: %.3f' % np.mean(iteration_time))
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
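
Note that `img_sec_conf` above is 1.96 standard deviations of the per-iteration throughput, i.e. the spread of individual iterations under a normality assumption, not a confidence interval on the mean. If the latter is wanted, divide by the square root of the sample count; a small sketch:

import numpy as np

def mean_confidence(samples, z=1.96):
    # Half-width of a ~95% confidence interval on the mean,
    # assuming roughly normal, independent samples.
    samples = np.asarray(samples, dtype=np.float64)
    return samples.mean(), z * samples.std() / np.sqrt(len(samples))
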
Example #4
    val_acc_metric = tf.metrics.SparseCategoricalAccuracy()
    best_val_acc = 0

    time_log = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = f"tensorboard-logs/{args.name}/{time_log}"
    summary_writer = tf.summary.create_file_writer(log_dir, flush_millis=10000)

    step = 0
    with summary_writer.as_default():
        for epoch in range(NUM_EPOCHS):
            print('Start of epoch %d' % (epoch + 1, ))
            for batch, (images,
                        labels) in enumerate(train_dataset.take(NUM_STEPS)):

                # Reshape strategy before apply_gradients (and therefore before KungFu's AllReduce).
                reshape_strategy(reshape)

                t0 = time.time()
                probs, loss_value = training_step(mnist_model, opt, images,
                                                  labels, batch == 0)

                print('training step %d, took %s' %
                      (step, show_duration(time.time() - t0)))
                step += 1
                # print(f"batch number here is {batch}")
                # update training metric
                train_acc_metric(labels, probs)

                # Log the loss metric every 3rd step, on the 0th worker only.
                if step % 3 == 0 and current_rank() == 0:
                    # The original snippet is truncated here; a scalar summary
                    # is the evident intent given the writer context above.
                    tf.summary.scalar('loss', loss_value, step=step)
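
`training_step` is not shown in this example. A plausible minimal implementation, following the same GradientTape pattern as `benchmark_step` in Example #1; the sparse loss and the broadcast-on-first-batch behaviour are assumptions inferred from the `SparseCategoricalAccuracy` metric and the `batch == 0` argument:

import tensorflow as tf
from kungfu.tensorflow.initializer import broadcast_variables

def training_step(model, opt, images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = tf.losses.sparse_categorical_crossentropy(labels, probs)
    grads = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    if first_batch:
        # Sync the initial model and optimizer state across workers once.
        broadcast_variables(model.variables)
        broadcast_variables(opt.variables())
    return probs, loss_value
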
Example #5
    best_val_acc = 0

    time_log = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = f"tensorboard-logs/{args.name}/{time_log}"
    summary_writer = tf.summary.create_file_writer(
        log_dir, flush_millis=10000)

    step = 0
    with summary_writer.as_default():
        for epoch in range(NUM_EPOCHS):
            print('Start of epoch %d' % (epoch+1,))
            for batch, (images, labels) in enumerate(train_dataset.take(NUM_STEPS)):


                # Reshape strategy before apply_gradients (and therefore before KungFu's AllReduce).
                keep = reshape_strategy()
                if not keep:
                    print('reshape strategy failed to apply')

                t0 = time.time()
                probs, loss_value = training_step(
                    mnist_model, opt, images, labels, batch == 0)

                print('training step %d, took %s' %
                      (step, show_duration(time.time() - t0)))
                step += 1
                # print(f"batch number here is {batch}")
                # update training metric
                train_acc_metric(labels, probs)

                # Log the loss metric periodically, on the 0th worker only.
                # (The snippet is truncated here.)


def show_duration(duration):
    # Signature and sub-second guard reconstructed; only the tail of this
    # helper survived in the original snippet.
    if duration < 1:
        return '%.2fs' % duration
    sec = int(duration)
    mm, ss = sec // 60, sec % 60
    if duration < 3600:
        return '%dm%ds' % (mm, ss)
    return '%dh%dm%ds' % (mm // 60, mm % 60, ss)
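
Assuming the reconstructed sub-second guard above, the helper formats durations like this:

print(show_duration(0.25))  # 0.25s
print(show_duration(75))    # 1m15s
print(show_duration(3700))  # 1h1m40s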


import time

import numpy as np
import tensorflow as tf
from kungfu.tensorflow.ops import all_reduce

# reshape_strategy is the author's custom helper, defined elsewhere in these examples.

# x = tf.Variable(tf.ones([], dtype=tf.int32))
x = tf.ones((10, 1), dtype=tf.int32)
print(x.numpy())

steps = 10
reshape_times = []
for i in range(steps):

    # Reshape strategy before AllReduce, to bypass a straggler node.
    t1 = time.time()
    keep = reshape_strategy(debug=False)
    reshape_time = time.time() - t1
    print('reshape took %s' % show_duration(reshape_time))

    t0 = time.time()
    v = all_reduce(x)
    print('all reduce step %d, took %s' % (i, show_duration(time.time() - t0)))

    reshape_times.append(reshape_time)
    if not keep:
        break
print('mean reshape time: %.6fs' % np.mean(reshape_times))
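
A single process only reduces with itself, so this microbenchmark is meaningful with several workers. KungFu programs are normally launched through its kungfu-run CLI; a typical invocation (the script name is a placeholder) would be:

kungfu-run -np 4 python3 all_reduce_benchmark.py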