# Esempio n. 1 (scrape-artifact marker, commented out so the file parses)
# 0
def main():
    """Grid-search segmentation parameters on an ipyparallel cluster.

    Builds a direct view over all engines, ships ``run_experiment`` to them,
    and writes the averaged error rates for every parameter combination to
    ``log.txt``.
    """
    client = Client()
    # view = client.load_balanced_view()
    view = client[:]

    @view.parallel()
    @require('os')
    @require('shutil')
    @require('utils')
    @require('itertools')
    def run_experiment(parameter_sets):
        REPEAT_COUNT = 10
        results = []
        for ml_window, avg_window, merge_threshold in parameter_sets:

            # Per-engine scratch directory keyed by PID so parallel workers
            # never collide on disk. (Renamed from `dir`, which shadows the
            # builtin.)
            work_dir = utils.TEMP_DIR + str(os.getpid())
            if not os.path.exists(work_dir):
                os.mkdir(work_dir)
            cumulative_errs = ()
            for _ in range(REPEAT_COUNT):
                errs = utils.experiment(os.path.join(utils.DATA_DIR, 'vh.fasta'), os.path.join(utils.DATA_DIR, 'vh.kabat'), ml_window, avg_window, merge_threshold, 2000, work_dir)
                # zip_longest with fillvalue=0 lets the first iteration start
                # from the empty tuple.
                cumulative_errs = tuple(sum(es) for es in itertools.zip_longest(cumulative_errs, errs, fillvalue=0))
            # shutil.rmtree(work_dir)

            # have to use loop instead of list comprehension, because embedding REPEAT_COUNT in a comprehension results
            # in 'cannot pickle code object with closures' error
            final_errs = []
            for e in cumulative_errs:
                final_errs.append(e / REPEAT_COUNT)
            results.append(tuple(final_errs))

            with open(os.path.join(work_dir, 'log.txt'), 'a+') as log:
                utils.log_result(log, ml_window, avg_window, merge_threshold, *final_errs)

        return results

    # Odd window sizes 3..33 crossed with merge thresholds 2..9
    # (range(3, 34, 2) replaces the needless range(3, 34)[::2] slice).
    parameter_sets = list(product(range(3, 34, 2), range(3, 34, 2), range(2, 10)))
    # parameter_sets = list(product([11, 13, 15], [3], [4, 5, 6, 7, 8, 9]))
    # parameter_sets = [(5, 5, 5)]
    # with open('missing.txt') as missing:
        # parameter_sets = [(tuple(n for n in map(float, line.split()))) for line in missing]
    # BUG FIX: `async` has been a reserved keyword since Python 3.7, so the
    # original `async = run_experiment(...)` was a SyntaxError. Renamed.
    async_results = run_experiment(parameter_sets)

    with open('log.txt', 'w') as log:
        for parameters, errs in zip_longest(parameter_sets, async_results):
            ml_window, avg_window, merge_threshold = parameters
            log_result(log, ml_window, avg_window, merge_threshold, *errs)

    # NOTE(review): everything below references names never defined in this
    # function (test_ds, test_step, epoch, train_loss, ...) — it appears to
    # be pasted in from a different (TensorFlow) snippet. Left untouched.
    for test_images, test_labels in test_ds:
        test_step(test_images, test_labels)

    print(
        "Epoch {}, Loss: {}, Accuracy: {}, Test loss {}, test accuracy {}".format(
            epoch + 1,
            train_loss.result(),
            train_accuracy.result(),
            test_loss.result(),
            test_accuracy.result(),
        )
    )

# Report wall-clock training time — tagged "(XLA)" when the JIT is on —
# and enforce this run's time budget.
t1 = time.time()
if tf.config.optimizer.get_jit():
    xla_str = " (XLA)"
else:
    xla_str = ""
log_result("Time-to-train" + xla_str, duration)
assert t1 - t0 < 72.0


# Persist the trained model for later serving/inspection.
save_path = "./tf2_conv_saved_model"
model.save_model(save_path)

if smp.rank() == 0:
    # Convert the 0-d loss tensor to a plain Python float for logging and
    # the threshold assertion below.
    loss_np = train_loss.result().numpy()
    try:
        loss_scalar = loss_np.item()  # numpy >= 1.16
    except AttributeError:
        # BUG FIX: was a bare `except:` calling `loss_np.asscalar()`, but
        # ndarray has no `.asscalar()` method (it was the module-level
        # numpy.asscalar, removed in numpy 1.23), so the fallback itself
        # crashed. float() converts any 0-d array on every numpy version.
        loss_scalar = float(loss_np)

    log_result("Training loss" + xla_str, loss_scalar)
    assert loss_scalar < 0.016
    train_accuracy.reset_states()

    # Timed training loop: the clock starts on step 1 (the second step) so
    # the first, graph-tracing warm-up step is excluded from the measurement.
    for batch, (images, labels) in enumerate(train_ds):
        if step == 1:
            t0 = time.time()
        train_step(images, labels, tf.constant(batch == 0))
        step += 1

    print(
        "Epoch {}, Accuracy: {}, Loss: {}".format(
            epoch + 1, train_accuracy.result(), train_loss.result()
        )
    )

# Report wall-clock training time for the multi-node run and enforce its
# (larger) time budget.
t1 = time.time()
elapsed = t1 - t0
log_result("Time-to-train", elapsed)
assert elapsed < 220.0

# Persist the trained multi-node model.
save_path = "./hvd2_conv_saved_model_multinode"
model.save_model(save_path)

if smp.mp_rank() == 1:
    # Log and bound the final training loss on model-parallel rank 1.
    loss_np = train_loss.result().numpy()
    try:
        loss_scalar = loss_np.item()  # numpy >= 1.16
    except AttributeError:
        # BUG FIX: was a bare `except:` calling `loss_np.asscalar()`, but
        # ndarray has no `.asscalar()` method — that fallback always raised.
        # float() converts any 0-d array on every numpy version.
        loss_scalar = float(loss_np)

    log_result("Training loss", loss_scalar)
    assert loss_scalar < 0.008
# Esempio n. 4 (scrape-artifact marker, commented out so the file parses)
# 0
        # NOTE(review): this chunk begins mid-way through an (unseen)
        # per-batch training loop — the enclosing `for` and epoch headers
        # are outside this view, so the fragment is documented in place
        # rather than rewritten.
        loss_sum, correct, ttl = utils.train_batch(
            epoch, i, loss_sum, correct, ttl,
            input, target, model, criterion, optimizer,
            weight_quantizer, grad_quantizer, acc_quantizer, writer,
            quantize_momentum=args.quantize_momentum)

    # Once past the SWA start epoch, fold the current weights into the
    # running SWA average with weight 1/(swa_n + 1).
    if (epoch + 1) >= args.swa_start:
        utils.moving_average(swa_model, model, 1.0 / (swa_n + 1))
        swa_n += 1

    # Epoch summary: mean loss and accuracy (%) over `ttl` samples.
    correct = correct.cpu().item()
    train_res = {
        'loss': loss_sum / float(ttl),
        'accuracy': correct / float(ttl) * 100.0,
    }
    utils.log_result(writer, "train", train_res, epoch+1)

    # Validation : SGD performance
    # defaultdict so metrics missing from utils.eval read as None
    # instead of raising KeyError.
    test_res = defaultdict(lambda : None)
    # bn_update presumably refreshes BatchNorm running statistics on the
    # train loader before evaluation — confirm against utils.
    utils.bn_update(loaders['train'], model)
    test_res.update(utils.eval(loaders['test'], model, criterion))
    utils.log_result(writer, "test", test_res, epoch+1)
    # Capture the plain-SGD accuracy exactly at the SWA switch-over epoch.
    if (epoch + 1) == args.swa_start:
        sgd_acc = test_res['accuracy']

    # Validation : SWA performance
    swa_te_res = defaultdict(lambda : None)
    if (epoch + 1) >= args.swa_start:
        utils.bn_update(loaders['train'], swa_model)
        swa_te_res.update(utils.eval(loaders['test'], swa_model, criterion))
        utils.log_result(writer, "test_swa", swa_te_res, epoch+1)