def main():
    """Sweep (ml_window, avg_window, merge_threshold) parameter triples on an
    ipyparallel cluster, averaging errors over repeated runs, and log results."""
    client = Client()
    # view = client.load_balanced_view()
    view = client[:]

    @view.parallel()
    @require('os')
    @require('shutil')
    @require('utils')
    @require('itertools')
    def run_experiment(parameter_sets):
        REPEAT_COUNT = 10
        results = []
        for ml_window, avg_window, merge_threshold in parameter_sets:
            # Per-engine scratch directory keyed by PID so parallel workers
            # don't collide.  (Renamed from `dir`, which shadowed the builtin.)
            work_dir = utils.TEMP_DIR + str(os.getpid())
            if not os.path.exists(work_dir):
                os.mkdir(work_dir)
            cumulative_errs = ()
            for _ in range(REPEAT_COUNT):
                errs = utils.experiment(os.path.join(utils.DATA_DIR, 'vh.fasta'),
                                        os.path.join(utils.DATA_DIR, 'vh.kabat'),
                                        ml_window, avg_window, merge_threshold,
                                        2000, work_dir)
                # Element-wise accumulation; zip_longest pads with 0 so runs may
                # return error tuples of differing lengths.
                cumulative_errs = tuple(sum(es) for es in
                                        itertools.zip_longest(cumulative_errs, errs, fillvalue=0))
            # shutil.rmtree(work_dir)
            # have to use loop instead of list comprehension, because embedding
            # REPEAT_COUNT in a comprehension results in
            # 'cannot pickle code object with closures' error
            final_errs = []
            for e in cumulative_errs:
                final_errs.append(e / REPEAT_COUNT)
            results.append(tuple(final_errs))
            with open(os.path.join(work_dir, 'log.txt'), 'a+') as log:
                utils.log_result(log, ml_window, avg_window, merge_threshold, *final_errs)
        return results

    # Odd window sizes 3..33 (range step replaces the old `range(...)[::2]` slice).
    parameter_sets = list(product(range(3, 34, 2), range(3, 34, 2), range(2, 10)))
    # parameter_sets = list(product([11, 13, 15], [3], [4, 5, 6, 7, 8, 9]))
    # parameter_sets = [(5, 5, 5)]
    # with open('missing.txt') as missing:
    #     parameter_sets = [(tuple(n for n in map(float, line.split()))) for line in missing]

    # BUG FIX: the original bound the result to a variable named `async`,
    # which is a reserved keyword since Python 3.7 (SyntaxError).
    async_results = run_experiment(parameter_sets)
    with open('log.txt', 'w') as log:
        for parameters, errs in zip_longest(parameter_sets, async_results):
            ml_window, avg_window, merge_threshold = parameters
            log_result(log, ml_window, avg_window, merge_threshold, *errs)
# Fragment of the evaluation/reporting tail of a TF2 training script.
# The epoch loop header, metric objects (train_loss, test_loss, ...), `t0`,
# `duration`, `model`, and `smp` are all defined outside this view.
for test_images, test_labels in test_ds:
    test_step(test_images, test_labels)
print(
    "Epoch {}, Loss: {}, Accuracy: {}, Test loss {}, test accuracy {}".format(
        epoch + 1,
        train_loss.result(),
        train_accuracy.result(),
        test_loss.result(),
        test_accuracy.result(),
    )
)
t1 = time.time()
# Tag timing/loss records with "(XLA)" when JIT compilation is enabled.
xla_str = " (XLA)" if tf.config.optimizer.get_jit() else ""
# NOTE(review): `duration` is not defined in this fragment — presumably set
# earlier; confirm it matches the t1 - t0 wall time asserted below.
log_result("Time-to-train" + xla_str, duration)
assert t1 - t0 < 72.0
save_path = "./tf2_conv_saved_model"
model.save_model(save_path)
if smp.rank() == 0:
    # BUG FIX: `ndarray` has no `asscalar` method — `np.asscalar` was a
    # module-level function (removed in numpy 1.23), so the old
    # "numpy < 1.16" fallback could only raise AttributeError, and its bare
    # `except:` swallowed any other error.  `ndarray.item()` exists in every
    # supported numpy version, so no fallback is needed.
    loss_scalar = train_loss.result().numpy().item()
    log_result("Training loss" + xla_str, loss_scalar)
    assert loss_scalar < 0.016
# Fragment of a distributed (smp) TF2 training epoch.  The epoch loop header,
# `step`, metric objects, `model`, and `smp` are defined outside this view.
train_accuracy.reset_states()
for batch, (images, labels) in enumerate(train_ds):
    if step == 1:
        # Start timing after the first step so graph tracing/warm-up is excluded.
        t0 = time.time()
    # `batch == 0` flags the first batch of the epoch to train_step (as a tensor).
    train_step(images, labels, tf.constant(batch == 0))
    step += 1
print(
    "Epoch {}, Accuracy: {}, Loss: {}".format(
        epoch + 1, train_accuracy.result(), train_loss.result()
    )
)
t1 = time.time()
log_result("Time-to-train", t1 - t0)
assert t1 - t0 < 220.0
save_path = "./hvd2_conv_saved_model_multinode"
model.save_model(save_path)
# NOTE(review): logging from mp_rank 1 rather than 0 — confirm intentional.
if smp.mp_rank() == 1:
    # BUG FIX: `ndarray` has no `asscalar` method — `np.asscalar` was a
    # module-level function (removed in numpy 1.23), so the old
    # "numpy < 1.16" fallback could only raise AttributeError, and its bare
    # `except:` swallowed any other error.  `ndarray.item()` exists in every
    # supported numpy version, so no fallback is needed.
    loss_scalar = train_loss.result().numpy().item()
    log_result("Training loss", loss_scalar)
    assert loss_scalar < 0.008
# Fragment of an SWA (Stochastic Weight Averaging) training epoch.  The batch
# loop header supplying `i`, `input`, `target`, and the counters' initial
# values lies outside this view; statement order here is load-bearing.

# One quantized training step: updates the model and returns the running
# loss/accuracy counters threaded through it.
loss_sum, correct, ttl = utils.train_batch(
    epoch, i, loss_sum, correct, ttl, input, target, model, criterion,
    optimizer, weight_quantizer, grad_quantizer, acc_quantizer, writer,
    quantize_momentum=args.quantize_momentum)

# Once the SWA phase begins, fold the current weights into the running average
# with weight 1/(swa_n + 1), so every collected snapshot is weighted equally.
if (epoch + 1) >= args.swa_start:
    utils.moving_average(swa_model, model, 1.0 / (swa_n + 1))
    swa_n += 1

# Bring the accuracy counter off the device to a plain Python number.
correct = correct.cpu().item()
train_res = {
    'loss': loss_sum / float(ttl),
    'accuracy': correct / float(ttl) * 100.0,
}
utils.log_result(writer, "train", train_res, epoch+1)

# Validation : SGD performance
# defaultdict(None) lets log_result read keys eval() may not have populated.
test_res = defaultdict(lambda : None)
# Recompute BatchNorm running statistics on the training set before evaluating.
utils.bn_update(loaders['train'], model)
test_res.update(utils.eval(loaders['test'], model, criterion))
utils.log_result(writer, "test", test_res, epoch+1)
if (epoch + 1) == args.swa_start:
    # Snapshot plain-SGD accuracy exactly at the SWA switch-over epoch.
    sgd_acc = test_res['accuracy']

# Validation : SWA performance
swa_te_res = defaultdict(lambda : None)
if (epoch + 1) >= args.swa_start:
    utils.bn_update(loaders['train'], swa_model)
    swa_te_res.update(utils.eval(loaders['test'], swa_model, criterion))
    utils.log_result(writer, "test_swa", swa_te_res, epoch+1)