def testMetric(self):
    track.init(trial_name="test_log")
    session = track.get_session()
    for i in range(5):
        track.log(test=i)
    result_path = os.path.join(session.logdir, EXPR_RESULT_FILE)
    self.assertTrue(_check_json_val(result_path, "test", i))
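# The tests here depend on a _check_json_val helper that is not shown. A
# minimal sketch of what it might look like, assuming EXPR_RESULT_FILE is a
# JSON-lines file with one record per track.log call (both are assumptions):
import json

def _check_json_val(fname, key, val):
    with open(fname, "r") as f:
        records = [json.loads(line) for line in f if line.strip()]
    # true if any logged record carries the expected key/value pair
    return any(rec.get(key) == val for rec in records)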
def ax_train_proxy(model_params, config_params, ax_params):
    rcml = RapidsCloudML(cloud_type=config_params['cloud_type'],
                         model_type=config_params['model_type'],
                         compute_type=f"single-{args.compute_type}",
                         CSP_paths=config_params['paths'])

    # environment check
    rcml.environment_check()

    # ingest data [ post pre-processing ]
    dataset, col_labels, y_label, ingest_time = rcml.load_data(
        filename=config_params['dataset_filename'])
    rcml.query_memory()

    # classification objective requires int32 label for cuml random forest
    dataset[y_label] = dataset[y_label].astype('int32')

    accuracy_per_fold = []
    train_time_per_fold = []
    infer_time_per_fold = []
    split_time_per_fold = []
    global_best_model = None
    global_best_test_accuracy = 0

    model_params["max_depth"] = ax_params["max_depth"]
    model_params["max_features"] = ax_params["max_features"]
    model_params["n_estimators"] = ax_params["n_estimators"]

    # optional cross-validation w/ model_params['n_train_folds'] > 1
    for i_train_fold in range(config_params['CV_folds']):
        print(f"STARTING TRAINING FOLD {i_train_fold}", flush=True)
        rcml.log_to_file(f"\n CV fold {i_train_fold} of {config_params['CV_folds']}\n")

        # split data
        X_train, X_test, y_train, y_test, split_time = rcml.split_data(
            dataset=dataset, y_label=y_label,
            random_state=i_train_fold, shuffle=True)
        split_time_per_fold += [round(split_time, 4)]

        # train model
        trained_model, training_time = rcml.train_model(X_train, y_train, model_params)
        train_time_per_fold += [round(training_time, 4)]

        # evaluate perf
        test_accuracy, infer_time = rcml.evaluate_test_perf(trained_model, X_test, y_test)
        accuracy_per_fold += [round(test_accuracy, 4)]
        infer_time_per_fold += [round(infer_time, 4)]

        # update best model [ assumes maximization of perf metric ]
        if test_accuracy > global_best_test_accuracy:
            global_best_test_accuracy = test_accuracy
            global_best_model = trained_model

    rcml.log_to_file(f'\n accuracy per fold : {accuracy_per_fold} \n')
    rcml.log_to_file(f'\n train-time per fold : {train_time_per_fold} \n')
    rcml.log_to_file(f'\n infer-time per fold : {infer_time_per_fold} \n')
    rcml.log_to_file(f'\n split-time per fold : {split_time_per_fold} \n')

    track.log(accuracy=global_best_test_accuracy)
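# Hedged usage note: ax_params is presumably the parameterization handed in by
# an Ax (Adaptive Experimentation) optimization loop. A hypothetical wrapper,
# with model_params/config_params closed over from the launch script, might be:
#
#     def evaluate(parameterization):
#         ax_train_proxy(model_params, config_params, ax_params=parameterization)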
def fn(config):
    if names == ['config']:
        output = train_func(config)
    else:
        output = train_func(**config)
    output = float(output)
    track.log(**{SCORE_NAME: output})
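# Hedged usage note: fn closes over names (the inspected parameter names of
# train_func), train_func itself, and the SCORE_NAME constant, none of which
# are shown here. Illustratively, if train_func(lr, momentum) returns a score
# and names == ["lr", "momentum"], then
#
#     fn({"lr": 0.01, "momentum": 0.9})
#
# calls train_func(lr=0.01, momentum=0.9) and logs {SCORE_NAME: score}.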
def testLocalMetrics(self):
    """Checks that metric state is updated correctly."""
    track.init(trial_name="test_logs")
    session = track.get_session()
    self.assertEqual(set(session.trial_config.keys()), {"trial_id"})

    result_path = os.path.join(session.logdir, EXPR_RESULT_FILE)
    track.log(test=1)
    self.assertTrue(_check_json_val(result_path, "test", 1))
    track.log(iteration=1, test=2)
    self.assertTrue(_check_json_val(result_path, "test", 2))
def crosseval_configuration(config):
    global data
    if data is None:
        data = get_data()
    splitter = ShuffleSplit(n_splits=3, test_size=0.2, random_state=111)
    splits = [(train, train[:10], test)
              for train, test in splitter.split(X=range(len(data)))]
    m_scores_std_scores = calc_mean_std_scores(
        lambda: data,
        partial(score_spacycrfsuite_tagger, params=config),
        splits)
    track.log(f1=m_scores_std_scores['m_scores']['f1-test'])
def train_mnist(config):
    model = ConvNet()
    train_loader, test_loader = get_data_loaders()
    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])
    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        track.log(mean_accuracy=acc)
        if i % 5 == 0:
            # This saves the model to the trial directory
            torch.save(model, "./model.pth")
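# A minimal launch sketch for train_mnist, assuming the old ray.tune.track API
# that these snippets use; the search-space values are illustrative only:
from ray import tune

analysis = tune.run(
    train_mnist,
    config={
        "lr": tune.grid_search([0.001, 0.01, 0.1]),
        "momentum": tune.uniform(0.1, 0.9),
    },
)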
def train_mnist(config):
    use_cuda = config.get("use_gpu") and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    train_loader, test_loader = get_data_loaders()
    model = Net(config).to(device)
    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])
    while True:
        train(model, optimizer, train_loader, device)
        acc = test(model, test_loader, device)
        track.log(mean_accuracy=acc)
def testme(config):
    for i in range(config["iters"]):
        track.log(iteration=i, hi="test")
def train_eval(model, tokenizer, config, tb_writer=None):
    batch_size, max_seq_length = get_batch_size_seq_length(config['batch_size_seq_length'])

    ####################################################################
    # TRAINING

    # init training structure
    train_dataset = load_and_cache_examples(task, tokenizer, max_seq_length)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

    model.to(device)
    model.zero_grad()

    t_total = len(train_dataloader) * args.num_epochs
    logging_steps = int(t_total / args.num_logging_steps)
    warmup_steps = math.ceil(t_total * args.warmup_ratio)

    optimizer = AdamW(model.parameters(), lr=config['lr'], eps=args.adam_epsilon)
    lr_types = {
        "constant": get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps),
        "linear": get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total),
        "cosine": get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total),
    }
    scheduler = lr_types[config['lr_type']]

    # start training
    print(" ******************************************* ")
    print(" Running Training ")
    print(" *******************************************\n ")
    print(" Model: " + str(args.model))
    print(" Length dataset training: " + str(len(train_dataset)))
    print(" Length dataloader training: " + str(len(train_dataloader)))
    print(" Number of epochs: " + str(args.num_epochs))
    print(" Batch size: " + str(batch_size))
    print(" Maximal sequence length: " + str(max_seq_length))
    print(" Learning rate: " + str(config['lr']))
    print(" Learning rate type: " + str(config['lr_type']))
    print(" Total optimization steps: " + str(t_total))
    print(" Steps between logging: " + str(logging_steps))
    print(" Number of logging steps: " + str(args.num_logging_steps))
    print(" *******************************************\n")

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    num_epoch = 0
    train_iterator = trange(int(args.num_epochs), desc="Epoch")
    for _ in train_iterator:
        epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      # Distilbert and Roberta don't use segment_ids
                      'token_type_ids': batch[2] if args.model in ['bert', 'xlnet', 'albert'] else None,
                      'labels': batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]

            # as backup check to see if model is still running
            print("\r%f" % loss, end='')

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()

            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

            # tracking loss has to be AFTER optimization steps were taken
            if logging_steps > 0 and global_step % logging_steps == 0:
                track.log(lr=scheduler.get_lr()[0])
                track.log(loss_train=(tr_loss - logging_loss) / logging_steps)
                logging_loss = tr_loss

        num_epoch += 1
        if num_epoch == args.num_epochs:
            output_dir = os.path.join(checkpoint_dir, 'checkpoint-{}'.format(num_epoch))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            # necessary for distributed/parallel training
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir)
            logger.info("Saving model checkpoint to %s", output_dir)

    ####################################################################
    # EVALUATION

    # init eval structure
    eval_dataset = load_and_cache_examples(task, tokenizer, max_seq_length, evaluate=True)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)

    # start evaluation
    print(" ******************************************* ")
    print(" Running Evaluation ")
    print(" *******************************************\n ")
    print(" Length dataset evaluation: " + str(len(eval_dataset)))
    print(" Length dataloader evaluation: " + str(len(eval_dataloader)))
    print(" Number of epochs: " + str(args.num_epochs))
    print(" Batch size: " + str(batch_size))
    print(" ******************************************* \n ")

    eval_loss, eval_tr_loss = 0.0, 0.0
    nb_eval_steps = 0
    out_label_ids = None

    checkpoints = list(os.path.dirname(c) for c in sorted(
        glob.glob(checkpoint_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
    logger.info("Evaluate the following checkpoints: %s", checkpoints)

    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        model = model_class.from_pretrained(checkpoint)
        model.to(device)

        # reinitialize preds for every checkpoint
        preds = None
        for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          # Distilbert and Roberta don't use segment_ids
                          'token_type_ids': batch[2] if args.model in ['bert', 'xlnet', 'albert'] else None,
                          'labels': batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                # for multi gpu, use mean() functionality
                eval_loss += tmp_eval_loss.item()
            nb_eval_steps += 1

            # calculate average loss within epoch
            eval_tr_loss = eval_loss / nb_eval_steps
            track.log(loss_eval_batch=eval_tr_loss)
            # log_scalar('loss_eval_batch', eval_tr_loss, nb_eval_steps, tb_writer=tb_writer)

            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids,
                                          inputs['labels'].detach().cpu().numpy(), axis=0)

        track.log(loss_eval_epoch=(eval_loss / nb_eval_steps))

        preds = np.argmax(preds, axis=1)
        acc = accuracy_score(out_label_ids, preds)
        fnc_score, conf_matrix = score_submission(preds=preds, labels=out_label_ids)
        fnc_score_best, _ = score_submission(preds=out_label_ids, labels=out_label_ids)
        fnc_score_rel = (fnc_score * 100) / fnc_score_best
        f1, f1_scores = get_f1_overall(labels=labels, conf_matrix=conf_matrix)

        print("\n*******************************************")
        print("EVALUATION OF CHECKPOINT " + checkpoint)
        print_confusion_matrix(conf_matrix)
        print("Score: " + str(fnc_score) + " out of " + str(fnc_score_best)
              + "\t(" + str(fnc_score * 100 / fnc_score_best) + "%)")
        print("Accuracy: " + str(acc))
        print("F1 overall: " + str(f1))
        print("F1 per class: " + str(f1_scores))
        print("*******************************************\n")

        track.log(acc=acc)
        track.log(fnc_score=fnc_score)
        track.log(fnc_score_best=fnc_score_best)
        track.log(fnc_score_rel=fnc_score_rel)
        track.log(f1_overall=f1)
def train_L2_2D(model, problem, method='unsupervised', niters=100,
                lr=1e-3, betas=(0, 0.9), lr_schedule=True, gamma=0.999,
                obs_every=1, d1=1, d2=1, log=True, plot=True, save=False,
                dirname='train_L2', config=None, loss_fn=None,
                save_for_animation=False, **kwargs):
    """
    Train/test Lagaris method: supervised/semisupervised/unsupervised
    """
    assert method in ['supervised', 'semisupervised', 'unsupervised'], \
        f'Method {method} not understood!'

    dirname = os.path.join(this_dir, '../experiments/runs', dirname)
    if plot and save:
        handle_overwrite(dirname)

    # validation: fixed grid/solution
    x, y = problem.get_grid()
    grid = torch.cat((x, y), 1)
    sol = problem.get_solution(x, y)

    # optimizers & loss functions
    opt = torch.optim.Adam(model.parameters(), lr=lr, betas=betas)
    if loss_fn:
        # look up the requested torch.nn loss class by name
        mse = getattr(torch.nn, loss_fn)()
    else:
        mse = torch.nn.MSELoss()

    # lr scheduler
    if lr_schedule:
        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=opt, gamma=gamma)

    loss_trace = []
    mses = {'train': [], 'val': []}
    preds = {'pred': [], 'soln': []}

    for i in range(niters):
        xs, ys = problem.get_grid_sample()
        grid_samp = torch.cat((xs, ys), 1)
        pred = model(grid_samp)
        residuals = problem.get_equation(pred, xs, ys)
        loss = mse(residuals, torch.zeros_like(residuals))
        loss_trace.append(loss.item())

        # train MSE: grid sample vs true soln
        # grid_samp, sort_ids = torch.sort(grid_samp, axis=0)
        pred = model(grid_samp)
        try:
            pred_adj = problem.adjust(pred, xs, ys)['pred']
            sol_samp = problem.get_solution(xs, ys)
            train_mse = mse(pred_adj, sol_samp).item()
        except Exception as e:
            print(f'Exception: {e}')
            train_mse = float('nan')  # keep the trace aligned if adjust fails
        mses['train'].append(train_mse)

        # val MSE: fixed grid vs true soln
        val_pred = model(grid)
        val_pred_adj = problem.adjust(val_pred, x, y)['pred']
        val_mse = mse(val_pred_adj, sol).item()
        mses['val'].append(val_mse)

        # store preds for animation
        preds['pred'].append(val_pred_adj.detach())
        preds['soln'].append(sol.detach())

        try:
            if (i + 1) % 10 == 0:
                # mean of val mses for last 10 steps
                track.log(mean_squared_error=np.mean(mses['val'][-10:]))
        except Exception as e:
            # print(f'Caught exception {e}')
            pass

        if log:
            print(
                f'Step {i}: Loss {loss.item():.4e} | Train MSE {train_mse:.4e} | Val MSE {val_mse:.4e}'
            )

        opt.zero_grad()
        loss.backward(retain_graph=True)
        opt.step()
        if lr_schedule:
            lr_scheduler.step()

    if plot:
        loss_dict = {}
        if method == 'supervised':
            loss_dict['$L_S$'] = loss_trace
        elif method == 'semisupervised':
            loss_dict['$L_S$'] = [l[0] for l in loss_trace]
            loss_dict['$L_U$'] = [l[1] for l in loss_trace]
        else:
            loss_dict['$L_U$'] = loss_trace
        save_to = os.path.join(this_dir, '../experiments/runs', dirname)
        pred_dict, diff_dict = problem.get_plot_dicts(model(grid), x, y, sol)
        plot_results(mses, loss_dict, grid.detach(), pred_dict,
                     diff_dict=diff_dict, save=save, dirname=dirname,
                     logloss=True, alpha=0.7)

    if save:
        write_config(config, os.path.join(dirname, 'config.yaml'))

    if save_for_animation:
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        anim_dir = os.path.join(dirname, "animation")
        print(f'Saving animation traces to {anim_dir}')
        if not os.path.exists(anim_dir):
            os.mkdir(anim_dir)
        np.save(os.path.join(anim_dir, "grid"), grid.detach())
        for k, v in preds.items():
            v = np.hstack(v)
            # TODO: for systems (i.e. multi-dim preds),
            # hstack flattens preds, need to use dstack
            # v = np.dstack(v)
            np.save(os.path.join(anim_dir, f"{k}_pred"), v)

    return {'mses': mses, 'model': model, 'losses': loss_trace}
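# Hedged note on the metric above: track.log(mean_squared_error=...) fires
# only every 10th iteration and reports a 10-step rolling mean of the
# validation MSE, so any Tune scheduler or stopping rule keyed on
# "mean_squared_error" sees the smoothed series, not the raw per-step values.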
def train_GAN_2D(G, D, problem, method='unsupervised', niters=100,
                 g_lr=1e-3, g_betas=(0.0, 0.9), d_lr=1e-3, d_betas=(0.0, 0.9),
                 lr_schedule=True, gamma=0.999, obs_every=1, d1=1., d2=1.,
                 G_iters=1, D_iters=1, wgan=True, gp=0.1, conditional=True,
                 log=True, plot=True, save=False, dirname='train_GAN',
                 config=None, save_for_animation=False, **kwargs):
    """
    Train/test GAN method: supervised/semisupervised/unsupervised
    """
    assert method in ['supervised', 'semisupervised', 'unsupervised'], \
        f'Method {method} not understood!'

    dirname = os.path.join(this_dir, '../experiments/runs', dirname)
    if plot and save:
        handle_overwrite(dirname)

    # validation: fixed grid/solution
    x, y = problem.get_grid()
    grid = torch.cat((x, y), 1)
    soln = problem.get_solution(x, y)

    # observer mask and masked grid/solution (t_obs/y_obs)
    observers = torch.arange(0, len(grid), obs_every)
    # grid_obs = grid[observers, :]
    # soln_obs = soln[observers, :]

    # labels
    real_label = 1
    fake_label = -1 if wgan else 0
    real_labels = torch.full((len(grid), ), real_label).reshape(-1, 1)
    fake_labels = torch.full((len(grid), ), fake_label).reshape(-1, 1)

    # masked label vectors
    real_labels_obs = real_labels[observers, :]
    fake_labels_obs = fake_labels[observers, :]

    # optimization
    optiG = torch.optim.Adam(G.parameters(), lr=g_lr, betas=g_betas)
    optiD = torch.optim.Adam(D.parameters(), lr=d_lr, betas=d_betas)
    if lr_schedule:
        lr_scheduler_G = torch.optim.lr_scheduler.ExponentialLR(
            optimizer=optiG, gamma=gamma)
        lr_scheduler_D = torch.optim.lr_scheduler.ExponentialLR(
            optimizer=optiD, gamma=gamma)

    # losses
    mse = nn.MSELoss()
    bce = nn.BCELoss()
    wass = lambda y_true, y_pred: torch.mean(y_true * y_pred)
    criterion = wass if wgan else bce

    # history
    losses = {'G': [], 'D': []}
    mses = {'train': [], 'val': []}
    preds = {'pred': [], 'soln': []}

    for epoch in range(niters):

        # Train Generator
        for p in D.parameters():
            p.requires_grad = False  # turn off computation for D

        for i in range(G_iters):
            xs, ys = problem.get_grid_sample()
            grid_samp = torch.cat((xs, ys), 1)
            pred = G(grid_samp)
            residuals = problem.get_equation(pred, xs, ys)

            # idea: add noise to relax from dirac delta at 0 to distb'n
            # + torch.normal(0, .1/(i+1), size=residuals.shape)
            real = torch.zeros_like(residuals)
            fake = residuals

            optiG.zero_grad()
            g_loss = criterion(D(fake), real_labels)
            # g_loss = criterion(D(fake), torch.ones_like(fake))
            g_loss.backward(retain_graph=True)
            optiG.step()

        # Train Discriminator
        for p in D.parameters():
            p.requires_grad = True  # turn on computation for D

        for i in range(D_iters):
            if wgan:
                norm_penalty = calc_gradient_penalty(D, real, fake, gp, cuda=False)
            else:
                norm_penalty = torch.zeros(1)

            # print(real.shape, fake.shape)
            real_loss = criterion(D(real), real_labels)
            # real_loss = criterion(D(real), torch.ones_like(real))
            fake_loss = criterion(D(fake), fake_labels)
            # fake_loss = criterion(D(fake), torch.zeros_like(fake))

            optiD.zero_grad()
            d_loss = (real_loss + fake_loss) / 2 + norm_penalty
            d_loss.backward(retain_graph=True)
            optiD.step()

        losses['D'].append(d_loss.item())
        losses['G'].append(g_loss.item())

        if lr_schedule:
            lr_scheduler_G.step()
            lr_scheduler_D.step()

        # train MSE: grid sample vs true soln
        # grid_samp, sort_ids = torch.sort(grid_samp, axis=0)
        pred = G(grid_samp)
        pred_adj = problem.adjust(pred, xs, ys)['pred']
        sol_samp = problem.get_solution(xs, ys)
        train_mse = mse(pred_adj, sol_samp).item()
        mses['train'].append(train_mse)

        # val MSE: fixed grid vs true soln
        val_pred = G(grid)
        val_pred_adj = problem.adjust(val_pred, x, y)['pred']
        val_mse = mse(val_pred_adj, soln).item()
        mses['val'].append(val_mse)

        # save preds for animation
        preds['pred'].append(val_pred_adj.detach())
        preds['soln'].append(soln.detach())

        try:
            if (epoch + 1) % 10 == 0:
                # mean of val mses for last 10 steps
                track.log(mean_squared_error=np.mean(mses['val'][-10:]))
                # mean of G - D loss for last 10 steps
                # loss_diff = np.mean(np.abs(losses['G'][-10] - losses['D'][-10]))
                # track.log(mean_squared_error=loss_diff)
        except Exception as e:
            # print(f'Caught exception {e}')
            pass

        if log:
            print(
                f'Step {epoch}: G Loss: {g_loss.item():.4e} | D Loss: {d_loss.item():.4e} | Train MSE {train_mse:.4e} | Val MSE {val_mse:.4e}'
            )

    if plot:
        pred_dict, diff_dict = problem.get_plot_dicts(G(grid), x, y, soln)
        plot_results(mses, losses, grid.detach(), pred_dict,
                     diff_dict=diff_dict, save=save, dirname=dirname,
                     logloss=False, alpha=0.7)

    if save:
        write_config(config, os.path.join(dirname, 'config.yaml'))

    if save_for_animation:
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        anim_dir = os.path.join(dirname, "animation")
        print(f'Saving animation traces to {anim_dir}')
        if not os.path.exists(anim_dir):
            os.mkdir(anim_dir)
        np.save(os.path.join(anim_dir, "grid"), grid.detach())
        for k, v in preds.items():
            v = np.hstack(v)
            # TODO: for systems (i.e. multi-dim preds),
            # hstack flattens preds, need to use dstack
            # v = np.dstack(v)
            np.save(os.path.join(anim_dir, f"{k}_pred"), v)

    return {'mses': mses, 'model': G, 'losses': losses}
def on_epoch_end(self, epoch, logs=None):
    logs = logs or {}
    self.iteration += 1
    track.log(keras_info=logs,
              mean_accuracy=logs.get("accuracy"),
              mean_loss=logs.get("loss"))
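# Hedged usage note: on_epoch_end above is a method of a
# keras.callbacks.Callback subclass (the class name here is an assumption),
# attached to training roughly as:
#
#     model.fit(x_train, y_train,
#               validation_data=(x_test, y_test),
#               callbacks=[TuneReporterCallback()])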
    return train_loader, valid_loader, test_loader


def train_dyn(config):
    # use config to choose from grid search
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_loader, valid_loader, test_loader = get_data_loaders(csv="long_states_dt.csv")
    model = Dyn_NN(nx=5, ny=4, nh=50)
    optimizer = torch.optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])
    while True:
        train_loss = train(model, optimizer, train_loader, device)
        test_loss = test(model, test_loader, device)
        # TODO: check how to track loss instead of acc
        track.log(mean_loss=test_loss)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="PyTorch Car dynamics model")
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    parser.add_argument(
        "--ray-redis-address",
        help="Address of Ray cluster for seamless distributed execution.")
    args = parser.parse_args()

    if args.ray_redis_address:
        ray.init(redis_address=args.ray_redis_address)

    sched = AsyncHyperBandScheduler(
        time_attr="training_iteration", metric="mean_loss")
def train_hypopt(config):
    dataset = DataLoader(path=config["filename"], split=0.80, cols=['log_ret'],
                         start_from="1985-01-01", end="1995-01-01",
                         label_col='log_ret', MinMax=False)
    timesteps = config["timesteps"]
    train_dt = dataset.get_train_data(timesteps, config["window_normalisation"],
                                      config["num_forward"])
    test_dt = dataset.get_test_data(timesteps, config["window_normalisation"],
                                    config["num_forward"])

    # Parameters
    dataloader_params_train = {'batch_size': 1,
                               'shuffle': True,
                               'drop_last': True,
                               'num_workers': 0}

    # Parameters
    dataloader_params_test = {'batch_size': 1,
                              'shuffle': False,
                              'drop_last': True,
                              'num_workers': 0}

    # Generators
    training_set = Dataset(train_dt)
    training_generator = data.DataLoader(training_set, **dataloader_params_train)

    test_set = Dataset(test_dt)
    test_generator = data.DataLoader(test_set, **dataloader_params_test)

    # Saving:
    folder_name = (str(config["num_forward"]) + '_forward_usng_'
                   + str(config["timesteps"]) + '_timesteps_'
                   + str(config["hidden_dim"]) + '_hiddenDim_'
                   + str(config["num_layers"]) + '_layers_'
                   + str(config["lr"]) + "_LR")
    new_folder = create_folder(config["path"], folder_name)

    # Model:
    network_params = {'input_dim': 1,
                      'hidden_dim': config["hidden_dim"],
                      'batch_size': 1,
                      'output_dim': 1,
                      'dropout': config["dropout"],
                      'num_layers': config["num_layers"]}
    model = Model(**network_params)
    loss = torch.nn.MSELoss()
    optimiser = torch.optim.Adam(model.parameters(), lr=config['lr'])
    scheduler = None

    # CUDA for PyTorch
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    cudnn.benchmark = True
    if use_cuda:
        # print("We're running on GPU")
        model.cuda()

    lwst_error = 1000
    while True:
        error, model = one_epoch_training(model, loss, optimiser, scheduler, device,
                                          training_generator, test_generator,
                                          timesteps, 1)
        is_best = bool(error < lwst_error)
        print("error of the epoch: " + str(error)
              + " best error before: " + str(lwst_error))
        print("Best error currently is: " + str(min(error, lwst_error)))
        lwst_error = min(error, lwst_error)
        save_checkpoint({
            'epoch': 'tuning',
            'state_dict': model.state_dict(),
            'best_accuracy': lwst_error
        }, is_best, new_folder)
        track.log(error=-error)
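# A hedged launch sketch for train_hypopt (all values illustrative; the config
# keys match the lookups in the function body above):
#
#     tune.run(train_hypopt, config={
#         "filename": "prices.csv", "path": "./runs",
#         "timesteps": 30, "window_normalisation": True, "num_forward": 1,
#         "hidden_dim": 64, "num_layers": 2, "dropout": 0.2, "lr": 1e-3,
#     })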