def load_checkpoint(self, best=False):
    path = get_model_path(alias=self.name)
    if best:
        filepath = os.path.join(path, "best.dat")
    else:
        filepath = os.path.join(path, "last.dat")
    if os.path.exists(filepath):
        logger.info(f"Checkpoint found! Loading {filepath}")
        state = torch.load(filepath)
        self.load_state_dict(state["model_params"])
        self.optimizer.load_state_dict(state["opt_params"])
        epoch = state["epoch"]
        global_step = state["global_step"]
        best_loss = state["best_loss"]
        logger.info("Checkpoint loaded successfully.")
    else:
        logger.warning(
            f"Checkpoint not found at {filepath}. Training a new model...")
        epoch = 0
        global_step = 0
        best_loss = np.inf
    logger.info(
        f"Model at ep={epoch}, g_step={global_step}, best_loss={best_loss}"
    )
    return epoch, global_step, best_loss
def save_checkpoint(self, epoch, global_step, best_loss, is_best=False):
    # https://pytorch.org/tutorials/beginner/saving_loading_models.html
    state = {
        "model_params": self.state_dict(),
        "opt_params": self.optimizer.state_dict(),
        "epoch": epoch,
        "global_step": global_step,
        "best_loss": best_loss,
    }
    path = get_model_path(alias=self.name)
    filepath = os.path.join(path, "last.dat")
    torch.save(state, filepath)
    if is_best:
        filepath_best = os.path.join(path, "best.dat")
        shutil.copyfile(filepath, filepath_best)
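
# Usage sketch (not from the original source): how load_checkpoint and
# save_checkpoint above are typically wired together to make training
# resumable. `train_one_epoch` and `evaluate` are hypothetical helpers;
# only the two checkpoint methods come from the code above.
epoch, global_step, best_loss = model.load_checkpoint()
for epoch in range(epoch, n_epochs):
    global_step = train_one_epoch(model, global_step)  # hypothetical helper
    dev_loss = evaluate(model)                         # hypothetical helper
    is_best = dev_loss < best_loss
    best_loss = min(dev_loss, best_loss)
    model.save_checkpoint(epoch + 1, global_step, best_loss, is_best=is_best)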
with open("settings.json") as f:
    config = json.load(f)
columns_target = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
df_train, df_dev, df_test, allowed_symbols, cle = load_train_data()
net = Architecture(class_cardinality=6,
                   vocab_size=len(allowed_symbols) + 3,
                   name=project_id)
sess = start_tensorflow_session(device=str(config["device"]),
                                memory_fraction=config["memory_fraction"])
fw = get_summary_writer(sess, get_tensorboard_logs_path(), project_id, version_id)
saver = TensorFlowSaver(get_model_path(project_id, version_id), max_to_keep=10)
sess.run(tf.global_variables_initializer())

losses_train = list()
min_loss = np.inf
c = 0
for epoch in range(10000):
    batcher_train = get_batcher(df_train, batch_size)
    pbar = tqdm(batcher_train,
                unit=" btch",
                total=df_train.shape[0] // batch_size,
                ncols=75)
    for i, (id_train, batch_train, target_train) in enumerate(pbar):
        _, s, l = sess.run(
            [net.op.op, net.summaries.s_tr, net.losses.sigmoid_ce],
            # feed_dict for this call is truncated in the original snippet
        )
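
# Hedged sketch (assumption, not the project's actual implementation):
# the loop above consumes get_batcher as an iterable of (ids, inputs,
# targets) tuples over a pandas DataFrame. A minimal version consistent
# with that usage; the "id" and "comment_text" column names are assumed.
def get_batcher(df, batch_size):
    for start in range(0, df.shape[0], batch_size):
        chunk = df.iloc[start:start + batch_size]
        yield (chunk["id"].values,            # assumed id column
               chunk["comment_text"].values,  # assumed text column
               chunk[columns_target].values)  # the six binary labels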
loss_train /= c + 1
male_train /= c + 1
mape_train = total_abs_miss_train / total_target_train
log_mape_train = total_log_abs_miss_train / total_log_target_train
sw.add_scalar("train/epoch/loss", loss_train, epoch)
sw.add_scalar("train/epoch/male", male_train, epoch)
sw.add_scalar("train/epoch/mape", mape_train, epoch)
sw.add_scalar("train/epoch/log_mape", log_mape_train, epoch)
logging.info(
    f"EPOCH: {epoch:06d} | Epoch finished. Train Loss = {loss_train} – "
    f"MALE = {male_train} – MAPE = {mape_train} – MAPLE = {log_mape_train} – "
    f"Global steps: {global_step}")
wandb.log({
    "loss_dev": loss_dev,
    "male_dev": male_dev,
    "mape_dev": mape_dev,
    "log_mape_dev": log_mape_dev,
    "loss_train": loss_train,
    "male_train": male_train,
    "mape_train": mape_train,
    "log_mape_train": log_mape_train,
    "epoch": epoch,
})
if epoch % 10 == 0:
    # Save to wandb
    path = get_model_path(alias=alias)
    wandb.save(os.path.join(path, "*"))
    wandb.save(os.path.join(get_log_config_filepath(), "*"))
    wandb.save(os.path.join(get_tensorboard_path(), "*"))
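
# Hedged sketch (assumption): per-batch contributions to the epoch totals
# averaged above. The exact metric definitions are not shown in the
# original snippet; MALE is read here as mean absolute log error and MAPE
# as total absolute miss over total target.
def accumulate_totals(y_hat, y):
    abs_miss = np.abs(y_hat - y)
    log_abs_miss = np.abs(np.log1p(y_hat) - np.log1p(y))
    return (abs_miss.sum(),              # -> total_abs_miss_train
            np.abs(y).sum(),             # -> total_target_train
            log_abs_miss.sum(),          # -> total_log_abs_miss_train
            np.abs(np.log1p(y)).sum())   # -> total_log_target_train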
def test_get_model_path(self):
    path = get_model_path(self.version_id)
    self.assertTrue(os.path.exists(path))
    shutil.rmtree(path)
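
# Hedged sketch (assumption): a get_model_path consistent with the test
# above, which expects the directory to exist right after the call. The
# base directory and the keyword names (alias, model_alias, version_id)
# vary across the snippets in this section, so this is illustrative only.
def get_model_path(alias=None, version_id=None):
    base = os.environ.get("MODELS_DIR", "models")  # assumed base directory
    path = os.path.join(base, *(p for p in (alias, version_id) if p))
    os.makedirs(path, exist_ok=True)  # create on first use
    return path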
                     verbose=True)  # closes a call truncated above in the original snippet
best_score = 0
c = 0
loss_train, accuracy_train = None, None
for epoch in range(n_epochs):
    # Evaluate model
    loss_val, accuracy_val = evaluate_model(model=model,
                                            data_feeder=data_feeder_validation,
                                            run_in_gpu=run_in_gpu)
    sw.add_scalar("validation/loss", loss_val, c)
    sw.add_scalar("validation/accuracy", accuracy_val, c)
    print(f"[{epoch + 1}] Loss train: {loss_train} | Acc train: {accuracy_train} | "
          f"Loss val: {loss_val} | Acc val: {accuracy_val}")
    lr_scheduler.step(accuracy_val)

    # Save model
    if accuracy_val > best_score:
        torch.save(model.state_dict(),
                   os.path.join(get_model_path(model_alias=alias),
                                f"checkpoint_{random_seed}.pth"))
        best_score = accuracy_val

    # Train model
    loss_train, accuracy_train = 0, 0
    # Shuffle the whole data matrix to get rid of incremental gradient [Bengio 2012]
    data_feeder_train.shuffle_data()
    for n, (batch_audio_train, batch_target_train) in enumerate(data_feeder_train.get_batches()):
        if run_in_gpu:
            batch_audio_train = batch_audio_train.cuda()
            batch_target_train = batch_target_train.cuda()
        loss, y_hat = model.step(batch_audio_train, batch_target_train)
        loss_train += loss.detach().cpu().numpy()
        accuracy = (batch_target_train.cpu().numpy() ==
                    y_hat.argmax(dim=1).cpu().numpy()).mean()
        accuracy_train += accuracy
        # Log to TensorboardX
        sw.add_scalar("train/loss", loss, c)
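
# Hedged sketch (assumption): a model.step consistent with its use above,
# performing one optimization step and returning (loss, logits). The
# `criterion` and `optimizer` attributes are assumed names.
def step(self, x, y):
    self.optimizer.zero_grad()
    y_hat = self(x)                  # forward pass -> logits
    loss = self.criterion(y_hat, y)  # e.g. nn.CrossEntropyLoss
    loss.backward()
    self.optimizer.step()
    return loss, y_hat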