Example #1
    def load_checkpoint(self, best=False):
        """Restore model/optimizer state from the last (or best) checkpoint."""
        path = get_model_path(alias=self.name)
        filepath = os.path.join(path, "best.dat" if best else "last.dat")
        if os.path.exists(filepath):
            logger.info(f"Checkpoint found! Loading {filepath}")
            state = torch.load(filepath)
            self.load_state_dict(state["model_params"])
            self.optimizer.load_state_dict(state["opt_params"])
            epoch = state["epoch"]
            global_step = state["global_step"]
            best_loss = state["best_loss"]
            logger.info("Checkpoint loaded successfully.")
        else:
            logger.warning(
                f"Checkpoint not found at {filepath}. Training a new model...")
            epoch = 0
            global_step = 0
            best_loss = np.inf
        logger.info(
            f"Model at ep={epoch}, g_step={global_step}, best_loss={best_loss}"
        )
        return epoch, global_step, best_loss
Example #2
    def save_checkpoint(self, epoch, global_step, best_loss, is_best=False):
        """Persist model/optimizer state to last.dat; copy to best.dat when is_best."""
        # https://pytorch.org/tutorials/beginner/saving_loading_models.html
        state = {
            "model_params": self.state_dict(),
            "opt_params": self.optimizer.state_dict(),
            "epoch": epoch,
            "global_step": global_step,
            "best_loss": best_loss,
        }
        path = get_model_path(alias=self.name)
        filepath = os.path.join(path, "last.dat")
        torch.save(state, filepath)
        if is_best:
            filepath_best = os.path.join(path, "best.dat")
            shutil.copyfile(filepath, filepath_best)
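Read together, Examples 1 and 2 form a resume-from-checkpoint pair: save_checkpoint always writes last.dat and copies it to best.dat on improvement, while load_checkpoint restores whichever one is requested. The sketch below shows how a training driver might wire them up; MyModel and train_one_epoch are hypothetical placeholders, not part of the original examples.

# Sketch of a resume-capable training loop using the two methods above.
# MyModel and train_one_epoch are hypothetical stand-ins for project code.
model = MyModel(name="demo")
start_epoch, global_step, best_loss = model.load_checkpoint()

for epoch in range(start_epoch, 100):
    loss, global_step = train_one_epoch(model, global_step)  # hypothetical helper
    is_best = loss < best_loss
    best_loss = min(best_loss, loss)
    model.save_checkpoint(epoch, global_step, best_loss, is_best=is_best)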
Example #3
with open("settings.json") as f:
    config = json.load(f)
batch_size = config["batch_size"]  # assumed key; the loop below uses batch_size but never defines it

columns_target = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
df_train, df_dev, df_test, allowed_symbols, cle = load_train_data()

net = Architecture(class_cardinality=6,
                   vocab_size=len(allowed_symbols) + 3,
                   name=project_id)

sess = start_tensorflow_session(device=str(config["device"]),
                                memory_fraction=config["memory_fraction"])
fw = get_summary_writer(sess, get_tensorboard_logs_path(), project_id,
                        version_id)
saver = TensorFlowSaver(get_model_path(project_id, version_id), max_to_keep=10)
sess.run(tf.global_variables_initializer())

losses_train = []
min_loss = np.inf
c = 0

for epoch in range(10000):
    batcher_train = get_batcher(df_train, batch_size)
    pbar = tqdm(batcher_train,
                unit=" btch",
                total=df_train.shape[0] // batch_size,
                ncols=75)
    for i, (id_train, batch_train, target_train) in enumerate(pbar):
        _, s, l = sess.run(
            [net.op.op, net.summaries.s_tr, net.losses.sigmoid_ce],
Example #4
        loss_train /= c + 1
        male_train /= c + 1
        mape_train = total_abs_miss_train / total_target_train
        log_mape_train = total_log_abs_miss_train / total_log_target_train
        sw.add_scalar("train/epoch/loss", loss_train, epoch)
        sw.add_scalar("train/epoch/male", male_train, epoch)
        sw.add_scalar("train/epoch/mape", mape_train, epoch)
        sw.add_scalar("train/epoch/log_mape", log_mape_train, epoch)

        logging.info(
            f"EPOCH: {epoch:06d} | Epoch finished. Train Loss = {loss_train} | "
            f"MALE = {male_train} | MAPE = {mape_train} | Log-MAPE = {log_mape_train} | "
            f"Global steps: {global_step}")
        wandb.log({
            "loss_dev": loss_dev,
            "male_dev": male_dev,
            "mape_dev": mape_dev,
            "log_mape_dev": log_mape_dev,
            "loss_train": loss_train,
            "male_train": male_train,
            "mape_train": mape_train,
            "log_mape_train": log_mape_train,
            "epoch": epoch,
        })

        if epoch % 10 == 0:  # Save to wandb
            path = get_model_path(alias=alias)
            wandb.save(os.path.join(path, "*"))
            wandb.save(os.path.join(get_log_config_filepath(), "*"))
            wandb.save(os.path.join(get_tensorboard_path(), "*"))
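The running totals that Example 4 divides at epoch end are not shown being accumulated. As a rough sketch of what such an accumulation could look like, assuming MALE is a mean absolute log error, the MAPE/log-MAPE ratios are aggregate miss-over-target totals, and targets are nonnegative (the function and dictionary keys below are illustrative, not from the source):

import numpy as np

def accumulate(y_pred, y_true, t):
    # Illustrative per-batch accumulation matching the epoch-end ratios
    # total_abs_miss / total_target and total_log_abs_miss / total_log_target.
    t["abs_miss"] += np.abs(y_pred - y_true).sum()
    t["target"] += np.abs(y_true).sum()
    t["log_abs_miss"] += np.abs(np.log1p(y_pred) - np.log1p(y_true)).sum()
    t["log_target"] += np.log1p(y_true).sum()
    return t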
Example #5
    def test_get_model_path(self):
        # get_model_path is expected to create the directory if it is missing.
        path = get_model_path(self.version_id)
        self.assertTrue(os.path.exists(path))
        shutil.rmtree(path)  # clean up the directory created by the call
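The test in Example 5 only passes if get_model_path creates the directory it returns. A minimal implementation consistent with that contract might look like this (a sketch; the real helper's base directory and the alias=/model_alias= keyword variants seen in the other examples are project-specific):

import os

def get_model_path(version_id, base_dir="models"):
    # Resolve a per-model directory and create it on first use,
    # which is exactly what the test above asserts.
    path = os.path.join(base_dir, str(version_id))
    os.makedirs(path, exist_ok=True)
    return path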
Example #6
    # Assumed reconstruction of the truncated statement that opened this
    # excerpt: mode="max" matches lr_scheduler.step(accuracy_val) below, and
    # "optimizer" is an assumed name not shown in the original.
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                              mode="max",
                                                              verbose=True)
    best_score = 0
    c = 0
    loss_train, accuracy_train = None, None
    for epoch in range(n_epochs):
        # Evaluate model
        loss_val, accuracy_val = evaluate_model(model=model,
                                                data_feeder=data_feeder_validation,
                                                run_in_gpu=run_in_gpu)
        sw.add_scalar("validation/loss", loss_val, c)
        sw.add_scalar("validation/accuracy", accuracy_val, c)

        print(f"[{epoch + 1}] Loss train: {loss_train} | Acc train: {accuracy_train} | Loss val: {loss_val} | Acc val: {accuracy_val}")
        lr_scheduler.step(accuracy_val)

        # Save model
        if accuracy_val > best_score:
            torch.save(
                model.state_dict(),
                os.path.join(get_model_path(model_alias=alias),
                             f"checkpoint_{random_seed}.pth"))
            best_score = accuracy_val

        # Train model
        loss_train, accuracy_train = 0, 0
        # Shuffle the whole data matrix each epoch to avoid incremental-gradient
        # effects from a fixed sample order [Bengio 2012].
        data_feeder_train.shuffle_data()
        for n, (batch_audio_train, batch_target_train) in enumerate(
                data_feeder_train.get_batches()):
            if run_in_gpu:
                batch_audio_train = batch_audio_train.cuda()
                batch_target_train = batch_target_train.cuda()
            loss, y_hat = model.step(batch_audio_train, batch_target_train)
            loss_train += loss.detach().cpu().numpy()
            accuracy = (batch_target_train.cpu().numpy() == y_hat.argmax(dim=1).cpu().numpy()).mean()
            accuracy_train += accuracy
            # Load it in TensorboardX
            sw.add_scalar("train/loss", loss, c)