Beispiel #1
0
def main(test_file, pretrained_file, batch_size=32):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        vocab_size: The number of words in the vocabulary of the model
            being tested.
        embedding_dim: The size of the embeddings in the model.
        hidden_size: The size of the hidden layers in the model. Must match
            the size used during training. Defaults to 300.
        num_classes: The number of classes in the output of the model. Must
            match the value used during training. Defaults to 3.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    checkpoint = torch.load(pretrained_file)

    # Retrieving model parameters from checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)

    model.load_state_dict(checkpoint["model"])

    print(20 * "=",
          " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy = test(model, test_loader)

    print("-> Average batch processing time: {:.4f}s, total test time:\
 {:.4f}s, accuracy: {:.4f}%".format(batch_time, total_time, (accuracy*100)))
Beispiel #2
0
def main(train_file,
         valid_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=3,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None):
    """
    Train the ESIM model on the SNLI dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 3.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, 'rb') as pkl:
        train_data = NLIDataset(pickle.load(pkl))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    print("\t* Loading validation data...")
    with open(valid_file, 'rb') as pkl:
        valid_data = NLIDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print('\t* Building model...')
    with open(embeddings_file, 'rb') as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
                     .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        best_score = checkpoint['best_score']

        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))

        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epochs_count = checkpoint['epochs_count']
        train_losses = checkpoint['train_losses']
        valid_losses = checkpoint['valid_losses']

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model,
                                             valid_loader,
                                             criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy*100)))

    # -------------------- Training epochs ------------------- #
    print("\n",
          20 * "=",
          "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs+1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model,
                                                       train_loader,
                                                       optimizer,
                                                       criterion,
                                                       epoch,
                                                       max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model,
                                                          valid_loader,
                                                          criterion)

        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save({'epoch': epoch,
                        'model': model.state_dict(),
                        'best_score': best_score,
                        'epochs_count': epochs_count,
                        'train_losses': train_losses,
                        'valid_losses': valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))

        # Save the model at each epoch.
        torch.save({'epoch': epoch,
                    'model': model.state_dict(),
                    'best_score': best_score,
                    'optimizer': optimizer.state_dict(),
                    'epochs_count': epochs_count,
                    'train_losses': train_losses,
                    'valid_losses': valid_losses},
                   os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    # Plotting of the loss curves for the train and validation sets.
    plt.figure()
    plt.plot(epochs_count, train_losses, '-r')
    plt.plot(epochs_count, valid_losses, '-b')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['Training loss', 'Validation loss'])
    plt.title('Cross entropy loss')
    plt.show()
Beispiel #3
0
def main(test_files, pretrained_file, labeldict, output_dir, batch_size=32):
    """
    Test the ESIM model with pretrained weights on the MultiNLI dataset.

    Args:
        test_files: The paths to the preprocessed matched and mismatched MNLI
            test sets.
        pretrained_file: The path to a checkpoint produced by the
            'train_mnli' script.
        labeldict: A dictionary associating labels (classes) to integer values.
        output_dir: The path to a directory where the predictions of the model
            must be saved.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    output_dir = os.path.normpath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    checkpoint = torch.load(pretrained_file, map_location='cpu')

    # Retrieve model parameters from the checkpoint.
    vocab_size = checkpoint['model']['_word_embedding.weight'].size(0)
    embedding_dim = checkpoint['model']['_word_embedding.weight'].size(1)
    hidden_size = checkpoint['model']['_projection.0.weight'].size(0)
    num_classes = checkpoint['model']['_classification.4.weight'].size(0)

    print("\t* Loading test data...")
    with open(os.path.normpath(test_files["matched"]), 'rb') as pkl:
        matched_test_data = NLIDataset(pickle.load(pkl))
    # with open(os.path.normpath(test_files["mismatched"]), 'rb') as pkl:
    #     mismatched_test_data = NLIDataset(pickle.load(pkl))

    matched_test_loader = DataLoader(matched_test_data,
                                     shuffle=False,
                                     batch_size=batch_size)
    # mismatched_test_loader = DataLoader(mismatched_test_data,
    #                                     shuffle=False,
    #                                     batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)

    model.load_state_dict(checkpoint['model'])

    print(20 * "=",
          " Prediction on MNLI with ESIM model on device: {} ".format(device),
          20 * "=")

    print("\t* Prediction for matched test set...")
    predictions = predict(model, matched_test_loader, labeldict)

    with open(os.path.join(output_dir, "matched_predictions.csv"), 'w') as output_f:
        output_f.write("pairID,gold_label\n")
        for pair_id in predictions:
            output_f.write(pair_id+","+predictions[pair_id]+"\n")
Beispiel #4
0
def main(train_file,
         valid_file,
         test_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=2,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None,
         proportion=1,
         output=None):
    """
    Train the ESIM model on the SNLI dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 3.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        train_data = NLIDataset(pickle.load(pkl), proportion, isRandom=True)#training data will be shuffled first, then we will get random data of different proportion

    train_loader = DataLoader(train_data, shuffle=False, batch_size=batch_size)


    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = NLIDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

    print("\t* Loading test data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
                     .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]

        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))

        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.

    _, valid_loss, valid_accuracy = validate(model,
                                             valid_loader,
                                             criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy*100)))

    # -------------------- Training epochs ------------------- #
    print("\n",
          20 * "=",
          "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs+1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model,
                                                       train_loader,
                                                       optimizer,
                                                       criterion,
                                                       epoch,
                                                       max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))


        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model,
                                                          valid_loader,
                                                          criterion)

        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        print("* Testing for epoch {}:".format(epoch))
        batch_time, total_time, f1, accuracy = test(model, num_classes, test_loader)
        print(
            "-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, f1: {:.4f}, accuracy: {:.4f}%".format(
                batch_time, total_time, f1, (accuracy * 100)))
        print(20 * "====")


        # Early stopping on validation accuracy.
        if epoch > 2:
            if epoch_accuracy <= best_score:
                patience_counter += 1
            else:
                best_score = epoch_accuracy
                patience_counter = 0
                # Save the best model. The optimizer is not saved to avoid having
                # a checkpoint file that is too heavy to be shared. To resume
                # training from the best model, use the 'esim_*.pth.tar'
                # checkpoints instead.
                torch.save({"epoch": epoch,
                            "model": model.state_dict(),
                            "best_score": best_score,
                            "epochs_count": epochs_count,
                            "train_losses": train_losses,
                            "valid_losses": valid_losses},
                           os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
            
            if patience_counter >= patience:
                print("-> Early stopping: patience limit reached, stopping...")
                checkpoint = torch.load(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
                # Retrieving model parameters from checkpoint.
                vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
                embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
                hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
                num_classes = checkpoint["model"]["_classification.4.weight"].size(0)
                print("\t* Final test...")
                model = ESIM(vocab_size,
                             embedding_dim,
                             hidden_size,
                             num_classes=num_classes,
                             device=device).to(device)
                model.load_state_dict(checkpoint["model"])
                batch_time, total_time, f1, accuracy = test(model, num_classes, test_loader, print_Confusion=True)
                print("-> Final f1, accuracy: {:.4f}, {:.4f}%".format(f1, accuracy * 100))
                os.remove(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
                break
        if epoch == 15:
            checkpoint = torch.load(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
            # Retrieving model parameters from checkpoint.
            vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
            embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
            hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
            num_classes = checkpoint["model"]["_classification.4.weight"].size(0)
            print("\t* Final test...")
            model = ESIM(vocab_size,
                         embedding_dim,
                         hidden_size,
                         num_classes=num_classes,
                         device=device).to(device)
            model.load_state_dict(checkpoint["model"])
            batch_time, total_time, f1, accuracy = test(model, num_classes, test_loader, print_Confusion=True)
            print("-> Final f1, accuracy: {:.4f}, {:.4f}%".format(f1, accuracy * 100))
            os.remove(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
Beispiel #5
0
def main(test_files, pretrained_file, labeldict, output_dir, batch_size=32):
    """
    Test the ESIM model with pretrained weights on the MultiNLI dataset.

    Args:
        test_files: The paths to the preprocessed matched and mismatched MNLI
            test sets.
        pretrained_file: The path to a checkpoint produced by the
            'train_mnli' script.
        labeldict: A dictionary associating labels (classes) to integer values.
        output_dir: The path to a directory where the predictions of the model
            must be saved.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    output_dir = os.path.normpath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    checkpoint = torch.load(pretrained_file)

    # Retrieve model parameters from the checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]["_word_embedding.weight"].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    with open(os.path.normpath(test_files["test"]), "rb") as pkl:
        data = pickle.load(pkl)["fr"]
        test_data = NLIDataset(data)

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(
        vocab_size, embedding_dim, hidden_size, num_classes=num_classes, device=device
    ).to(device)

    model.load_state_dict(checkpoint["model"])

    with open(os.path.normpath(test_files["embeddings"]), "rb") as pkl:
        embeddings = pickle.load(pkl)
        tgt_embeddings = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        tgt_embeddings.weight = nn.Parameter(torch.tensor(embeddings))

    # replace model embeddings with xling embeddings from target language
    model._word_embedding = tgt_embeddings.to(device)

    print(
        20 * "=",
        " Prediction on MNLI with ESIM model on device: {} ".format(device),
        20 * "=",
    )

    set_trace()

    print("\t* Prediction for test set...")
    predictions = predict(model, test_loader, labeldict)

    with open(os.path.join(output_dir, "predictions.csv"), "w") as output_f:
        output_f.write("pairID,gold_label\n")
        for pair_id in predictions:
            output_f.write(pair_id + "," + predictions[pair_id] + "\n")