def main(test_file, pretrained_file, batch_size=32):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    checkpoint = torch.load(pretrained_file, map_location=device)

    # Retrieve the model's hyperparameters from the shapes of the tensors
    # in the checkpoint, so they don't need to be passed as arguments and
    # are guaranteed to match the values used during training.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]["_word_embedding.weight"].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)
    model.load_state_dict(checkpoint["model"])

    print(20 * "=",
          " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy = test(model, test_loader)
    print("-> Average batch processing time: {:.4f}s, total test time: "
          "{:.4f}s, accuracy: {:.4f}%".format(batch_time,
                                              total_time,
                                              (accuracy*100)))
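# A minimal command-line wrapper for the entry point above (a sketch, not
# part of the original script: the argument names and help strings are
# assumptions).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Test the ESIM model on some preprocessed NLI dataset")
    parser.add_argument("test_file",
                        help="Path to a file with preprocessed test data")
    parser.add_argument("checkpoint",
                        help="Path to a checkpoint from 'train_model'")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size to use during testing")
    args = parser.parse_args()

    main(args.test_file, args.checkpoint, args.batch_size)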
def main(train_file,
         valid_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=3,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None):
    """
    Train the ESIM model on the SNLI dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 3.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        max_grad_norm: The maximum norm to which gradients are clipped
            during training. Defaults to 10.0.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        train_data = NLIDataset(pickle.load(pkl))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = NLIDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False,
                              batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
            .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)

    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]

        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))

        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model,
                                             valid_loader,
                                             criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy*100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=",
          "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs+1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model,
                                                       train_loader,
                                                       optimizer,
                                                       criterion,
                                                       epoch,
                                                       max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model,
                                                          valid_loader,
                                                          criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save({"epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))

        # Save the model at each epoch.
        torch.save({"epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                   os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    # Plotting of the loss curves for the train and validation sets.
    plt.figure()
    plt.plot(epochs_count, train_losses, "-r")
    plt.plot(epochs_count, valid_losses, "-b")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend(["Training loss", "Validation loss"])
    plt.title("Cross entropy loss")
    plt.show()
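# Example usage of this entry point (a sketch; the paths below are
# hypothetical and depend on where your preprocessed SNLI pickles and
# checkpoints actually live):
#
#   main(train_file="../data/preprocessed/SNLI/train_data.pkl",
#        valid_file="../data/preprocessed/SNLI/dev_data.pkl",
#        embeddings_file="../data/preprocessed/SNLI/embeddings.pkl",
#        target_dir="../checkpoints/SNLI",
#        checkpoint="../checkpoints/SNLI/esim_10.pth.tar")
#
# Passing 'checkpoint' resumes from epoch 11 with the optimizer state and
# loss history restored; leaving it as None trains from scratch.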
def main(test_files, pretrained_file, labeldict, output_dir, batch_size=32):
    """
    Test the ESIM model with pretrained weights on the MultiNLI dataset.

    Args:
        test_files: The paths to the preprocessed matched and mismatched
            MNLI test sets. Prediction on the mismatched set is currently
            commented out below.
        pretrained_file: The path to a checkpoint produced by the
            'train_mnli' script.
        labeldict: A dictionary associating labels (classes) to integer
            values.
        output_dir: The path to a directory where the predictions of the
            model must be saved.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    output_dir = os.path.normpath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    checkpoint = torch.load(pretrained_file, map_location="cpu")

    # Retrieve model parameters from the checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]["_word_embedding.weight"].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    with open(os.path.normpath(test_files["matched"]), "rb") as pkl:
        matched_test_data = NLIDataset(pickle.load(pkl))
    # with open(os.path.normpath(test_files["mismatched"]), "rb") as pkl:
    #     mismatched_test_data = NLIDataset(pickle.load(pkl))

    matched_test_loader = DataLoader(matched_test_data,
                                     shuffle=False,
                                     batch_size=batch_size)
    # mismatched_test_loader = DataLoader(mismatched_test_data,
    #                                     shuffle=False,
    #                                     batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)
    model.load_state_dict(checkpoint["model"])

    print(20 * "=",
          " Prediction on MNLI with ESIM model on device: {} ".format(device),
          20 * "=")

    print("\t* Prediction for matched test set...")
    predictions = predict(model, matched_test_loader, labeldict)

    with open(os.path.join(output_dir, "matched_predictions.csv"),
              "w") as output_f:
        output_f.write("pairID,gold_label\n")
        for pair_id in predictions:
            output_f.write(pair_id + "," + predictions[pair_id] + "\n")
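# The hyperparameters passed to ESIM above are recovered purely from tensor
# shapes in the saved state_dict, so the prediction script needs no
# architecture flags. A small standalone sketch of the same trick (the
# helper name is ours, not part of the original code):
def hyperparams_from_state_dict(state_dict):
    """Recover ESIM hyperparameters from the shapes of checkpoint tensors."""
    # The embedding matrix is (vocab_size x embedding_dim).
    vocab_size, embedding_dim = state_dict["_word_embedding.weight"].shape
    # The first projection layer outputs 'hidden_size' units, and the last
    # classification layer outputs one logit per class.
    hidden_size = state_dict["_projection.0.weight"].size(0)
    num_classes = state_dict["_classification.4.weight"].size(0)
    return vocab_size, embedding_dim, hidden_size, num_classes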
def main(train_file,
         valid_file,
         test_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=2,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None,
         proportion=1,
         output=None):
    """
    Train the ESIM model on the SNLI dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        test_file: A path to some preprocessed data that must be used
            to test the model after each epoch.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 2.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        max_grad_norm: The maximum norm to which gradients are clipped
            during training. Defaults to 10.0.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
        proportion: The proportion of the training data to actually use
            for training. Defaults to 1 (the whole set).
        output: A prefix for the name of the checkpoint saved during
            training. Defaults to None.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        # The training data is shuffled first, then a random subset of the
        # requested proportion is kept.
        train_data = NLIDataset(pickle.load(pkl), proportion, isRandom=True)

    train_loader = DataLoader(train_data, shuffle=False,
                              batch_size=batch_size)

    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = NLIDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False,
                              batch_size=batch_size)

    print("\t* Loading test data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
            .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)

    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]

        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))

        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model,
                                             valid_loader,
                                             criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy*100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=",
          "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs+1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model,
                                                       train_loader,
                                                       optimizer,
                                                       criterion,
                                                       epoch,
                                                       max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model,
                                                          valid_loader,
                                                          criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        print("* Testing for epoch {}:".format(epoch))
        batch_time, total_time, f1, accuracy = test(model,
                                                    num_classes,
                                                    test_loader)
        print("-> Average batch processing time: {:.4f}s, total test time: "
              "{:.4f}s, f1: {:.4f}, accuracy: {:.4f}%"
              .format(batch_time, total_time, f1, (accuracy * 100)))
        print(20 * "====")

        # Early stopping on validation accuracy, ignoring the first
        # two epochs.
        if epoch > 2:
            if epoch_accuracy <= best_score:
                patience_counter += 1
            else:
                best_score = epoch_accuracy
                patience_counter = 0
                # Save the best model. The optimizer is not saved to avoid
                # having a checkpoint file that is too heavy to be shared.
                # To resume training from the best model, use the
                # 'esim_*.pth.tar' checkpoints instead.
                torch.save({"epoch": epoch,
                            "model": model.state_dict(),
                            "best_score": best_score,
                            "epochs_count": epochs_count,
                            "train_losses": train_losses,
                            "valid_losses": valid_losses},
                           os.path.join(target_dir,
                                        output + "_" + str(proportion)
                                        + "_best.pth.tar"))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            checkpoint = torch.load(os.path.join(
                target_dir, output + "_" + str(proportion) + "_best.pth.tar"))

            # Retrieving model parameters from checkpoint.
            vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
            embedding_dim = checkpoint["model"]["_word_embedding.weight"]\
                .size(1)
            hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
            num_classes = checkpoint["model"]["_classification.4.weight"]\
                .size(0)

            print("\t* Final test...")
            model = ESIM(vocab_size,
                         embedding_dim,
                         hidden_size,
                         num_classes=num_classes,
                         device=device).to(device)
            model.load_state_dict(checkpoint["model"])
            batch_time, total_time, f1, accuracy = test(model,
                                                        num_classes,
                                                        test_loader,
                                                        print_Confusion=True)
            print("-> Final f1, accuracy: {:.4f}, {:.4f}%"
                  .format(f1, accuracy * 100))
            os.remove(os.path.join(
                target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
            break

        if epoch == 15:
            checkpoint = torch.load(os.path.join(
                target_dir, output + "_" + str(proportion) + "_best.pth.tar"))

            # Retrieving model parameters from checkpoint.
vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0) embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1) hidden_size = checkpoint["model"]["_projection.0.weight"].size(0) num_classes = checkpoint["model"]["_classification.4.weight"].size(0) print("\t* Final test...") model = ESIM(vocab_size, embedding_dim, hidden_size, num_classes=num_classes, device=device).to(device) model.load_state_dict(checkpoint["model"]) batch_time, total_time, f1, accuracy = test(model, num_classes, test_loader, print_Confusion=True) print("-> Final f1, accuracy: {:.4f}, {:.4f}%".format(f1, accuracy * 100)) os.remove(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
def main(test_files, pretrained_file, labeldict, output_dir, batch_size=32):
    """
    Test the ESIM model with pretrained weights on the MultiNLI dataset.

    Args:
        test_files: A dictionary with the paths to the preprocessed test
            data (key 'test') and to the target-language word embeddings
            (key 'embeddings').
        pretrained_file: The path to a checkpoint produced by the
            'train_mnli' script.
        labeldict: A dictionary associating labels (classes) to integer
            values.
        output_dir: The path to a directory where the predictions of the
            model must be saved.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    output_dir = os.path.normpath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    checkpoint = torch.load(pretrained_file, map_location=device)

    # Retrieve model parameters from the checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]["_word_embedding.weight"].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    with open(os.path.normpath(test_files["test"]), "rb") as pkl:
        # Keep only the French ('fr') portion of the test data.
        data = pickle.load(pkl)["fr"]
        test_data = NLIDataset(data)

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)
    model.load_state_dict(checkpoint["model"])

    with open(os.path.normpath(test_files["embeddings"]), "rb") as pkl:
        embeddings = pickle.load(pkl)
    tgt_embeddings = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
    tgt_embeddings.weight = nn.Parameter(torch.tensor(embeddings,
                                                      dtype=torch.float))
    # Replace the model's embeddings with the cross-lingual embeddings of
    # the target language.
    model._word_embedding = tgt_embeddings.to(device)

    print(20 * "=",
          " Prediction on MNLI with ESIM model on device: {} ".format(device),
          20 * "=")

    print("\t* Prediction for test set...")
    predictions = predict(model, test_loader, labeldict)

    with open(os.path.join(output_dir, "predictions.csv"), "w") as output_f:
        output_f.write("pairID,gold_label\n")
        for pair_id in predictions:
            output_f.write(pair_id + "," + predictions[pair_id] + "\n")
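# The embedding swap above is the heart of the cross-lingual transfer: the
# trained encoder and classifier are kept, and only the input lookup table
# is replaced by target-language vectors aligned to the source embedding
# space. A standalone sketch of the pattern (the function name and the
# 'tgt_vectors' argument are ours, not part of the original code):
def swap_word_embeddings(model, tgt_vectors, device):
    """Replace a model's word-embedding layer with target-language vectors."""
    # Build a fresh embedding layer with the target language's vocabulary
    # size and load the pre-aligned vectors into it.
    tgt_embeddings = nn.Embedding(tgt_vectors.shape[0], tgt_vectors.shape[1])
    tgt_embeddings.weight = nn.Parameter(torch.tensor(tgt_vectors,
                                                      dtype=torch.float))
    model._word_embedding = tgt_embeddings.to(device)
    return model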