Example #1
def main(test_file, pretrained_file, batch_size=32):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    checkpoint = torch.load(pretrained_file, map_location=device)

    # Retrieving model parameters from checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)

    model.load_state_dict(checkpoint["model"])

    print(20 * "=",
          " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy = test(model, test_loader)

    print("-> Average batch processing time: {:.4f}s, total test time:\
 {:.4f}s, accuracy: {:.4f}%".format(batch_time, total_time, (accuracy*100)))
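Note: several of these examples recover the model's hyperparameters from
tensor shapes stored in the checkpoint instead of asking the caller for them.
A minimal sketch of that pattern as a reusable helper (the state-dict keys
are the ones used by this ESIM implementation; 'infer_esim_dims' is a
hypothetical name):

def infer_esim_dims(state_dict):
    """Recover the hyperparameters an ESIM checkpoint was trained with."""
    # The embedding matrix is (vocab_size, embedding_dim).
    vocab_size, embedding_dim = state_dict["_word_embedding.weight"].shape
    # The first projection layer's output dimension is the hidden size.
    hidden_size = state_dict["_projection.0.weight"].size(0)
    # The last classification layer's output dimension is the class count.
    num_classes = state_dict["_classification.4.weight"].size(0)
    return vocab_size, embedding_dim, hidden_size, num_classes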
Example #2
def main(test_file, pretrained_file, batch_size=32):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    print("\t* Loading test data...")
    with open(test_file, 'rb') as pkl:
        test_data = NLIDataset(pickle.load(pkl),
                               max_premise_length=100,
                               max_hypothesis_length=12)
    print('\t* Number of examples: %d' % len(test_data))
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    # Infer the vocabulary size and embedding dimension from the checkpoint;
    # num_classes and hidden_size are hard-coded to match the training run.
    checkpoint = torch.load(pretrained_file)
    vocab_size, embedding_dim = checkpoint['model'][
        '_word_embedding.weight'].size()
    num_classes = 2
    hidden_size = 300

    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)
    if 'fixed_embedding' in checkpoint['model']:
        checkpoint['model'].pop('fixed_embedding')
    model.load_state_dict(checkpoint['model'])

    print(20 * "=", " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy = test(model, test_loader)

    print("-> Average batch processing time: {:.4f}s, total test time:\
 {:.4f}s, accuracy: {:.4f}%".format(batch_time, total_time, (accuracy * 100)))
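Note: popping the extra 'fixed_embedding' entry above keeps load_state_dict
from raising on an unexpected key. A blunter alternative sketch is PyTorch's
non-strict loading, which silently ignores both unexpected and missing keys,
so the explicit pop() is safer when only one known key differs:

model.load_state_dict(checkpoint['model'], strict=False)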
Example #3
def main(train_file,
         valid_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=3,
         epochs=64,
         batch_size=32,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None):
    """
    Train the ESIM model on some dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 3.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        patience: The patience to use for early stopping. Defaults to 5.
        max_grad_norm: The maximum norm to which gradients are clipped
            during training. Defaults to 10.0.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, 'rb') as pkl:
        train_data = NLIDataset(pickle.load(pkl))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    print("\t* Loading validation data...")
    with open(valid_file, 'rb') as pkl:
        valid_data = NLIDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print('\t* Building model...')
    with open(embeddings_file, 'rb') as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
                     .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)
    print(model)

    # -------------------- Preparation for training  ------------------- #
    # criterion = nn.CosineEmbeddingLoss()
    criterion = nn.CosineSimilarity(dim=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0004)
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        best_score = checkpoint['best_score']

        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))

        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epochs_count = checkpoint['epochs_count']
        train_losses = checkpoint['train_losses']
        valid_losses = checkpoint['valid_losses']

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model, valid_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(
            model, valid_loader, criterion)

        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save(
                {
                    'epoch': epoch,
                    'model': model.state_dict(),
                    'best_score': best_score,
                    'epochs_count': epochs_count,
                    'train_losses': train_losses,
                    'valid_losses': valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))

        # Save the model at each epoch.
        torch.save(
            {
                'epoch': epoch,
                'model': model.state_dict(),
                'best_score': best_score,
                'optimizer': optimizer.state_dict(),
                'epochs_count': epochs_count,
                'train_losses': train_losses,
                'valid_losses': valid_losses
            }, os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
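Note: the early-stopping logic above (reset the patience counter on
improvement, increment it otherwise, stop at the limit) recurs in the
training examples. A self-contained sketch of the same policy, with
hypothetical names:

class EarlyStopping:
    """Stop training once a monitored score stops improving."""

    def __init__(self, patience=5):
        self.patience = patience
        self.best_score = 0.0
        self.counter = 0

    def step(self, score):
        """Record a new score; return True when training should stop."""
        if score > self.best_score:
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience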
Example #4
    def sentence_retrieval(self):
        print('- sentence retrieval: initialise')
        word_dict = pickle.load(open(self.path_word_dict_stage_2, "rb"))

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        checkpoint = torch.load(self.path_stage_2_model)

        vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
        embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
        hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
        num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

        use_oov_flag = 0
        if 'oov' in self.embeddings_settings_sentence_retrieval_list:
            use_oov_flag = 1

        use_pos_tag_flag = 0
        if 'pos' in self.embeddings_settings_sentence_retrieval_list:
            use_pos_tag_flag = 1

        model = ESIM(vocab_size,
                     embedding_dim,
                     hidden_size,
                     num_classes=num_classes,
                     use_pos_tag_flag=use_pos_tag_flag,
                     use_oov_flag=use_oov_flag,
                     device=device).to(device)

        model.load_state_dict(checkpoint["model"])

        model.eval()

        print('- sentence retrieval: iterate through claims')
        for claim_nr in tqdm(range(self.nr_claims)):
            path_claim = os.path.join(self.path_document_retrieval_dir,
                                      str(claim_nr) + '.json')
            claim_dict = dict_load_json(path_claim)

            list_prob = []
            list_doc_nr = []
            list_line_nr = []

            for doc_nr in claim_dict['document_retrieval']:
                for line_nr in claim_dict['document_retrieval'][doc_nr]:
                    if 'sentence_retrieval' not in claim_dict:
                        claim_dict['sentence_retrieval'] = {}
                    if doc_nr not in claim_dict['sentence_retrieval']:
                        claim_dict['sentence_retrieval'][doc_nr] = {}
                    if line_nr not in claim_dict['sentence_retrieval'][doc_nr]:
                        claim_dict['sentence_retrieval'][doc_nr][line_nr] = {}

                    prob = compute_prob_stage_2(model, claim_dict, doc_nr,
                                                line_nr, device)
                    claim_dict['sentence_retrieval'][doc_nr][line_nr][
                        'prob'] = prob

                    list_doc_nr.append(doc_nr)
                    list_line_nr.append(line_nr)
                    list_prob.append(prob)

            sorted_list_doc_nr = sort_list(list_doc_nr, list_prob)[-5:]
            sorted_list_line_nr = sort_list(list_line_nr, list_prob)[-5:]
            sorted_list_prob = sort_list(list_prob, list_prob)[-5:]
            claim_dict['sentence_retrieval'][
                'doc_nr_list'] = sorted_list_doc_nr
            claim_dict['sentence_retrieval'][
                'line_nr_list'] = sorted_list_line_nr
            claim_dict['sentence_retrieval']['prob_list'] = sorted_list_prob

            claim_dict['predicted_evidence'] = []
            for i in range(len(sorted_list_doc_nr)):
                doc_nr = sorted_list_doc_nr[i]
                title = wiki_database.get_title_from_id(int(doc_nr))
                line_nr = int(sorted_list_line_nr[i])
                claim_dict['predicted_evidence'].append([title, line_nr])

            path_save = os.path.join(self.path_sentence_retrieval_dir,
                                     str(claim_nr) + '.json')
            self.save_dict(claim_dict, path_save)
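Note: 'sort_list' above appears to order its first argument by the values of
its second, so the three parallel calls keep the five most probable
sentences. A dependency-free sketch of the same selection, assuming that
behaviour:

# Keep the five highest-probability (prob, doc_nr, line_nr) triples.
top5 = sorted(zip(list_prob, list_doc_nr, list_line_nr),
              key=lambda triple: triple[0])[-5:]
sorted_list_prob = [p for p, _, _ in top5]
sorted_list_doc_nr = [d for _, d, _ in top5]
sorted_list_line_nr = [l for _, _, l in top5]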
Example #5
    def label_prediction(self):
        print('- label prediction: initialise')
        word_dict = pickle.load(open(self.path_word_dict_stage_3, "rb"))

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        checkpoint = torch.load(self.path_stage_3_model)

        vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
        embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
        hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
        num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

        use_oov_flag = 0
        if 'oov' in self.embeddings_settings_label_prediction_list:
            use_oov_flag = 1

        use_pos_tag_flag = 0
        if 'pos' in self.embeddings_settings_label_prediction_list:
            use_pos_tag_flag = 1

        model = ESIM(vocab_size,
                     embedding_dim,
                     hidden_size,
                     num_classes=num_classes,
                     use_pos_tag_flag=use_pos_tag_flag,
                     use_oov_flag=use_oov_flag,
                     device=device).to(device)

        model.load_state_dict(checkpoint["model"])

        model.eval()

        print('- label prediction: iterate through claims')
        for claim_nr in tqdm(range(self.nr_claims)):
            path_claim = os.path.join(self.path_sentence_retrieval_dir,
                                      str(claim_nr) + '.json')
            claim_dict = dict_load_json(path_claim)

            prob_list = []
            prob_list_supported = []
            prob_list_refuted = []
            for i in range(len(
                    claim_dict['sentence_retrieval']['doc_nr_list'])):
                doc_nr = claim_dict['sentence_retrieval']['doc_nr_list'][i]
                line_nr = claim_dict['sentence_retrieval']['line_nr_list'][i]
                if doc_nr in claim_dict['document_retrieval']:
                    if line_nr in claim_dict['document_retrieval'][doc_nr]:
                        prob = compute_prob_stage_3(model, claim_dict, doc_nr,
                                                    line_nr, device)
                        prob_list.append(prob)
                        prob_list_supported.append(prob[2])
                        prob_list_refuted.append(prob[1])
                    else:
                        print('line_nr not in list', line_nr)
                else:
                    print('doc_nr not in list', doc_nr)
            if prob_list_supported and max(prob_list_supported) > 0.5:
                claim_dict['predicted_label'] = 'SUPPORTS'
            elif prob_list_refuted and max(prob_list_refuted) > 0.5:
                claim_dict['predicted_label'] = 'REFUTES'
            else:
                claim_dict['predicted_label'] = 'NOT ENOUGH INFO'

            path_save = os.path.join(self.path_label_prediction_dir,
                                     str(claim_nr) + '.json')
            self.save_dict(claim_dict, path_save)
Example #6
def main(test_files, pretrained_file, labeldict, output_dir, batch_size=32):
    """
    Test the ESIM model with pretrained weights on the MultiNLI dataset.

    Args:
        test_files: The paths to the preprocessed matched and mismatched MNLI
            test sets.
        pretrained_file: The path to a checkpoint produced by the
            'train_mnli' script.
        labeldict: A dictionary associating labels (classes) to integer values.
        output_dir: The path to a directory where the predictions of the model
            must be saved.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    output_dir = os.path.normpath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    checkpoint = torch.load(pretrained_file, map_location='cpu')

    # Retrieve model parameters from the checkpoint.
    vocab_size = checkpoint['model']['_word_embedding.weight'].size(0)
    embedding_dim = checkpoint['model']['_word_embedding.weight'].size(1)
    hidden_size = checkpoint['model']['_projection.0.weight'].size(0)
    num_classes = checkpoint['model']['_classification.4.weight'].size(0)

    print("\t* Loading test data...")
    with open(os.path.normpath(test_files["matched"]), 'rb') as pkl:
        matched_test_data = NLIDataset(pickle.load(pkl))
    # with open(os.path.normpath(test_files["mismatched"]), 'rb') as pkl:
    #     mismatched_test_data = NLIDataset(pickle.load(pkl))

    matched_test_loader = DataLoader(matched_test_data,
                                     shuffle=False,
                                     batch_size=batch_size)
    # mismatched_test_loader = DataLoader(mismatched_test_data,
    #                                     shuffle=False,
    #                                     batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)

    model.load_state_dict(checkpoint['model'])

    print(20 * "=",
          " Prediction on MNLI with ESIM model on device: {} ".format(device),
          20 * "=")

    print("\t* Prediction for matched test set...")
    predictions = predict(model, matched_test_loader, labeldict)

    with open(os.path.join(output_dir, "matched_predictions.csv"), 'w') as output_f:
        output_f.write("pairID,gold_label\n")
        for pair_id in predictions:
            output_f.write(pair_id+","+predictions[pair_id]+"\n")
Example #7
def main(train_file,
         valid_file,
         test_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=2,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None,
         proportion=1,
         output=None):
    """
    Train the ESIM model on the SNLI dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        test_file: A path to some preprocessed data that must be used
            to test the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 2.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        max_grad_norm: The maximum norm to which gradients are clipped
            during training. Defaults to 10.0.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
        proportion: The proportion of the (shuffled) training data to use.
            Defaults to 1.
        output: A prefix for the name of the best-model checkpoint saved
            in 'target_dir'.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        train_data = NLIDataset(pickle.load(pkl), proportion, isRandom=True)#training data will be shuffled first, then we will get random data of different proportion

    train_loader = DataLoader(train_data, shuffle=False, batch_size=batch_size)


    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = NLIDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

    print("\t* Loading test data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
                     .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]

        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))

        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.

    _, valid_loss, valid_accuracy = validate(model,
                                             valid_loader,
                                             criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy*100)))

    # -------------------- Training epochs ------------------- #
    print("\n",
          20 * "=",
          "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs+1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model,
                                                       train_loader,
                                                       optimizer,
                                                       criterion,
                                                       epoch,
                                                       max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))


        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model,
                                                          valid_loader,
                                                          criterion)

        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        print("* Testing for epoch {}:".format(epoch))
        batch_time, total_time, f1, accuracy = test(model, num_classes, test_loader)
        print("-> Average batch processing time: {:.4f}s, total test time: "
              "{:.4f}s, f1: {:.4f}, accuracy: {:.4f}%".format(
                  batch_time, total_time, f1, (accuracy * 100)))
        print(20 * "====")


        # Early stopping on validation accuracy.
        if epoch > 2:
            if epoch_accuracy <= best_score:
                patience_counter += 1
            else:
                best_score = epoch_accuracy
                patience_counter = 0
                # Save the best model. The optimizer is not saved to avoid having
                # a checkpoint file that is too heavy to be shared. To resume
                # training from the best model, use the 'esim_*.pth.tar'
                # checkpoints instead.
                torch.save({"epoch": epoch,
                            "model": model.state_dict(),
                            "best_score": best_score,
                            "epochs_count": epochs_count,
                            "train_losses": train_losses,
                            "valid_losses": valid_losses},
                           os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
            
            if patience_counter >= patience:
                print("-> Early stopping: patience limit reached, stopping...")
                checkpoint = torch.load(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
                # Retrieving model parameters from checkpoint.
                vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
                embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
                hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
                num_classes = checkpoint["model"]["_classification.4.weight"].size(0)
                print("\t* Final test...")
                model = ESIM(vocab_size,
                             embedding_dim,
                             hidden_size,
                             num_classes=num_classes,
                             device=device).to(device)
                model.load_state_dict(checkpoint["model"])
                batch_time, total_time, f1, accuracy = test(model, num_classes, test_loader, print_Confusion=True)
                print("-> Final f1, accuracy: {:.4f}, {:.4f}%".format(f1, accuracy * 100))
                os.remove(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
                break
        if epoch == 15:
            checkpoint = torch.load(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
            # Retrieving model parameters from checkpoint.
            vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
            embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
            hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
            num_classes = checkpoint["model"]["_classification.4.weight"].size(0)
            print("\t* Final test...")
            model = ESIM(vocab_size,
                         embedding_dim,
                         hidden_size,
                         num_classes=num_classes,
                         device=device).to(device)
            model.load_state_dict(checkpoint["model"])
            batch_time, total_time, f1, accuracy = test(model, num_classes, test_loader, print_Confusion=True)
            print("-> Final f1, accuracy: {:.4f}, {:.4f}%".format(f1, accuracy * 100))
            os.remove(os.path.join(target_dir, output + "_" + str(proportion) + "_best.pth.tar"))
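Note: the final-test block above is duplicated, once on early stopping and
once at epoch 15. A sketch of the same steps factored into a helper
('final_test' is a hypothetical name; 'test' and 'ESIM' are the ones used in
the example):

def final_test(checkpoint_path, test_loader, device):
    """Reload the best checkpoint, evaluate on the test set, delete the file."""
    checkpoint = torch.load(checkpoint_path)
    state = checkpoint["model"]
    num_classes = state["_classification.4.weight"].size(0)
    model = ESIM(state["_word_embedding.weight"].size(0),
                 state["_word_embedding.weight"].size(1),
                 state["_projection.0.weight"].size(0),
                 num_classes=num_classes,
                 device=device).to(device)
    model.load_state_dict(state)
    batch_time, total_time, f1, accuracy = test(model, num_classes,
                                                test_loader,
                                                print_Confusion=True)
    print("-> Final f1, accuracy: {:.4f}, {:.4f}%".format(f1, accuracy * 100))
    os.remove(checkpoint_path)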
Example #8
    def build_model(self):
        print("Start creating model ...")
        # Derive the vocabulary size from the pretrained embedding matrix.
        pretrained_embeddings = np.load(self.args.pretrained_path)
        setattr(self.args, "vocab_size", pretrained_embeddings.shape[0])

        if self.args.model == "esim":
            from esim.model import ESIM
            self.model = ESIM(vocab_size=self.args.vocab_size,
                              ebd_dim=self.args.embedding_dim,
                              hidden_dim=self.args.hidden_dim,
                              device=self.device,
                              pretrained_embeddings=pretrained_embeddings,
                              padding_idx=0,
                              dropout=self.args.dropout,
                              num_classes=self.args.num_classes,
                              freeze_embeddings=self.args.freeze_embeddings)
        elif self.args.model == "re2":
            from re2.model import RE2
            self.model = RE2(vocab_size=self.args.vocab_size,
                             embedding_dim=self.args.embedding_dim,
                             hidden_dim=self.args.hidden_dim,
                             device=self.device,
                             num_layers=self.args.num_layers,
                             kernel_sizes=self.args.kernel_sizes,
                             num_blocks=self.args.num_blocks,
                             num_feat_type=4,
                             num_classes=self.args.num_classes,
                             dropout=self.args.dropout,
                             pretrained_embeddings=pretrained_embeddings,
                             freeze_embeddings=self.args.freeze_embeddings)
        elif self.args.model == "cafe":
            from cafe.model import CAFE
            self.model = CAFE(
                vocab_size=self.args.vocab_size,
                embedding_dim=self.args.embedding_dim,
                hidden_dim=self.args.hidden_dim,
                k_factor=self.args.k_factor,
                enhance_mode=self.args.enhance_mode,
                pool_mode=self.args.pool_mode,
                device=self.device,
                pretrained_embeddings=pretrained_embeddings,
                freeze_word_embeddings=self.args.freeze_word_embeddings,
                use_char=self.args.use_char,
                use_pos=self.args.use_pos,
                use_local_feat=self.args.use_local_feat,
                char_dim=self.args.char_dim,
                pos_dim=self.args.pos_dim,
                local_feat_dim=self.args.local_feat_dim,
                char_size=self.args.char_size,
                pos_size=self.args.pos_size,
                local_feat_size=self.args.local_feat_size,
                padding_idx=0,
                char_kernel_size=self.args.char_kernel_size,
                dropout=self.args.dropout,
                word_dropout=self.args.word_dropout,
                num_classes=3,
                num_layers=self.args.num_layers)
        else:
            raise ValueError(f"the model {self.args.model} not implemented")
        if not self.args.use_pretrain:
            self.print_write_to_log("[Warning]: not using pretrained embeddings")

        self.model.to(self.device)
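Note: the if/elif dispatch on self.args.model grows with every new
architecture. A sketch of the same lookup as a registry ('MODEL_REGISTRY'
and 'get_model_class' are hypothetical names; each constructor still takes
its own keyword arguments, so only the dispatch is shown):

from esim.model import ESIM
from re2.model import RE2
from cafe.model import CAFE

# Map model names to constructors; unknown names fail with a clear error.
MODEL_REGISTRY = {"esim": ESIM, "re2": RE2, "cafe": CAFE}

def get_model_class(name):
    try:
        return MODEL_REGISTRY[name]
    except KeyError:
        raise ValueError(f"the model {name} is not implemented")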
Example #9
# vocab_size = checkpoint['model']['_word_embedding.weight'].size(0)
# embedding_dim = checkpoint['model']['_word_embedding.weight'].size(1)
vocab_size = checkpoint['model']['word_embedding.weight'].size(0)
embedding_dim = checkpoint['model']['word_embedding.weight'].size(1)
hidden_size = checkpoint['model']['projection.0.weight'].size(0)
num_classes = checkpoint['model']['classification.6.weight'].size(0)

# print(vocab_size)
# print(embedding_dim)
# print(hidden_size)
# print(num_classes)

print("\t* Building model...")
model = ESIM(vocab_size,
             embedding_dim,
             hidden_size,
             num_classes=num_classes,
             device='cuda').to('cuda')

model.load_state_dict(checkpoint['model'])

worddict_path = '/content/drive/My Drive/Research/bert_hotflip/parameter/ESIM/worddict.pkl'
with open(worddict_path, 'rb') as pkl:
    worddict = pickle.load(pkl)

indexdict = {value: key for key, value in worddict.items()}

embedding_grad = None
now_length = None
Example #10
def main(train_file,
         valid_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=3,
         use_pos_tag_flag=0,
         use_oov_flag=0,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None):
    """
    Train the ESIM model on the FEVER dataset.

    Args:
        train_file: A path to some preprocessed data that must be used
            to train the model.
        valid_file: A path to some preprocessed data that must be used
            to validate the model.
        embeddings_file: A path to some preprocessed word embeddings that
            must be used to initialise the model.
        target_dir: The path to a directory where the trained model must
            be saved.
        hidden_size: The size of the hidden layers in the model. Defaults
            to 300.
        dropout: The dropout rate to use in the model. Defaults to 0.5.
        num_classes: The number of classes in the output of the model.
            Defaults to 3.
        use_pos_tag_flag: Whether to use POS-tag features in the model.
            Defaults to 0.
        use_oov_flag: Whether to use out-of-vocabulary features in the
            model. Defaults to 0.
        epochs: The maximum number of epochs for training. Defaults to 64.
        batch_size: The size of the batches for training. Defaults to 32.
        lr: The learning rate for the optimizer. Defaults to 0.0004.
        patience: The patience to use for early stopping. Defaults to 5.
        max_grad_norm: The maximum norm to which gradients are clipped
            during training. Defaults to 10.0.
        checkpoint: A checkpoint from which to continue training. If None,
            training starts from scratch. Defaults to None.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        print('GPU')
    else:
        print('CPU')
    print(20 * "=", " Preparing for training ", 20 * "=")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        train_data = FEVERDataset(pickle.load(pkl))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = FEVERDataset(pickle.load(pkl))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
                     .to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 use_pos_tag_flag=use_pos_tag_flag,
                 use_oov_flag=use_oov_flag,
                 device=device).to(device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]

        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))

        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model, valid_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(
            model, valid_loader, criterion)

        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))

        # Save the model at each epoch.
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    # Plotting of the loss curves for the train and validation sets.
    plt.figure()
    plt.plot(epochs_count, train_losses, "-r")
    plt.plot(epochs_count, valid_losses, "-b")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend(["Training loss", "Validation loss"])
    plt.title("Cross entropy loss")
Example #11
def main(test_files, pretrained_file, labeldict, output_dir, batch_size=32):
    """
    Test the ESIM model with pretrained weights on the MultiNLI dataset.

    Args:
        test_files: The paths to the preprocessed matched and mismatched MNLI
            test sets.
        pretrained_file: The path to a checkpoint produced by the
            'train_mnli' script.
        labeldict: A dictionary associating labels (classes) to integer values.
        output_dir: The path to a directory where the predictions of the model
            must be saved.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    output_dir = os.path.normpath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    checkpoint = torch.load(pretrained_file)

    # Retrieve model parameters from the checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]["_word_embedding.weight"].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    with open(os.path.normpath(test_files["test"]), "rb") as pkl:
        data = pickle.load(pkl)["fr"]
        test_data = NLIDataset(data)

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(
        vocab_size, embedding_dim, hidden_size, num_classes=num_classes, device=device
    ).to(device)

    model.load_state_dict(checkpoint["model"])

    with open(os.path.normpath(test_files["embeddings"]), "rb") as pkl:
        embeddings = pickle.load(pkl)
        tgt_embeddings = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        tgt_embeddings.weight = nn.Parameter(torch.tensor(embeddings))

    # replace model embeddings with xling embeddings from target language
    model._word_embedding = tgt_embeddings.to(device)

    print(
        20 * "=",
        " Prediction on MNLI with ESIM model on device: {} ".format(device),
        20 * "=",
    )

    print("\t* Prediction for test set...")
    predictions = predict(model, test_loader, labeldict)

    with open(os.path.join(output_dir, "predictions.csv"), "w") as output_f:
        output_f.write("pairID,gold_label\n")
        for pair_id in predictions:
            output_f.write(pair_id + "," + predictions[pair_id] + "\n")
Example #12
def main(test_file,
         test_embeddings_file,
         pretrained_file,
         target_file,
         dataset,
         embedding_dim,
         distmult=False,
         distmultPath=None,
         distmultEmbeddingDim=None,
         hidden_size=300,
         num_classes=3,
         epochs=64,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         testing=True,
         multipassiterations=1,
         lstm=False,
         weightedattention=False,
         sentiment=False):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        test_embeddings_file: The path to a file containing preprocessed
            embeddings for the test data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        target_file: The path to the file where the model's outputs must
            be written.
        embedding_dim: The size of the embeddings in the model.
        hidden_size: The size of the hidden layers in the model. Must match
            the size used during training. Defaults to 300.
        num_classes: The number of classes in the output of the model. Must
            match the value used during training. Defaults to 3.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    checkpoint = torch.load(pretrained_file, map_location="cpu")

    # Retrieving model parameters from checkpoint.
    #vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    #embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
    #hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    #num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading validation data...")
    with open(test_file, "rb") as pkl:
        test_data = MedNLIDataset(pickle.load(pkl), batch_size=batch_size)

    print("\t* Loading validation embeddings...")
    with open(test_embeddings_file, "rb") as pkl:
        bleh = pickle.load(pkl)
        test_embeddings = MedNLIEmbedding(bleh, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(embedding_dim,
                 hidden_size,
                 dataset=dataset,
                 distmult=distmult,
                 distmultPath=distmultPath,
                 distmultEmbeddingDim=distmultEmbeddingDim,
                 num_classes=num_classes,
                 multipassiterations=multipassiterations,
                 lstm=lstm,
                 weightedattention=weightedattention,
                 testing=testing,
                 sentiment=sentiment,
                 device=device).to(device)
    print("\t* Model initialized...")
    for idx in range(multipassiterations):
        model.initialize_layers(idx)
    print(model)
    print("\t* Model build...")
    model.load_state_dict(checkpoint["model"])

    print(20 * "=", " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, out_classes, labels, similarity_matrices, probs, all_premises, all_hypotheses = _test(
        model, test_data, test_embeddings, batch_size, testing)

    with open(target_file, 'w') as f:
        data_points = len(test_data)
        f.write("premises_raw" + "\t" + "premises_umls" + "\t" +
                "hypotheses_raw" + "\t" + "hypotheses_umls" + "\t" + "output" +
                "\t" + "gold_label" + "\n")
        count = 0
        for idx1, item1 in enumerate(all_premises):
            for idx2, item2 in enumerate(all_premises[idx1]):
                if out_classes[idx1][idx2] == 0:
                    opt1 = "entailment"
                elif out_classes[idx1][idx2] == 2:
                    opt1 = "contradiction"
                elif out_classes[idx1][idx2] == 1:
                    opt1 = "neutral"
                if labels[idx1][idx2] == 0:
                    opt2 = "entailment"
                elif labels[idx1][idx2] == 2:
                    opt2 = "contradiction"
                elif labels[idx1][idx2] == 1:
                    opt2 = "neutral"
                f.write(" ".join(all_premises[idx1][idx2][0]) + "\t" +
                        " ".join(all_premises[idx1][idx2][1]) + "\t" +
                        " ".join(all_hypotheses[idx1][idx2][0]) + "\t" +
                        " ".join(all_hypotheses[idx1][idx2][1]) + "\t" + opt1 +
                        "\t" + opt2 + "\n")
                # Uncomment to save similarity matrices
                # f.write(" ".join(all_premises[idx1][idx2][0]) + "\t" + " ".join(all_hypotheses[idx1][idx2][0]) + "\t" + opt1 + "\t" + opt2 + "\n")
                # torch.save(similarity_matrices[idx1][idx2],
                #            "/".join(target_file.split("/")[:-1]) + "/" + str(count) + ".tensor")
                count += 1

    print("-> Average batch processing time: {:.4f}s, total test time:\
 {:.4f}s, accuracy: {:.4f}%".format(batch_time, total_time, (accuracy * 100)))
    return out_classes, labels, similarity_matrices, probs
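Note: the two if/elif chains above that turn class indices into label
strings would leave 'opt1'/'opt2' unbound on an unexpected index. A sketch
of the same mapping as a single lookup, assuming the 0/1/2 convention used
in the example:

# Index convention taken from the if/elif chains above.
INDEX_TO_LABEL = {0: "entailment", 1: "neutral", 2: "contradiction"}

opt1 = INDEX_TO_LABEL[int(out_classes[idx1][idx2])]
opt2 = INDEX_TO_LABEL[int(labels[idx1][idx2])]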