Example #1
def main(test_file, vocab_file, embeddings_file, pretrained_file, max_length=50, gpu_index=0, batch_size=128):
    """
    Test the ESIM model with pretrained weights on some dataset.
    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        vocab_size: The number of words in the vocabulary of the model
            being tested.
        embedding_dim: The size of the embeddings in the model.
        hidden_size: The size of the hidden layers in the model. Must match
            the size used during training. Defaults to 300.
        num_classes: The number of classes in the output of the model. Must
            match the value used during training. Defaults to 3.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device("cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    hidden_size = checkpoint["model"]["projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["classification.6.weight"].size(0)
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")    
    test_data = LCQMC_Dataset(test_file, vocab_file, max_length)
    test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
    print("\t* Building model...")
    model = ESIM(hidden_size, embeddings=embeddings, num_classes=num_classes, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(device), 20 * "=")
    batch_time, total_time, accuracy, auc = test(model, test_loader)
    print("\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%, auc: {:.4f}\n".format(batch_time, total_time, (accuracy*100), auc))
Example #2
def train(preproc_dir, n_classes, max_length, hidden_units, dropout,
          batch_size, epochs, output_dir):
    """
    Train the ESIM model on some dataset and save the learned weights.

    Args:
        preproc_dir: The directory where the preprocessed data is saved.
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the sentences in the premises and
                    hypotheses of the dataset.
        hidden_units: The number of hidden units to use in the various layers
                      of the model.
        dropout: The dropout rate to use in the model.
        batch_size: The size of the batches to use for training.
        epochs: The number of epochs to apply during training.
        output_dir: The path to the directory where the weights learned during
                    training must be saved.
    """
    print("Loading training and validation data...")
    train_premises, train_hyps, train_labels = prepare_data(
        preproc_dir, 'train', n_classes, max_length)
    valid_premises, valid_hyps, valid_labels = prepare_data(
        preproc_dir, 'dev', n_classes, max_length)
    # train_premises has the following form (token-id sequences right-padded with zeros):
    # [[5, 6, 7, 8, 9, 3, 10, 11, 12, 13, 14, 2, 15, 16, 3,0,0,0,0],
    #  [17, 18, 19, 20, 21, 22, 4, 23, 2, 24,0,0,0,0,0,0,0,0,0],
    #  [25, 26, 27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]

    print("Loading embedding weights...")
    embedding_weights = load_embeddings(
        os.path.join(preproc_dir, "embedding_weights.pkl"))

    # Build the model.
    esim = ESIM(n_classes, embedding_weights, max_length, hidden_units,
                dropout)
    model = esim.build_model()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    filepath = os.path.join(output_dir,
                            "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')

    model.fit(x=[train_premises, train_hyps],
              y=train_labels,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([valid_premises, valid_hyps], valid_labels),
              callbacks=[checkpoint],
              shuffle=True)
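The comment inside train() above shows premises as fixed-length token-id sequences padded with zeros. As a hedged sketch of the padding step prepare_data presumably performs (the helper below is illustrative, not taken from the source):

import numpy as np

# Right-pad (and truncate) token-id sequences with 0 up to max_length.
def pad_sequences_right(seqs, max_length):
    out = np.zeros((len(seqs), max_length), dtype=np.int64)
    for i, seq in enumerate(seqs):
        trunc = seq[:max_length]
        out[i, :len(trunc)] = trunc
    return out

print(pad_sequences_right([[5, 6, 7], [17, 18]], max_length=6))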
def model_load_test(test_df,
                    vocab_file,
                    embeddings_file,
                    pretrained_file,
                    test_prediction_dir,
                    test_prediction_name,
                    mode,
                    num_labels,
                    max_length=50,
                    gpu_index=0,
                    batch_size=128):

    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    hidden_size = checkpoint["model"]["projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["classification.6.weight"].size(0)
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = ESIM(hidden_size,
                 embeddings=embeddings,
                 num_labels=num_labels,
                 device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, predictions = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n"
        .format(batch_time, total_time, (accuracy * 100)))
    test_prediction = pd.DataFrame({'prediction': predictions})
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir,
                                        test_prediction_name),
                           index=False)
Example #4
def main(args):
    print(20 * "=", " Preparing for training ", 20 * "=")
    if not os.path.exists(args.result):
        os.makedirs(args.result)

    # -------------------- Load pretrained model ------------------- #
    checkpoints = torch.load(args.pretrained_file)
    # Model hyperparameters can be recovered directly from the checkpoint, or defined explicitly beforehand. Retrieving model parameters from checkpoint:
    # hidden_size = checkpoints["model"]["projection.0.weight"].size(0)
    # num_classes = checkpoints["model"]["classification.6.weight"].size(0)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    test_data = LCQMC_dataset(args.test_file,
                              args.vocab_file,
                              args.max_length,
                              test_flag=True)
    test_loader = DataLoader(test_data, batch_size=args.batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)
    model.load_state_dict(checkpoints["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(args.device),
          20 * "=")
    all_predict = predict(model, test_loader)
    index = np.arange(len(all_predict))
    # --------------------- Generate the submission file --------------------------
    df_test = pd.DataFrame(columns=['index', 'prediction'])
    df_test['index'] = index
    df_test['prediction'] = all_predict
    df_test.to_csv(args.submit_example_path,
                   index=False,
                   columns=['index', 'prediction'],
                   sep='\t')
def load(args, checkpoint_dir):
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pth'))
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if 'module' in k:
            namekey = k[7:]  # remove `module.`
        else:
            namekey = k
        new_state_dict[namekey] = v

    if args.model_type == 'bert':
        config = BertConfig.from_json_file(
            os.path.join(checkpoint_dir, 'config.bin'))
        model = BertForSequenceClassification(config)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'bow':
        model = BOWModel(new_state_dict['embedding.weight'],
                         n_vocab=args.vocab_size,
                         embed_size=args.embed_size,
                         hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'decom_att':
        model = DecompAttentionModel(args.word_mat,
                                     n_vocab=args.vocab_size,
                                     embed_size=args.embed_size,
                                     hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'esim':
        model = ESIM(vocab_size=args.vocab_size,
                     embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size,
                     embeddings=None,
                     padding_idx=0,
                     dropout=0.1,
                     num_classes=args.num_labels,
                     device=args.device)
        model.load_state_dict(new_state_dict)
    else:
        raise ValueError('model type is not found!')

    return model.to(args.device)
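The 'module.' stripping in load() above exists because torch.nn.DataParallel registers the wrapped network under a submodule named "module", so its state_dict keys carry that prefix. A small self-contained demonstration (toy model, not from the source):

import torch
import torch.nn as nn
from collections import OrderedDict

net = nn.Linear(4, 2)
wrapped = nn.DataParallel(net)
print(list(wrapped.state_dict().keys()))  # ['module.weight', 'module.bias']

# Strip the prefix so the weights load into an unwrapped model.
stripped = OrderedDict((k.replace("module.", "", 1), v)
                       for k, v in wrapped.state_dict().items())
net.load_state_dict(stripped)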
Example #6
def main(train_file,
         dev_file,
         vocab_file,
         target_dir,
         max_length=50,
         hidden_size=300,
         dropout=0.2,
         num_classes=2,
         epochs=1,
         batch_size=256,
         lr=0.0005,
         patience=5,
         max_grad_norm=10.0,
         gpu_index=0,
         checkpoint=None):
    #device = torch.device("cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_Dataset(train_file, vocab_file, max_length)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = LCQMC_Dataset(dev_file, vocab_file, max_length)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    # embeddings = load_embeddings(embeddings_file)
    model = ESIM(hidden_size,
                 dropout=dropout,
                 num_labels=num_classes,
                 device=device).to(device)
    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, auc = validate(model, dev_loader, criterion)
    print(
        "\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
        .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print(
            "-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
            .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.

        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))
        # Save the model at each epoch.
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
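Several of these examples share the same scheduler and early-stopping pattern: ReduceLROnPlateau in "max" mode is stepped with the validation accuracy, and a patience counter stops training once accuracy stops improving. A minimal runnable sketch of just that loop (the accuracy values below are made up for illustration):

import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max",
                                                       factor=0.85, patience=0)
best_score, patience_counter, patience = 0.0, 0, 5
for epoch, val_acc in enumerate([0.70, 0.72, 0.71, 0.71, 0.73], start=1):
    scheduler.step(val_acc)               # reduce LR when accuracy plateaus
    if val_acc < best_score:
        patience_counter += 1             # no improvement this epoch
    else:
        best_score, patience_counter = val_acc, 0
    if patience_counter >= patience:
        break                             # early stopping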
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--embeddings_file",
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument("--output_dir", default=None, type=str, required=True)
    parser.add_argument("--train_language",
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument("--train_steps", default=-1, type=int, required=True)
    parser.add_argument("--eval_steps", default=-1, type=int, required=True)
    parser.add_argument(
        "--load_word2vec",
        action='store_true',
        help=
        'if true, load word2vec file for the first time; if false, load generated word-vector csv file'
    )
    parser.add_argument("--generate_word2vec_csv",
                        action='store_true',
                        help='if true, generate word2vec csv file')
    ## normal parameters
    parser.add_argument("--embedding_size", default=300, type=int)
    parser.add_argument("--query_maxlen", default=30, type=int)
    parser.add_argument("--hidden_size", default=300, type=int)
    parser.add_argument("--learning_rate",
                        default=5e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_classes", default=2, type=int)
    parser.add_argument("--dropout", default=0.2, type=float)
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_eval_train",
                        action='store_true',
                        help="Whether to run eval on the train set.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=10, type=int)
    parser.add_argument("--per_gpu_train_batch_size", default=10, type=int)
    parser.add_argument("--seed", default=1, type=int)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--gradient_accumulation_steps", default=1, type=int)

    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # device = torch.device("cpu")
    args.device = device

    # Set seed
    set_seed(args)

    logger.info("Training/evaluation parameters %s", args)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Training
    if args.do_train:
        # build model
        logger.info("*** building model ***")
        embeddings = load_embeddings(args)
        model = ESIM(args.hidden_size,
                     embeddings=embeddings,
                     dropout=args.dropout,
                     num_classes=args.num_classes,
                     device=args.device)
        model.to(args.device)

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        args.train_batch_size = args.per_gpu_train_batch_size * max(
            1, args.n_gpu)

        logger.info("*** Loading training data ***")
        train_data = ATEC_Dataset(os.path.join(args.data_dir, 'train.csv'),
                                  os.path.join(args.data_dir, 'vocab.csv'),
                                  args.query_maxlen)
        train_loader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=args.train_batch_size)

        logger.info("*** Loading validation data ***")
        dev_data = ATEC_Dataset(os.path.join(args.data_dir, 'dev.csv'),
                                os.path.join(args.data_dir, 'vocab.csv'),
                                args.query_maxlen)
        dev_loader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.eval_batch_size)

        num_train_optimization_steps = args.train_steps

        # Keep only the parameters that require gradient updates
        parameters = filter(lambda p: p.requires_grad, model.parameters())
        # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
        optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)
        # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               mode="max",
                                                               factor=0.85,
                                                               patience=0)
        criterion = nn.CrossEntropyLoss()

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d",
                    args.gradient_accumulation_steps)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        train_loader = cycle(train_loader)

        output_dir = args.output_dir + "eval_results_{}_{}_{}_{}_{}_{}".format(
            'ESIM', str(args.query_maxlen), str(args.learning_rate),
            str(args.train_batch_size), str(args.train_language),
            str(args.train_steps))
        os.makedirs(output_dir, exist_ok=True)
        output_eval_file = os.path.join(output_dir, 'eval_result.txt')
        with open(output_eval_file, "w") as writer:
            writer.write('*' * 80 + '\n')
        for step in bar:
            batch = next(train_loader)
            batch = tuple(t.to(device) for t in batch)
            q1, q1_lens, q2, q2_lens, labels = batch
            # Standard training step
            optimizer.zero_grad()
            logits, probs = model(q1, q1_lens, q2, q2_lens)
            loss = criterion(logits, labels)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += q1.size(0)
            nb_tr_steps += 1

            loss.backward()
            # Adversarial training (FGM), currently disabled:
            # fgm.attack()  # add adversarial perturbation to the embeddings
            # loss_adv = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
            # if args.n_gpu > 1:
            #     loss_adv = loss_adv.mean() # mean() to average on multi-gpu.
            # if args.gradient_accumulation_steps > 1:
            #     loss_adv = loss_adv / args.gradient_accumulation_steps
            # loss_adv.backward()  # backprop, accumulating the adversarial gradients on top of the normal gradients
            # fgm.restore()  # restore the embedding parameters

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                #                 scheduler.step()
                optimizer.step()
                global_step += 1

            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if args.do_eval and (step + 1) % (
                    args.eval_steps * args.gradient_accumulation_steps) == 0:
                if args.do_eval_train:
                    file_list = ['train.csv', 'dev.csv']
                else:
                    file_list = ['dev.csv']
                for file in file_list:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(dev_data))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for q1, q1_lens, q2, q2_lens, labels in tqdm(dev_loader):
                        with torch.no_grad():
                            logits, probs = model(q1, q1_lens, q2, q2_lens)
                        probs = probs.detach().cpu().numpy()
                        # print(logits.shape, probs.shape)
                        # label_ids = labels.to('cpu').numpy()
                        inference_labels.append(np.argmax(probs, 1))
                        gold_labels.append(labels)
                        # eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += logits.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_labels = np.concatenate(inference_labels, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = get_f1(inference_labels, gold_labels)

                    result = {
                        # 'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy,
                        'global_step': global_step,
                        'train_loss': train_loss
                    }

                    if 'dev' in file:
                        with open(output_eval_file, "a") as writer:
                            writer.write(file + '\n')
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" %
                                             (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best ACC", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
        with open(output_eval_file, "a") as writer:
            writer.write('best_acc: %f' % best_acc)

    if args.do_test:
        if not args.do_train:
            output_dir = args.output_dir

        # build model
        logger.info("*** building model ***")
        embeddings = load_embeddings(args)
        model = ESIM(args.hidden_size,
                     embeddings=embeddings,
                     dropout=args.dropout,
                     num_classes=args.num_classes,
                     device=args.device)
        model.load_state_dict(
            torch.load(os.path.join(output_dir, 'pytorch_model.bin')))
        model.to(args.device)

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        inference_labels = []
        gold_labels = []

        logger.info("*** Loading testing data ***")
        dev_data = ATEC_Dataset(os.path.join(args.data_dir, 'test.csv'),
                                os.path.join(args.data_dir, 'vocab.csv'),
                                args.query_maxlen)
        dev_loader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.eval_batch_size)

        logger.info(" *** Run Prediction ***")
        logger.info("  Num examples = %d", len(dev_data))
        logger.info("  Batch size = %d", args.eval_batch_size)

        model.eval()
        for q1, q1_lens, q2, q2_lens, labels in tqdm(dev_loader):
            with torch.no_grad():
                logits, probs = model(q1, q1_lens, q2, q2_lens)
            probs = probs.detach().cpu().numpy()
            inference_labels.append(np.argmax(probs, 1))
            gold_labels.append(labels)

        gold_labels = np.concatenate(gold_labels, 0)
        logits = np.concatenate(inference_labels, 0)
        test_f1 = get_f1(logits, gold_labels)
        logger.info('predict f1:{}'.format(str(test_f1)))
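The training loop above divides the loss by gradient_accumulation_steps and only calls optimizer.step() every N batches, which simulates a larger effective batch size. A stripped-down sketch of that scheme with random data (all names here are illustrative, not from the source):

import torch

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()
accum_steps = 4
optimizer.zero_grad()
for step in range(16):
    x = torch.randn(10, 8)
    y = torch.randint(0, 2, (10,))
    loss = criterion(model(x), y) / accum_steps   # scale so the accumulated gradient averages out
    loss.backward()                               # gradients accumulate across steps
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()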
Example #8
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.device != -1:
        torch.cuda.manual_seed(args.seed)

    logger = get_logger()
    logger.info(pprint.pformat(vars(args)))

    dataset_cls, embedding, train_loader, test_loader, dev_loader \
        = DatasetFactory.get_dataset(args.dataset, args.word_vectors_dir, args.word_vectors_file, args.batch_size, args.device)

    filter_widths = list(range(1, args.max_window_size + 1)) + [np.inf]
    ext_feats = dataset_cls.EXT_FEATS if args.sparse_features else 0

    model = ESIM(embedding_size=args.word_vectors_dim, device=device, num_units=args.word_vectors_dim,
                  num_classes=dataset_cls.NUM_CLASSES, dropout=args.dropout, max_sentence_length=args.maxlen)

    model = model.to(device)
    embedding = embedding.to(device)

    optimizer = None
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.regularization, eps=args.epsilon)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.regularization)
    else:
        raise ValueError('optimizer not recognized: it should be either adam or sgd')

    train_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, train_loader, args.batch_size,
                                                     args.device)
    test_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, test_loader, args.batch_size,
Example #9
def main(train_file,
         valid_file,
         embeddings_file,
         target_dir,
         hidden_size=300,
         dropout=0.5,
         num_classes=3,
         epochs=50,
         batch_size=32,
         lr=0.0004,
         patience=5,
         max_grad_norm=10.0,
         checkpoint=None):
    """
	Train the ESIM model on the SNLI dataset.
	Args:
		train_file: A path to some preprocessed data that must be used
			to train the model.
		valid_file: A path to some preprocessed data that must be used
			to validate the model.
		embeddings_file: A path to some preprocessed word embeddings that
			must be used to initialise the model.
		target_dir: The path to a directory where the trained model must
			be saved.
		hidden_size: The size of the hidden layers in the model. Defaults
			to 300.
		dropout: The dropout rate to use in the model. Defaults to 0.5.
		num_classes: The number of classes in the output of the model.
			Defaults to 3.
		epochs: The maximum number of epochs for training. Defaults to 64.
		batch_size: The size of the batches for training. Defaults to 32.
		lr: The learning rate for the optimizer. Defaults to 0.0004.
		patience: The patience to use for early stopping. Defaults to 5.
		checkpoint: A checkpoint from which to continue training. If None,
			training starts from scratch. Defaults to None.
	"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print(20 * "=", " Preparing for training ", 20 * "=")
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    with open(train_file, "rb") as pkl:
        train_data = NLIDataset(pickle.load(pkl))
    print("Training data length: ", len(train_data))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    print("\t* Loading validation data...")
    with open(valid_file, "rb") as pkl:
        valid_data = NLIDataset(pickle.load(pkl))
    print("Validation data length: ", len(valid_data))

    valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

    # print(train_loader)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float)\
            .to(device)

    print(embeddings.size())

    esim_model = ESIM(embeddings.shape[0], embeddings.shape[1], hidden_size,
                      embeddings, dropout, num_classes, device).to(device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(esim_model.parameters(),
                           lr=lr,
                           weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.5,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(esim_model,
                                                       train_loader, optimizer,
                                                       criterion, epoch,
                                                       max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
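The max_grad_norm argument threaded through these training functions is presumably applied inside train() as global gradient-norm clipping before each optimizer step. A hedged, self-contained sketch of that single step (toy model and data, not from the source):

import torch
from torch.nn.utils import clip_grad_norm_

model = torch.nn.Linear(4, 3)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=4e-4)

x, y = torch.randn(8, 4), torch.randint(0, 3, (8,))
optimizer.zero_grad()
criterion(model(x), y).backward()
clip_grad_norm_(model.parameters(), max_norm=10.0)  # clip the global gradient norm
optimizer.step()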
Example #10
def main():
    device = args.device
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)
    # -------------------- Data loading ------------------- #
    print("Loading data......")
    train_loader, dev_loader, test_loader, SEN1, SEN2 = load_data(
        args.batch_size, device)
    embedding = SEN1.vectors
    vocab_size = len(embedding)
    print("vocab_size:", vocab_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    model = ESIM(args.hidden_size,
                 embedding=embedding,
                 dropout=args.dropout,
                 num_labels=args.num_classes,
                 device=device).to(device)
    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="min",
                                                           factor=0.1,
                                                           patience=10)

    best_score = 0.0
    if args.ckp:
        checkpoint = torch.load(os.path.join(args.target_dir, args.ckp))
        best_score = checkpoint["best_score"]
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        _, valid_loss, valid_accuracy = validate(model, dev_loader, criterion)
        print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
              format(valid_loss, (valid_accuracy * 100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(args.num_epoch):
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       args.max_grad_norm,
                                                       device)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(
            model, dev_loader, criterion, device)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            print("save model!!!!")
            best_score = epoch_accuracy
            patience_counter = 0
            torch.save(
                {
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                }, os.path.join(args.target_dir, "best.pth.tar"))

        if patience_counter >= 5:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    # ##-------------------- Testing epochs ------------------- #
    # print(20 * "=", " Testing ", 20 * "=")
    # if args.ckp:
    #     checkpoint = torch.load(os.path.join(args.target_dir, args.ckp))
    #     best_score = checkpoint["best_score"]
    #     model.load_state_dict(checkpoint["model"])
    #     optimizer.load_state_dict(checkpoint["optimizer"])
    #
    # print("best_score:", best_score)
    # all_labels = test(model, test_loader, device)
    # print(all_labels[:10])
    # target_label = [id2label[id] for id in all_labels]
    # print(target_label[:10])
    # with open(os.path.join(args.target_dir, 'result.txt'), 'w+') as f:
    #     for label in target_label:
    #         f.write(label + '\n')

    del train_loader
    del dev_loader
    del test_loader
    del SEN1
    del SEN2
    del embedding
Example #11
def main(args):
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_dataset(args.train_file, args.vocab_file, args.max_length, test_flag=False)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    print("\t* Loading valid data...")
    dev_data = LCQMC_dataset(args.dev_file, args.vocab_file, args.max_length, test_flag=False)
    dev_loader = DataLoader(dev_data, batch_size=args.batch_size, shuffle=True)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=args.lr)  # optimizer
    # Learning-rate schedule
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                           factor=0.85, patience=0)

    best_score = 0.0
    start_epoch = 1

    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if args.checkpoint:
        # Load checkpoint data from file to resume training the model
        checkpoints = torch.load(args.checkpoint)
        start_epoch = checkpoints["epoch"] + 1
        best_score = checkpoints["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".format(start_epoch))
        model.load_state_dict(checkpoints["model"])  # model weights
        optimizer.load_state_dict(checkpoints["optimizer"])
        epochs_count = checkpoints["epochs_count"]
        train_losses = checkpoints["train_losses"]
        valid_losses = checkpoints["valid_losses"]

        # Changed so that the validation loss/accuracy below are only computed when resuming from a checkpoint. Compute loss and accuracy before resuming training.
        _, valid_loss, valid_accuracy, auc = validate(model, dev_loader, criterion)
        print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
              .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(args.device), 20 * "=")
    patience_counter = 0

    for epoch in range(start_epoch, args.epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer,
                                                       criterion, epoch, args.max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best result; all of these entries can also be found in the per-epoch checkpoints
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                os.path.join(args.target_dir, "new_best.pth.tar"))
        # Save the model at each epoch (optional).
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses},
            os.path.join(args.target_dir, "new_esim_{}.pth.tar".format(epoch)))

        if patience_counter >= args.patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
Example #12
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    # Load dictionary
    assert os.path.exists(args.train_data)
    assert os.path.exists(args.val_data)

    dictionary = Dictionary(join_path(data_dir,'data/atec_nlp_sim_train.csv'))
    args.vocab_size = len(dictionary)
    best_val_loss = None
    best_f1 = None
    n_token = len(dictionary)
    model = ESIM(args)
    if torch.cuda.is_available():
        model = model.cuda()
    print(model)

    print('Begin to load data.')
    train_data = MyDataset(args.train_data, args.sequence_length, dictionary.word2idx, args.char_model)
    val_data = MyDataset(args.val_data, args.sequence_length, dictionary.word2idx, args.char_model)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=16)
    val_loader = DataLoader(val_data, batch_size=1, shuffle=False)
    try:
        for epoch in range(args.epochs):
            train(train_loader, val_loader, model, args)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exit from training early.')
Example #13
# train_hypo = arrayToTensor(h_trian)
# eval_prem = arrayToTensor(p_eval)
# eval_hypo = arrayToTensor(p_eval)

# Build the datasets
train_dataset = tf.data.Dataset.from_tensor_slices((p_train, h_trian, y_trian))
eval_dataset = tf.data.Dataset.from_tensor_slices((p_eval, h_eval, y_eval))

# Shuffle and split into batches
train_dataset = train_dataset.shuffle(len(p_train)).batch(args.batch_size,
                                                          drop_remainder=True)
eval_dataset = eval_dataset.shuffle(len(p_eval)).batch(args.batch_size,
                                                       drop_remainder=True)

# Build the model
model = ESIM()

# Initialise the optimizer
optimizer = tf.keras.optimizers.Adam(args.lr)

# Metrics for the text-matching model: sparse categorical accuracy and cross-entropy
train_metric = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')
# loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_loss = tf.keras.metrics.SparseCategoricalCrossentropy(name='train_loss')

# Initialise checkpointing (save/restore mechanism)
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.model_path))
ckpt_manager = tf.train.CheckpointManager(ckpt,
                                          args.model_path,
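For context on the tf.data pipeline above: from_tensor_slices pairs the premise, hypothesis and label tensors element-wise, shuffle randomises their order, and batch(..., drop_remainder=True) yields fixed-size batches. A tiny self-contained demo with toy tensors (shapes are assumptions for illustration):

import tensorflow as tf

p = tf.constant([[1, 2, 0], [3, 4, 5], [6, 0, 0], [7, 8, 0]])
h = tf.constant([[9, 0, 0], [1, 1, 0], [2, 3, 0], [4, 5, 6]])
y = tf.constant([0, 1, 1, 0])
ds = tf.data.Dataset.from_tensor_slices((p, h, y)).shuffle(4).batch(2, drop_remainder=True)
for batch_p, batch_h, batch_y in ds:
    print(batch_p.shape, batch_y.numpy())   # (2, 3) premises per batch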
def model_train_validate_test(train_df,
                              dev_df,
                              test_df,
                              embeddings_file,
                              vocab_file,
                              target_dir,
                              mode,
                              num_labels=2,
                              max_length=50,
                              hidden_size=200,
                              dropout=0.2,
                              epochs=50,
                              batch_size=256,
                              lr=0.0005,
                              patience=5,
                              max_grad_norm=10.0,
                              gpu_index=0,
                              if_save_model=False,
                              checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if (embeddings_file is not None):
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = ESIM(hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_labels=num_labels,
                 device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _, = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _, = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0

            if (if_save_model):
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    }, os.path.join(target_dir, "best.pth.tar"))

                print("save model succesfully!\n")

            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(
                model, test_loader, criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
Example #15
        seq2_list = []
        target_list = []
        with open(url, "r", encoding="utf8") as f:
            for item in jsonlines.Reader(f):
                seq1_list.append(item["sentence1"])
                seq2_list.append(item["sentence2"])
                target_list.append(item["gold_label"])
        examples = [data.Example.fromlist(list(item), fields) for item in zip(seq1_list, seq2_list, target_list)]
        super(MyDataset, self).__init__(examples, fields, **kwargs)
        
        
if __name__ == "__main__":
    args = parser.parse_args()
    seq_field = data.Field(lower=True)
    target_field = data.Field(unk_token=None, pad_token=None)
    logging.info("Start prepare dataset")
    train_dataset = MyDataset(args.train_url, seq_field, target_field)
    valid_dataset = MyDataset(args.valid_url, seq_field, target_field)
    seq_field.build_vocab(train_dataset, valid_dataset)
    target_field.build_vocab(train_dataset, valid_dataset)
    train_iter, valid_iter = data.Iterator.splits((train_dataset, valid_dataset),
                                                  batch_sizes=(args.batch_size, args.batch_size),
                                                  repeat=False)
    args.class_num = len(target_field.vocab)
    args.embed_num = len(seq_field.vocab)
    logging.debug(target_field.vocab.stoi)
    logging.info("Success")
    model = ESIM(args)
    train.train(train_iter, valid_iter, model, args)    
    
Example #16
def main(test_q1_file,
         test_q2_file,
         test_labels_file,
         pretrained_file,
         gpu_index=0,
         batch_size=64):
    """
    Test the ESIM model with pretrained weights on some dataset.
    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        vocab_size: The number of words in the vocabulary of the model
            being tested.
        embedding_dim: The size of the embeddings in the model.
        hidden_size: The size of the hidden layers in the model. Must match
            the size used during training. Defaults to 300.
        num_classes: The number of classes in the output of the model. Must
            match the value used during training. Defaults to 3.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for testing ", 20 * "=")

    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location="cuda:0")

    # Retrieving model parameters from checkpoint.
    vocab_size = checkpoint["model"]["word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]['word_embedding.weight'].size(1)
    hidden_size = checkpoint["model"]["projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["classification.6.weight"].size(0)

    print("\t* Loading test data...")
    test_q1 = np.load(test_q1_file)
    test_q2 = np.load(test_q2_file)
    test_labels = np.load(test_labels_file)
    #    test_labels = label_transformer(test_labels)

    test_data = {"q1": test_q1, "q2": test_q2, "labels": test_labels}

    test_data = QQPDataset(test_data)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)

    model.load_state_dict(checkpoint["model"])

    print(20 * "=", " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy = test(model, test_loader)

    print()
    print(
        "-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%"
        .format(batch_time, total_time, (accuracy * 100)))
    print()
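The test() helper called above is not part of this example. A minimal sketch consistent with the three values it returns (average batch time, total time, accuracy) could look like the following; the batch keys and the (logits, probabilities) return value of the model are assumptions, not the original implementation.

import time

import torch


def test(model, dataloader):
    # Sketch only: time each batch and accumulate accuracy.
    model.eval()
    device = next(model.parameters()).device
    batch_time, correct, total = 0.0, 0, 0
    start = time.time()
    with torch.no_grad():
        for batch in dataloader:
            batch_start = time.time()
            q1 = batch["q1"].to(device)
            q2 = batch["q2"].to(device)
            labels = batch["labels"].to(device)
            _, probs = model(q1, q2)  # assumed forward signature
            correct += (probs.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
            batch_time += time.time() - batch_start
    total_time = time.time() - start
    return batch_time / len(dataloader), total_time, correct / total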
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument('--data_dir',
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir.")
    parser.add_argument(
        '--model_type',
        default=None,
        type=str,
        required=True,
        help="Model type selected in [bert, xlnet, xlm, bow, decom_att]")
    parser.add_argument(
        '--model_name_or_path',
        default='bert-base-uncased',
        type=str,
        help="Shortcut name is selected in [bert-base-uncased, ]")
    parser.add_argument('--task_name',
                        default='snli',
                        type=str,
                        help="The name of task is selected in [snli]")
    parser.add_argument(
        '--output_dir',
        default='../out',
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    # other parameters
    parser.add_argument("--cache_dir",
                        default='../cache',
                        type=str,
                        help="Store the cache files.")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm. Avoiding over-fitting.")
    parser.add_argument("--num_train_epochs",
                        default=60,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--seed",
                        default=42,
                        type=int,
                        help="Random seed for initializaiton.")
    parser.add_argument("--train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--eval",
                        action='store_true',
                        help="Whether to run eval on dev set.")
    parser.add_argument("--ckpt",
                        default=-1,
                        type=int,
                        help="Which ckpt to load.")
    parser.add_argument("--from_scratch",
                        action='store_true',
                        help="Whether to train from scratch.")
    parser.add_argument("--train_type",
                        default='normal',
                        type=str,
                        help="Train type is selected in [normal, rs].")

    args = parser.parse_args()

    if not os.path.exists(args.data_dir):
        raise ValueError("input data dir is not exist.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("model type: %s, task name: %s, device: %s",
                   args.model_type, args.task_name, device)

    # set seed
    set_seed(args)
    # Prepare task
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)

    task_class = processors[args.task_name]()
    label_list = task_class.get_labels()
    num_labels = len(label_list)
    args.num_labels = num_labels
    # load vocab.
    if args.model_type != 'bert':
        if os.path.exists(args.cache_dir +
                          '/{}_vocab_train.pkl'.format(args.task_name)):
            with open(
                    args.cache_dir +
                    '/{}_vocab_train.pkl'.format(args.task_name), 'rb') as f:
                vocab = pickle.load(f)
            index2word = vocab['index2word']
            word2index = vocab['word2index']
            word_mat = vocab['word_mat']
        else:
            glove_path = '../data/glove/glove.840B.300d.txt'
            index2word, word2index, word_mat = load_vocab(args, glove_path)
        args.word_mat = word_mat
        args.vocab_size = len(index2word)

    # load model.
    model = None
    if args.model_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                                  do_lower_case=True)
        args.vocab_size = tokenizer.vocab_size
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
    elif args.model_type == 'bow':
        args.embed_size = 300
        args.hidden_size = 100
        model = BOWModel(word_mat,
                         n_vocab=args.vocab_size,
                         embed_size=args.embed_size,
                         hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
    elif args.model_type == 'decom_att':  # Not used
        args.embed_size = 300
        args.hidden_size = 100
        model = DecompAttentionModel(word_mat,
                                     n_vocab=args.vocab_size,
                                     embed_size=args.embed_size,
                                     hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
    elif args.model_type == 'esim':
        args.embed_size = 300
        args.hidden_size = 100
        model = ESIM(vocab_size=args.vocab_size,
                     embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size,
                     embeddings=torch.tensor(word_mat).float(),
                     padding_idx=0,
                     dropout=0.1,
                     num_classes=args.num_labels,
                     device=args.device)
    else:
        raise ValueError("Unknown model type: {}".format(args.model_type))
    model.to(device)
    logger.info("Training/evaluation parameters %s", args)

    # Create output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Create cache directory if needed
    if not os.path.exists(args.cache_dir):
        os.makedirs(args.cache_dir)

    train_dataset = None
    eval_dataset = None
    test_dataset = None
    if args.train:
        train_dataset = load_and_cache_normal_example(
            args, word2index, 'train') if args.model_type not in [
                'bert'
            ] else load_and_cache_bert_example(args, tokenizer, 'train')
        eval_dataset = load_and_cache_normal_example(
            args, word2index, 'eval') if args.model_type not in [
                'bert'
            ] else load_and_cache_bert_example(args, tokenizer, 'eval')
    if args.eval:
        test_dataset = load_and_cache_normal_example(
            args, word2index, 'test') if args.model_type not in [
                'bert'
            ] else load_and_cache_bert_example(args, tokenizer, 'test')

    # Training
    if args.train:
        if args.from_scratch:  # default False
            global_step, train_loss = normal_train(args, model, train_dataset,
                                                   eval_dataset)
        else:
            if args.ckpt < 0:
                checkpoints = glob.glob(args.output_dir +
                                        '/normal_{}_{}_checkpoint-*'.format(
                                            args.task_name, args.model_type))
                checkpoints.sort(key=lambda x: int(x.split('-')[-1]))
                checkpoint = checkpoints[-1]
                ckpt = int(checkpoint.split('-')[-1])
            else:
                checkpoint = os.path.join(
                    args.output_dir, 'normal_{}_{}_checkpoint-{}'.format(
                        args.task_name, args.model_type, args.ckpt))
                ckpt = args.ckpt
            model = load(args, checkpoint)
            print("Load model from {}".format(checkpoint))
            global_step, train_loss = normal_train(args, model, train_dataset,
                                                   eval_dataset, ckpt + 1)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    train_loss)

    # Evaluation
    if args.eval:
        if args.ckpt < 0:
            checkpoints = glob.glob(
                args.output_dir + '/{}_{}_{}_checkpoint-*'.format(
                    args.train_type, args.task_name, args.model_type))
            checkpoints.sort(key=lambda x: int(x.split('-')[-1]))
            checkpoint = checkpoints[-1]
        else:
            checkpoint = os.path.join(
                args.output_dir,
                '{}_{}_{}_checkpoint-{}'.format(args.train_type,
                                                args.task_name,
                                                args.model_type, args.ckpt))
        model = load(args, checkpoint)
        print("Evaluation result, load model from {}".format(checkpoint))
        acc = evaluate(args, model, test_dataset)
        print("acc={:.4f}".format(acc))
Beispiel #18
0
def main():
    parser = ArgumentParser()
    parser.add_argument("--epoch", type=int, required=True)
    parser.add_argument("--seed", type=int, required=True)
    parser.add_argument("--emb_file", type=str, required=True)
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--save_dir", type=str, required=True)
    parser.add_argument("--train_file", type=str, required=True)
    parser.add_argument("--log_file", type=str, required=False)
    parser.add_argument("--ratio", type=str, required=True)
    parser.add_argument("--vocab_size", type=int, required=True)
    parser.add_argument("--emb_size", type=int, required=True)
    parser.add_argument("--learning_rate", type=float, required=True)
    parser.add_argument("--batch_size", type=int, required=True)
    parser.add_argument("--max_length", type=int, required=True)
    parser.add_argument("--max_grad_norm", type=int, required=True)

    args = parser.parse_args()

    split_ratio = [float(val) for val in args.ratio.split(",")]

    has_cuda = torch.cuda.is_available()

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
    DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
    logging.basicConfig(filename=args.log_file,
                        level=logging.INFO,
                        format=LOG_FORMAT,
                        datefmt=DATE_FORMAT)

    logging.info("start preparing data")
    data_preprocessor = DataPreprocess()
    emb, word_idx_map = data_preprocessor.build_emb_vocab(args.emb_file)
    data_preprocessor.load(args.train_file, use_mask=False, is_test=False)
    train_dataset, dev_dataset = data_preprocessor.generate_train_dev_dataset(
        ratio=split_ratio)
    train_dataset, dev_dataset = CompDataSet(
        train_dataset,
        word_idx_map,
        max_len=args.max_length,
        emb_size=args.emb_size), CompDataSet(dev_dataset,
                                             word_idx_map,
                                             max_len=args.max_length,
                                             emb_size=args.emb_size)

    train_dataset = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               shuffle=True)
    dev_dataset = DataLoader(dev_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)

    logging.info("init model")
    start_epoch = 0
    if args.checkpoint:
        model = torch.load(args.checkpoint)
        start_epoch = re.findall(r"\d+(?=_\d+\.pt)", args.checkpoint)
        start_epoch = int(start_epoch[0]) + 1
    else:
        model = ESIM(args.vocab_size,
                     args.emb_size,
                     emb,
                     max_len=args.max_length)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    criterion = FocalLoss()

    if has_cuda:
        model = model.cuda()

    logging.info("start training")
    neg_auc, pos_auc = validate(model, dev_dataset)
    logging.info(f"pre-train neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")

    for epoch in range(start_epoch, args.epoch):
        running_loss = 0.0
        for step, data in enumerate(train_dataset):
            model.train()
            start_time = time.time()
            optimizer.zero_grad()

            outputs = model(data["premise"], data["premise_mask"],
                            data["hypothese"], data["hypothese_mask"])
            loss = criterion(outputs["probs"], data["label"])
            loss.backward()

            clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()

            end_time = time.time()
            running_loss += loss.item()
            if step % 100 == 99:
                logging.info(
                    f"epoch: {epoch}, step: {step}, time: {end_time - start_time} loss: {running_loss / 100}"
                )
                running_loss = 0
            if step % 500 == 499:
                neg_auc, pos_auc = validate(model, dev_dataset)
                logging.info(
                    f"epoch: {epoch}, step: {step}, neg_auc {str(neg_auc)} pos_auc {str(pos_auc)}")
                torch.save(model, Path(args.save_dir) / f"{epoch}_{step}.pt")
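The FocalLoss criterion instantiated in this example is not shown, and it appears again in the example below. A minimal sketch following the standard formulation FL(p_t) = -(1 - p_t)^gamma * log(p_t) is given here; the constructor arguments and whether the real class expects logits or probabilities vary across the examples, so treat the interface as an assumption.

import torch.nn as nn
import torch.nn.functional as F


class FocalLoss(nn.Module):
    # Sketch only: multi-class focal loss over logits.
    def __init__(self, num_classes=2, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.num_classes = num_classes
        self.gamma = gamma

    def forward(self, logits, targets):
        log_probs = F.log_softmax(logits, dim=1)
        probs = log_probs.exp()
        # Probability assigned to the gold class of each sample.
        pt = probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        log_pt = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        return (-((1.0 - pt) ** self.gamma) * log_pt).mean()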
Beispiel #19
0
HIDDEN_DIM = 100
LINEAR_SIZE = 200
DROPOUT = 0.5
BATCH_SIZE = 128

device = torch.device('cuda')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text1),
    device=device,
    shuffle=True)

pretrained_embeddings = TEXT.vocab.vectors
model = ESIM(pretrained_embeddings, VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM,
             LINEAR_SIZE, DROPOUT)

optimizer = optim.Adam(model.parameters())
criterion = FocalLoss(2)
model = model.to(device)
criterion = criterion.to(device)


def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(
        dim=1, keepdim=True)  # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])
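The training loop for this setup is not included in the snippet. An illustrative epoch function built around categorical_accuracy could look like this; the batch attribute names (text1, text2, label) and the model's forward signature are assumptions inferred from the BucketIterator sort_key.

def train_epoch(model, iterator, optimizer, criterion):
    # Sketch only: one training epoch over a BucketIterator.
    model.train()
    epoch_loss, epoch_acc = 0.0, 0.0
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text1, batch.text2)  # assumed signature
        loss = criterion(predictions, batch.label)
        acc = categorical_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)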
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of task is selected in [imdb, amazon]")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir.")
    parser.add_argument("--cache_dir",
                        default='../cache',
                        type=str,
                        help="The cache data dir.")
    parser.add_argument(
        '--model_type',
        default=None,
        type=str,
        required=True,
        help="Model type selected in [bert, xlnet, xlm, cnn, lstm]")
    parser.add_argument(
        '--model_name_or_path',
        default='bert-base-uncased',
        type=str,
        help="Shortcut name is selected in [bert-base-uncased, ]")
    parser.add_argument(
        '--output_dir',
        default='../out',
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    parser.add_argument(
        "--skip",
        default=20,
        type=int,
        help="Evaluate one testing point every skip testing point.")
    parser.add_argument("--num_random_sample",
                        default=5000,
                        type=int,
                        help="The number of random samples of each texts.")
    parser.add_argument("--similarity_threshold",
                        default=0.8,
                        type=float,
                        help="The similarity constraint to be "
                        "considered as synonym.")
    parser.add_argument("--perturbation_constraint",
                        default=100,
                        type=int,
                        help="The maximum size of perturbation "
                        "set of each word.")
    parser.add_argument(
        "--mc_error",
        default=0.01,
        type=float,
        help="Monte Carlo Error based on concentration inequality.")
    parser.add_argument("--train_type",
                        default='normal',
                        type=str,
                        help="Train type is selected in [normal, rs].")
    # other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--ckpt",
                        default=-1,
                        type=int,
                        help="Which ckpt to load.")
    parser.add_argument("--seed",
                        default=42,
                        type=int,
                        help="Random seed for initializaiton.")

    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("model type: %s, task name: %s, device: %s, train_type: %s",
                   args.model_type, args.task_name, device, args.train_type)

    set_seed(args)
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)

    task_class = processors[args.task_name]()
    label_list = task_class.get_labels()
    num_labels = len(label_list)
    args.num_labels = num_labels
    # load vocab.
    word2index = None
    if args.model_type != 'bert':
        with open(
                args.cache_dir + '/{}_vocab_train.pkl'.format(args.task_name),
                'rb') as f:
            vocab = pickle.load(f)
        index2word = vocab['index2word']
        word2index = vocab['word2index']
        word_mat = vocab['word_mat']
        args.word_mat = word_mat
        args.vocab_size = len(index2word)

    tokenizer = None
    if args.model_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                                  do_lower_case=True)
        args.vocab_size = tokenizer.vocab_size
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
    elif args.model_type == 'bow':
        args.embed_size = 300
        args.hidden_size = 100
        model = BOWModel(word_mat,
                         n_vocab=args.vocab_size,
                         embed_size=args.embed_size,
                         hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
    elif args.model_type == 'decom_att':  # Not used
        args.embed_size = 300
        args.hidden_size = 100
        model = DecompAttentionModel(word_mat,
                                     n_vocab=args.vocab_size,
                                     embed_size=args.embed_size,
                                     hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
    elif args.model_type == 'esim':
        args.embed_size = 300
        args.hidden_size = 100
        model = ESIM(vocab_size=args.vocab_size,
                     embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size,
                     embeddings=torch.tensor(word_mat).float(),
                     padding_idx=0,
                     dropout=0.1,
                     num_classes=args.num_labels,
                     device=args.device)
    else:
        raise ValueError("Unknown model type: {}".format(args.model_type))
    model.to(device)

    similarity_threshold = args.similarity_threshold
    perturbation_constraint = args.perturbation_constraint

    perturbation_file = args.cache_dir + '/' + args.task_name + '_perturbation_constraint_pca' + str(
        similarity_threshold) + "_" + str(perturbation_constraint) + '.pkl'
    with open(perturbation_file, 'rb') as f:
        perturb = pickle.load(f)

    # random smooth
    random_smooth = WordSubstitute(perturb)
    # generate randomized data
    randomize_testset(args, random_smooth, similarity_threshold,
                      perturbation_constraint)
    # calculate total variation
    calculate_tv_perturb(args, perturb)
    # Evaluation
    if args.ckpt < 0:
        checkpoints = glob.glob(
            args.output_dir + '/{}_{}_{}_checkpoint-*'.format(
                args.train_type, args.task_name, args.model_type))
        checkpoints.sort(key=lambda x: int(x.split('-')[-1]))
        checkpoint = checkpoints[-1]
    else:
        checkpoint = os.path.join(
            args.output_dir,
            '{}_{}_{}_checkpoint-{}'.format(args.train_type, args.task_name,
                                            args.model_type, args.ckpt))
    print("Evaluation result, load model from {}".format(checkpoint))
    model = load(args, checkpoint)
    randomized_evaluate(args, model, tokenizer, word2index)
Beispiel #21
0
parser.add_argument('--max_len', type=int, default=100)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--lr', type=float, default=0.0004)
parser.add_argument('--embedding_dim', type=int, default=300)
parser.add_argument('--hidden_dim', type=int, default=300)
parser.add_argument('--dropout', type=float, default=0.5)
parser.add_argument('--weight_decay', type=float, default=0.1)
parser.add_argument('--num_epochs', type=int, default=20)
parser.add_argument('--model_path', type=str, default='./model/best.bin1')
args = parser.parse_args()
label2idx = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

train_iter, dev_iter, vocab = data_process.load_data(args, device)

# Define the model, optimizer, and loss function
net = ESIM(args, vocab)
net.to(device)
crition = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=args.lr)


# Accuracy and loss on the validation set
def val_test(net, data_iter, crition):
    acc_sum, loss_sum, n, batch_num = 0.0, 0.0, 0, 0
    net.eval()
    for batch in data_iter:
        sent1, sent2 = batch.sentence1[0], batch.sentence2[0]
        mask1 = (sent1 == 1)
        mask2 = (sent2 == 1)
        y = batch.label
        y = y.to(device)
        # Reconstructed ending (not in the original snippet); the ESIM
        # forward signature is an assumption based on the masks above.
        y_hat = net(sent1, mask1, sent2, mask2)
        loss = crition(y_hat, y)
        acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
        loss_sum += loss.item()
        n += y.shape[0]
        batch_num += 1
    return acc_sum / n, loss_sum / batch_num
Beispiel #22
0
def main(train_q1_file,
         train_q2_file,
         train_labels_file,
         dev_q1_file,
         dev_q2_file,
         dev_labels_file,
         embeddings_file,
         target_dir,
         hidden_size=128,
         dropout=0.5,
         num_classes=2,
         epochs=15,
         batch_size=64,
         lr=0.001,
         patience=5,
         max_grad_norm=10.0,
         gpu_index=0,
         checkpoint=None):

    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")

    print(20 * "=", " Preparing for training ", 20 * "=")

    # Directory where the model will be saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_q1 = np.load(train_q1_file)
    train_q2 = np.load(train_q2_file)
    train_labels = np.load(train_labels_file)
    #    train_labels = label_transformer(train_labels)

    train_data = {"q1": train_q1, "q2": train_q2, "labels": train_labels}

    train_data = QQPDataset(train_data)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    print("\t* Loading validation data...")
    dev_q1 = np.load(dev_q1_file)
    dev_q2 = np.load(dev_q2_file)
    dev_labels = np.load(dev_labels_file)
    #    dev_labels = label_transformer(dev_labels)

    dev_data = {"q1": dev_q1, "q2": dev_q2, "labels": dev_labels}

    dev_data = QQPDataset(dev_data)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)

    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = torch.tensor(np.load(embeddings_file),
                              dtype=torch.float).to(device)

    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)

    best_score = 0.0
    start_epoch = 1

    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []

    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]

        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))

        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]

    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device),
          20 * "=")

    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)

        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)

        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(
            model, dev_loader, criterion)

        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))

        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)

        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))

        # Save the model at each epoch.
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses
            }, os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
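QQPDataset wraps the three NumPy arrays loaded above so a DataLoader can serve them batch by batch. A minimal sketch is shown below; the real class may also return sequence lengths, so the returned keys are an assumption.

import torch
from torch.utils.data import Dataset


class QQPDataset(Dataset):
    # Sketch only: holds padded question pairs and their labels.
    def __init__(self, data):
        self.q1 = torch.tensor(data["q1"], dtype=torch.long)
        self.q2 = torch.tensor(data["q2"], dtype=torch.long)
        self.labels = torch.tensor(data["labels"], dtype=torch.long)

    def __len__(self):
        return self.labels.size(0)

    def __getitem__(self, index):
        return {"q1": self.q1[index],
                "q2": self.q2[index],
                "labels": self.labels[index]}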
Beispiel #23
0
def main(test_file, pretrained_file, embeddings_file, batch_size=1):
    """
    Test the ESIM model with pretrained weights on some dataset.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        vocab_size: The number of words in the vocabulary of the model
            being tested.
        embedding_dim: The size of the embeddings in the model.
        hidden_size: The size of the hidden layers in the model. Must match
            the size used during training. Defaults to 300.
        num_classes: The number of classes in the output of the model. Must
            match the value used during training. Defaults to 3.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    debug_file = open('test_debug.txt', 'w')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu")

    print(20 * "=", " Preparing for generating representations ", 20 * "=")

    checkpoint = torch.load(pretrained_file)

    # Retrieving model parameters from checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]['_word_embedding.weight'].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)

    print("\t* Loading the data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))
    print(test_data, file=debug_file)

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")

    # Load the embedding weights separately.
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl),
                                  dtype=torch.float).to(device)

    # model = ESIM(vocab_size,
    #              embedding_dim,
    #              hidden_size,
    #              num_classes=num_classes,
    #              device=device).to(device)
    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 num_classes=num_classes,
                 device=device).to(device)
    # Writing custom load_state_dict
    pretrained_dict = checkpoint["model"]
    own_state = model.state_dict()
    for i, (name, param) in enumerate(pretrained_dict.items()):
        #print(name, type(name))
        # if name is "_word_embedding.weight":
        #     print(name)
        #     continue
        # Skip the first entry (the word embedding weights), which were
        # loaded separately above and passed to the model constructor.
        if i == 0:
            continue
        if isinstance(param, Parameter):
            # backwards compatibility for serialized parameters
            param = param.data
        own_state[name].copy_(param)

    #model.load_state_dict(checkpoint["model"])

    print(
        20 * "=",
        " Loading the representations from ESIM model on device: {} ".format(
            device), 20 * "=")
    batch_time, total_time, save_rep = test(model, test_loader)
    print("-> Average batch processing time: {:.4f}s, total test time:\
 {:.4f}s,%".format(batch_time, total_time))
    with open('test_save_rep_details.txt', 'w') as file_debug:
        print('len of save_rep is ' + str(len(save_rep)), file=file_debug)
        try:
            print('save_rep sample key is ' + str(list(save_rep.keys())[0]),
                  file=file_debug)
            print('save_rep sample value is ' + str(list(save_rep.values())[0]),
                  file=file_debug)
        except IndexError:
            pass

    # Dump save_rep as a pickle file
    with open('test_nv_repr.pickle', 'wb') as handle:
        pickle.dump(save_rep, handle, protocol=pickle.HIGHEST_PROTOCOL)
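The saved representations can later be restored with a matching pickle.load call, for example:

import pickle

with open('test_nv_repr.pickle', 'rb') as handle:
    save_rep = pickle.load(handle)
print('restored', len(save_rep), 'representations')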