Ejemplo n.º 1
0
def train():
    """Train the BiGRU tagger end-to-end: preprocess, fit, evaluate, save weights."""
    (glove_pretrained, dataloaders, dataset_sizes, tbl, tagset,
     reverse_tagset, tag_definitions) = preprocess()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BiGRU(glove_pretrained,
                  MODEL_PARAMS['gru_hidden_dim'],
                  MODEL_PARAMS['gru_num_layers'],
                  len(tagset),
                  MODEL_PARAMS['concat']).to(device)

    # Targets equal to -1 are excluded from the loss (presumably padding).
    loss_fn = nn.NLLLoss(ignore_index=-1)
    adam = optim.Adam(model.parameters(), lr=0.001)

    train_model(device, model, dataloaders, dataset_sizes, loss_fn, adam,
                MODEL_PARAMS['num_epochs'])
    test(device, model, dataloaders['testing'])
    # Persist only the learned weights, not the whole module object.
    torch.save(model.state_dict(), 'trained_model.pt')
Ejemplo n.º 2
0
        matched = torch.mul(matched.type(torch.FloatTensor), masks)

        correct += matched.sum()
        total += masks.sum()
        # print("correct prediction: {}; total prediction: {}".format(correct, total))
        # print("sum comparison:", masks.sum(), seq_len.sum())
    accuracy = correct / total
    return accuracy


# Build the bidirectional GRU classifier and move it to the GPU when available.
rnn = BiGRU(input_size, hidden_size, num_layers, num_classes, batch_size)
rnn = rnn.cuda() if use_cuda else rnn
print("Model loaded!!!!!!!!")

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print("Epoch {} is running".format(epoch + 1))
    for i, (name, features, masks, labels, seq_len) in enumerate(train_loader):

        features = Variable(features)
        # Flatten labels to 1-D so they align with per-timestep logits.
        labels = labels.view(-1)
        labels = Variable(labels.type(torch.LongTensor))
        # print("labels size: {}".format(labels.size()))
        if use_cuda:
            features = features.cuda()
            # BUG FIX: was `labesl.cuda()` — a NameError on any CUDA machine.
            labels = labels.cuda()
        optimizer.zero_grad()
        outputs = rnn(features)
        # print("outputs size: {}".format(outputs.size()))
Ejemplo n.º 3
0
def train_model(args,
                train_text=None,
                train_labels=None,
                eval_text=None,
                eval_labels=None,
                tokenizer=None):
    """Train a BiGRU/BiLSTM (or transformer) text-pair classifier.

    Args:
        args: Namespace of training hyperparameters and output options
            (``output_dir``, ``model``, ``batch_size``, ``num_train_epochs``,
            ...). Mutated in place to record ``num_labels`` and best-score
            bookkeeping.
        train_text / train_labels: parallel sequences of training examples.
        eval_text / eval_labels: parallel sequences of evaluation examples.
        tokenizer: tokenizer forwarded to ``_make_dataloader``; saved to
            ``args.output_dir`` at the end of training when possible.

    Raises:
        ValueError: if text/label lengths disagree, or ``args.model`` is not
            a supported model name.
    """
    textattack.shared.utils.set_seed(args.random_seed)

    _make_directories(args.output_dir)

    num_gpus = torch.cuda.device_count()

    # Mirror all logger output to a file inside the output directory.
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    train_examples_len = len(train_text)

    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: {sorted(label_set)}"
    )

    if len(train_labels) != len(train_text):
        raise ValueError(
            f"Number of train examples ({len(train_text)}) does not match number of labels ({len(train_labels)})"
        )
    if len(eval_labels) != len(eval_text):
        # BUG FIX: message previously read "teste xamples".
        raise ValueError(
            f"Number of test examples ({len(eval_text)}) does not match number of labels ({len(eval_labels)})"
        )

    # NOTE(review): `device` is assumed to be a module-level global — confirm.
    if args.model == "gru":
        textattack.shared.logger.info(
            "Loading textattack model: GRUForClassification")
        model = BiGRU()
        model.to(device)
    elif args.model == "lstm":
        textattack.shared.logger.info(
            "Loading textattack model: LSTMForClassification")
        model = BiLSTM()
        model.to(device)
    else:
        # BUG FIX: previously fell through with `model` undefined and crashed
        # later with a NameError; fail fast with a clear message instead.
        raise ValueError(f"Unsupported model: {args.model}")

    # attack_class = attack_from_args(args)
    # We are adversarial training if the user specified an attack along with
    # the training args.
    # adversarial_training = (attack_class is not None) and (not args.check_robustness)

    # multi-gpu training
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
        logger.info("Using torch.nn.DataParallel.")
    logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps) *
        args.num_train_epochs)

    if args.model == "lstm" or args.model == "cnn" or args.model == "gru":

        # Only optimize parameters that actually require gradients
        # (e.g. frozen embeddings are skipped).
        def need_grad(x):
            return x.requires_grad

        optimizer = torch.optim.Adam(filter(need_grad, model.parameters()),
                                     lr=args.learning_rate)
        scheduler = None
    else:
        # Transformer fine-tuning: no weight decay on biases / LayerNorm.
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        optimizer = transformers.optimization.AdamW(
            optimizer_grouped_parameters, lr=args.learning_rate)

        # NOTE(review): `warmup_proportion` is passed directly as
        # `num_warmup_steps` — confirm the units (fraction vs. step count).
        scheduler = transformers.optimization.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion,
            num_training_steps=num_train_optimization_steps,
        )

    # Start Tensorboard and log hyperparams.
    from torch.utils.tensorboard import SummaryWriter

    tb_writer = SummaryWriter(args.output_dir)

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        global wandb
        wandb = textattack.shared.utils.LazyLoader("wandb", globals(), "wandb")
        wandb.init(sync_tensorboard=True)

    # Save original args to file
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    _save_args(args, args_save_path)
    logger.info(f"Wrote original training args to {args_save_path}.")

    tb_writer.add_hparams(
        {k: v
         for k, v in vars(args).items() if _is_writable_type(v)}, {})

    # Start training
    logger.info("***** Running training *****")
    logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    eval_dataloader = _make_dataloader(tokenizer, eval_text, eval_labels,
                                       args.batch_size)
    train_dataloader = _make_dataloader(tokenizer, train_text, train_labels,
                                        args.batch_size)

    global_step = 0
    tr_loss = 0

    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    def loss_backward(loss):
        """Average across GPUs / gradient-accumulation steps, then backprop."""
        if num_gpus > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    loss_fct = torch.nn.CrossEntropyLoss()

    for epoch in tqdm.trange(int(args.num_train_epochs),
                             desc="Epoch",
                             position=0,
                             leave=True):
        prog_bar = tqdm.tqdm(train_dataloader,
                             desc="Iteration",
                             position=0,
                             leave=True)

        # Use these variables to track training accuracy during classification.
        correct_predictions = 0
        total_predictions = 0
        for step, batch in enumerate(prog_bar):
            # Each batch is a sentence pair: token ids + attention masks.
            ids1, ids2, msk1, msk2, labels = batch
            labels = labels.to(device)

            ids1 = ids1.to(device)
            ids2 = ids2.to(device)
            msk1 = msk1.to(device)
            msk2 = msk2.to(device)
            logits = model(ids1, ids2, msk1, msk2)

            loss = loss_fct(logits, labels)
            pred_labels = logits.argmax(dim=-1)
            correct_predictions += (pred_labels == labels).sum().item()
            total_predictions += len(pred_labels)

            loss = loss_backward(loss)
            tr_loss += loss.item()

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                if scheduler is not None:
                    tb_writer.add_scalar("lr",
                                         scheduler.get_last_lr()[0],
                                         global_step)
                else:
                    tb_writer.add_scalar("lr", args.learning_rate, global_step)
            if global_step > 0:
                prog_bar.set_description(f"Loss {tr_loss/global_step}")
            # Only step the optimizer every grad_accum_steps batches.
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                _save_model_checkpoint(model, args.output_dir, global_step)

            # Inc step counter.
            global_step += 1

        # Print training accuracy, if we're tracking it.
        if total_predictions > 0:
            train_acc = correct_predictions / total_predictions
            logger.info(f"Train accuracy: {train_acc*100}%")
            tb_writer.add_scalar("epoch_train_score", train_acc, epoch)

        # Check accuracy after each epoch.
        # skip args.num_clean_epochs during adversarial training
        if (epoch >= args.num_clean_epochs):
            eval_score = _get_eval_score(model, eval_dataloader, False)
            tb_writer.add_scalar("epoch_eval_score", eval_score, epoch)

            if args.checkpoint_every_epoch:
                # BUG FIX: was `args.global_step` (AttributeError) — use the
                # local step counter instead.
                _save_model_checkpoint(model, args.output_dir, global_step)

            logger.info(
                f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
            )
            if eval_score > args.best_eval_score:
                args.best_eval_score = eval_score
                args.best_eval_score_epoch = epoch
                args.epochs_since_best_eval_score = 0
                _save_model(model, args.output_dir, args.weights_name,
                            args.config_name)
                logger.info(
                    f"Best acc found. Saved model to {args.output_dir}.")
                _save_args(args, args_save_path)
                logger.info(f"Saved updated args to {args_save_path}")
            else:
                args.epochs_since_best_eval_score += 1
                if (args.early_stopping_epochs >
                        0) and (args.epochs_since_best_eval_score >
                                args.early_stopping_epochs):
                    logger.info(
                        f"Stopping early since it's been {args.early_stopping_epochs} steps since validation acc increased"
                    )
                    break

        if args.check_robustness:
            # NOTE(review): `model_wrapper` and `attack_class` are only defined
            # in the commented-out adversarial-training code above, so this
            # branch raises NameError if enabled — confirm before use.
            samples_to_attack = list(zip(eval_text, eval_labels))
            samples_to_attack = random.sample(samples_to_attack, 1000)
            adv_attack_results = _generate_adversarial_examples(
                model_wrapper, attack_class, samples_to_attack)
            attack_types = [r.__class__.__name__ for r in adv_attack_results]
            attack_types = collections.Counter(attack_types)

            adv_acc = 1 - (attack_types["SkippedAttackResult"] /
                           len(adv_attack_results))
            total_attacks = (attack_types["SuccessfulAttackResult"] +
                             attack_types["FailedAttackResult"])
            adv_succ_rate = attack_types[
                "SuccessfulAttackResult"] / total_attacks
            after_attack_acc = attack_types["FailedAttackResult"] / len(
                adv_attack_results)

            tb_writer.add_scalar("robustness_test_acc", adv_acc, global_step)
            tb_writer.add_scalar("robustness_total_attacks", total_attacks,
                                 global_step)
            tb_writer.add_scalar("robustness_attack_succ_rate", adv_succ_rate,
                                 global_step)
            tb_writer.add_scalar("robustness_after_attack_acc",
                                 after_attack_acc, global_step)

            logger.info(f"Eval after-attack accuracy: {100*after_attack_acc}%")

    # read the saved model and report its eval performance
    logger.info(
        "Finished training. Re-loading and evaluating model from disk.")
    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    model.load_state_dict(
        torch.load(os.path.join(args.output_dir, args.weights_name)))
    eval_score = _get_eval_score(model, eval_dataloader, args.do_regression)
    logger.info(
        f"Saved model {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
    )

    if args.save_last:
        _save_model(model, args.output_dir, args.weights_name,
                    args.config_name)

    # end of training, save tokenizer
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        # Tokenizer lacks save_pretrained (e.g. non-HuggingFace tokenizer);
        # logger.warn is deprecated — use warning().
        logger.warning(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}."
        )

    # Save a little readme with model info
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)

    _save_args(args, args_save_path)
    tb_writer.close()
    logger.info(f"Wrote final training args to {args_save_path}.")
Ejemplo n.º 4
0
def main(options):
    """Train a BiGRU language model with dev-loss early stopping.

    Args:
        options: parsed CLI options (gpuid, data_file, batch_size, optimizer,
            learning_rate, epochs, estop, model_file).
    """
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    # The dataset was pickled with dill, so unpickle with it too.
    train, dev, test, vocab = torch.load(open(options.data_file, 'rb'),
                                         pickle_module=dill)

    batched_train, batched_train_mask, _ = utils.tensor.advanced_batchize(
        train, options.batch_size, vocab.stoi["<pad>"])
    batched_dev, batched_dev_mask, _ = utils.tensor.advanced_batchize(
        dev, options.batch_size, vocab.stoi["<pad>"])

    vocab_size = len(vocab)

    rnnlm = BiGRU(vocab_size, use_cuda=use_cuda)
    if use_cuda > 0:
        rnnlm.cuda()
    else:
        rnnlm.cpu()

    criterion = torch.nn.NLLLoss()
    # NOTE(review): eval() on a CLI-supplied string is acceptable for a trusted
    # research script but unsafe with untrusted input.
    optimizer = eval("torch.optim." + options.optimizer)(rnnlm.parameters(),
                                                         options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # BUG FIX: the model was put in eval() mode for validation but never
        # switched back, so epochs after the first trained in eval mode.
        # Re-enable training mode at the start of every epoch.
        rnnlm.train()
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train))):
            train_batch = Variable(
                batched_train[batch_i])  # of size (seq_len, batch_size)
            train_mask = Variable(batched_train_mask[batch_i])
            train_in_mask = train_mask.view(-1)
            train_out_mask = train_mask.view(-1)
            if use_cuda:
                train_batch = train_batch.cuda()
                train_mask = train_mask.cuda()
                train_in_mask = train_in_mask.cuda()
                train_out_mask = train_out_mask.cuda()

            sys_out_batch = rnnlm(
                train_batch
            )  # (seq_len, batch_size, vocab_size) # TODO: substitute this with your module
            # Expand the flat mask across the vocab dimension so masked_select
            # keeps whole rows of logits.
            train_in_mask = train_in_mask.unsqueeze(1).expand(
                len(train_in_mask), vocab_size)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            train_out_batch = train_batch.view(-1)
            sys_out_batch = sys_out_batch.masked_select(train_in_mask).view(
                -1, vocab_size)
            train_out_batch = train_out_batch.masked_select(train_out_mask)
            loss = criterion(sys_out_batch, train_out_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude esitmation because there might be some paddings at the end
        dev_loss = 0.0
        rnnlm.eval()
        for batch_i in range(len(batched_dev)):
            dev_batch = Variable(batched_dev[batch_i], volatile=True)
            dev_mask = Variable(batched_dev_mask[batch_i], volatile=True)
            dev_in_mask = dev_mask.view(-1)
            dev_out_batch = dev_batch.view(-1)
            if use_cuda:
                dev_batch = dev_batch.cuda()
                dev_mask = dev_mask.cuda()
                dev_in_mask = dev_in_mask.cuda()
                dev_out_batch = dev_out_batch.cuda()

            sys_out_batch = rnnlm(dev_batch)
            dev_in_mask = dev_in_mask.unsqueeze(1).expand(
                len(dev_in_mask), vocab_size)
            dev_out_mask = dev_mask.view(-1)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_in_mask).view(
                -1, vocab_size)
            dev_out_batch = dev_out_batch.masked_select(dev_out_mask)
            loss = criterion(sys_out_batch, dev_out_batch)
            dev_loss += loss
        dev_avg_loss = dev_loss / len(batched_dev)
        logging.info(
            "Average loss value per instance is {0} at the end of epoch {1}".
            format(dev_avg_loss.data[0], epoch_i))

        if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
            # BUG FIX: the "threshold" placeholder was previously formatted
            # with epoch_i instead of options.estop.
            logging.info(
                "Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})"
                .format(options.estop, last_dev_avg_loss.data[0],
                        dev_avg_loss.data[0]))
            break
        torch.save(
            rnnlm,
            open(
                options.model_file +
                ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i),
                'wb'),
            pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
Ejemplo n.º 5
0
def model_train_validate_test(train_df,
                              dev_df,
                              test_df,
                              embeddings_file,
                              vocab_file,
                              target_dir,
                              mode,
                              num_labels=2,
                              max_length=50,
                              epochs=50,
                              batch_size=128,
                              lr=0.0005,
                              patience=5,
                              max_grad_norm=10.0,
                              gpu_index=0,
                              if_save_model=False,
                              checkpoint=None):
    """Train a BiGRU classifier with validation-based early stopping.

    Loads train/dev/test DataFrames, trains for up to ``epochs`` epochs,
    checkpoints the best model on validation accuracy, and writes test-set
    predictions each time a new best score is reached.

    Args:
        train_df / dev_df / test_df: DataFrames consumed by ``My_Dataset``.
        embeddings_file: optional path to pretrained embeddings; when None
            the model is built without pretrained embeddings.
        vocab_file: vocabulary file for tokenization.
        target_dir: directory for checkpoints and test predictions.
        mode: forwarded to ``My_Dataset`` (semantics defined there).
        num_labels: number of output classes.
        max_length: maximum sequence length.
        epochs / batch_size / lr: standard training hyperparameters.
        patience: epochs without validation improvement before stopping.
        max_grad_norm: gradient clipping norm passed to ``train``.
        gpu_index: CUDA device index used when CUDA is available.
        if_save_model: whether to write a checkpoint on each new best score.
        checkpoint: optional path to a checkpoint to resume from.
    """
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints will be saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if (embeddings_file is not None):
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = BiGRU(embeddings, num_labels=num_labels, device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # mode="max": step on plateaus of validation *accuracy*, not loss.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _, = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training BiGRU model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _, = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0

            if (if_save_model):
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    }, os.path.join(target_dir, "best.pth.tar"))

                print("save model succesfully!\n")

            # NOTE(review): unlike the train/valid prints above, this value is
            # not multiplied by 100 — confirm the intended scale.
            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(
                model, test_loader, criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break