Example #1
    def __init__(self,
                 enc_nhids=1000,
                 dec_nhids=1000,
                 enc_embed=620,
                 dec_embed=620,
                 src_vocab_size=30000,
                 trg_vocab_size=30000,
                 **kwargs):
        self.src_lookup_table = Lookup_table(enc_embed, src_vocab_size, prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed, trg_vocab_size, prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        self.decoder = Decoder(dec_embed, dec_nhids, c_hids=enc_nhids*2, **kwargs)
        self.logistic = LogisticRegression(kwargs.get('n_out', dec_nhids), trg_vocab_size, prefix='logistic', drop_rate=kwargs['dropout'])
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + self.encoder.params + self.decoder.params \
            + self.logistic.params
        self.tparams = OrderedDict([(param.name, param) for param in self.params])
Example #2
    def __init__(self,
                 enc_nhids=1000,
                 dec_nhids=1000,
                 enc_embed=620,
                 dec_embed=620,
                 src_vocab_size=30000,
                 trg_vocab_size=30000,
                 **kwargs):
        self.lr_in = kwargs.get('n_out', dec_nhids)

        self.src_lookup_table = Lookup_table(enc_embed,
                                             src_vocab_size,
                                             prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed,
                                             trg_vocab_size,
                                             prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        # enc_nhids*2 corresponds to the last dimension of the encoded state
        self.decoder = Decoder(dec_embed,
                               dec_nhids,
                               c_hids=enc_nhids * 2,
                               **kwargs)
        # the output size of the decoder should equal lr_in if n_out is not defined
        self.logistic = LogisticRegression(self.lr_in,
                                           trg_vocab_size,
                                           prefix='logistic',
                                           **kwargs)
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + \
            self.encoder.params + self.decoder.params + self.logistic.params
        self.tparams = OrderedDict([(param.name, param)
                                    for param in self.params])
        self.use_mv = kwargs.get('use_mv', 0)
Example #3
class Predictor():
    def __init__(self):
        try:
            with open("params.json", "r", encoding='utf8') as f:
                self.params = json.loads(f.read())
        except FileNotFoundError:
            self.params = save_corpus()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        self.model = BiGRU(torch.zeros((len(self.params['tbl']) + 1, 300)),
                           MODEL_PARAMS['gru_hidden_dim'],
                           MODEL_PARAMS['gru_num_layers'],
                           len(self.params['tagset']),
                           MODEL_PARAMS['concat']).to(device)
        self.model.load_state_dict(
            torch.load('trained_model.pt',
                       map_location=lambda storage, loc: storage))
        self.model.eval()

    def predict(self, sentence):
        words = sentence.split()

        lis = []
        new_words = []
        for word in words:
            symbol = None
            if not word[-1].isalnum():
                symbol = word[-1]
                word = word[:-1]
            if word.lower() in self.params['tbl']:
                lis.append(self.params['tbl'][word.lower()])
            else:
                lis.append(0)
            new_words.append(word)
            if symbol is not None:
                if symbol in self.params['tbl']:
                    lis.append(self.params['tbl'][symbol])
                else:
                    lis.append(0)
                new_words.append(symbol)

        x = torch.LongTensor(lis).to(self.device)
        x = x.unsqueeze(0)
        y_raw = self.model(x)
        y_pred = torch.argmax(y_raw, dim=2).view(-1)
        tagged_sent = ''
        for i in range(len(y_pred)):
            tagged_sent += new_words[i]
            tagged_sent += ' '
            tagged_sent += self.params['reverse_tagset'][y_pred[i]]
            tagged_sent += ' '
        print(tagged_sent)

    def tag_lookup(self, tag):
        try:
            print('TAG:', tag)
            print('Definition:', self.params['tag_definition'][tag][0])
        except (KeyError, IndexError):
            print('Error: Tag not found.')
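A minimal driver sketch for the Predictor class above, assuming params.json and trained_model.pt exist as in the snippet; the sentence and the "NN" tag are purely illustrative values:

if __name__ == '__main__':
    # Hypothetical usage; the file names come from the snippet itself,
    # the sentence and the tag below are illustrative only.
    predictor = Predictor()
    predictor.predict("The quick brown fox jumps over the lazy dog .")
    predictor.tag_lookup("NN")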
Example #4
def train():

    glove_pretrained, dataloaders, dataset_sizes, tbl, tagset, reverse_tagset, tag_definitions = preprocess()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = BiGRU(glove_pretrained,
                MODEL_PARAMS['gru_hidden_dim'], MODEL_PARAMS['gru_num_layers'],
                len(tagset), MODEL_PARAMS['concat']).to(device)

    criterion = nn.NLLLoss(ignore_index=-1)
    optimizer = optim.Adam(net.parameters(), lr=0.001)

    train_model(device, net, dataloaders, dataset_sizes, criterion, optimizer,
                MODEL_PARAMS['num_epochs'])
    test(device, net, dataloaders['testing'])
    torch.save(net.state_dict(), 'trained_model.pt')
Example #5
    def __init__(self):
        try:
            with open("params.json", "r", encoding='utf8') as f:
                self.params = json.loads(f.read())
        except FileNotFoundError:
            self.params = save_corpus()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        self.model = BiGRU(torch.zeros((len(self.params['tbl']) + 1, 300)),
                           MODEL_PARAMS['gru_hidden_dim'],
                           MODEL_PARAMS['gru_num_layers'],
                           len(self.params['tagset']),
                           MODEL_PARAMS['concat']).to(device)
        self.model.load_state_dict(
            torch.load('trained_model.pt',
                       map_location=lambda storage, loc: storage))
        self.model.eval()
Example #6
def model_load_test(test_df,
                    vocab_file,
                    embeddings_file,
                    pretrained_file,
                    test_prediction_dir,
                    test_prediction_name,
                    mode,
                    num_labels=2,
                    max_length=50,
                    gpu_index=0,
                    batch_size=128):

    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = BiGRU(embeddings, num_labels=num_labels, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing BiGRU model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, predictions = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n"
        .format(batch_time, total_time, (accuracy * 100)))
    test_prediction = pd.DataFrame({'prediction': predictions})
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir,
                                        test_prediction_name),
                           index=False)
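A hedged call sketch for model_load_test; every path below is a placeholder rather than a file from the original project, and the mode string is an assumption about what My_Dataset expects:

import pandas as pd

# Placeholder inputs; adjust all paths and the mode string to the real data.
test_df = pd.read_csv("data/test.csv")
model_load_test(test_df,
                vocab_file="data/vocab.txt",
                embeddings_file="data/embeddings.pkl",
                pretrained_file="models/best.pth.tar",
                test_prediction_dir="output",
                test_prediction_name="test_prediction.csv",
                mode="char",  # assumed; depends on My_Dataset
                num_labels=2,
                max_length=50,
                gpu_index=0,
                batch_size=128)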
Example #7
class Translate(object):

    def __init__(self,
                 enc_nhids=1000,
                 dec_nhids=1000,
                 enc_embed=620,
                 dec_embed=620,
                 src_vocab_size=30000,
                 trg_vocab_size=30000,
                 **kwargs):
        self.src_lookup_table = Lookup_table(enc_embed, src_vocab_size, prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed, trg_vocab_size, prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        self.decoder = Decoder(dec_embed, dec_nhids, c_hids=enc_nhids*2, **kwargs)
        self.logistic = LogisticRegression(kwargs.get('n_out', dec_nhids), trg_vocab_size, prefix='logistic', drop_rate=kwargs['dropout'])
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + self.encoder.params + self.decoder.params  \
            + self.logistic.params
        self.tparams = OrderedDict([(param.name, param) for param in self.params])

    def apply(self, source, source_mask, target, target_mask, **kwargs):
        sbelow = self.src_lookup_table.apply(source)
        tbelow = self.trg_lookup_table.apply_zero_pad(target)

        s_rep = self.encoder.apply(sbelow, source_mask)
        hiddens = self.decoder.apply(tbelow, target_mask, s_rep, source_mask)

        cost_matrix = self.logistic.cost(hiddens, target, target_mask)
        self.cost = cost_matrix.sum()/target_mask.shape[1]

    def _next_prob_state(self, y, state, c, c_x):
        next_state, merge_out = self.decoder.next_state_merge(y, state, c, c_x)
        prob = self.logistic.apply(merge_out)
        return prob, next_state

    def build_sample(self):
        x = T.matrix('x', dtype='int64')
        sbelow = self.src_lookup_table.apply(x)
        ctx = self.encoder.apply(sbelow, mask=None)
        c_x = T.dot(ctx, self.decoder.Ws) + self.decoder.bs
        init_state = self.decoder.init_state(ctx)

        outs = [init_state, ctx]
        f_init = theano.function([x], outs, name='f_init')

        y = T.vector('y_sampler', dtype='int64')
        y_emb = self.trg_lookup_table.index(y)
        init_state = T.matrix('init_state', dtype='float32')
        next_probs, next_state = self._next_prob_state(y_emb, init_state, ctx, c_x)

        inps = [y, ctx, init_state]
        outs = [next_probs, next_state]
        f_next = theano.function(inps, outs, name='f_next')

        return f_init, f_next

    def savez(self, filename):
        params_value = OrderedDict([(kk, value.get_value()) for kk, value in self.tparams.iteritems()])
        numpy.savez(filename, **params_value)

    def load(self, filename):
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        for key, value in self.tparams.iteritems():
            value.set_value(params_value[key])
Example #8
        # print("Nonzeros in masks", ones)
        labels = labels.type(torch.LongTensor)
        outputs = rnn(features)
        _, predicted = torch.max(outputs, 1)
        matched = torch.eq(predicted.data, labels)
        matched = torch.mul(matched.type(torch.FloatTensor), masks)

        correct += matched.sum()
        total += masks.sum()
        # print("correct prediction: {}; total prediction: {}".format(correct, total))
        # print("sum comparison:", masks.sum(), seq_len.sum())
    accuracy = correct / total
    return accuracy


rnn = BiGRU(input_size, hidden_size, num_layers, num_classes, batch_size)
rnn = rnn.cuda() if use_cuda else rnn
print("Model loaded!!!!!!!!")

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print("Epoch {} is running".format(epoch + 1))
    for i, (name, features, masks, labels, seq_len) in enumerate(train_loader):

        features = Variable(features)
        labels = labels.view(-1)
        labels = Variable(labels.type(torch.LongTensor))
        # print("labels size: {}".format(labels.size()))
        if use_cuda:
Example #9
def train_model(args,
                train_text=None,
                train_labels=None,
                eval_text=None,
                eval_labels=None,
                tokenizer=None):
    textattack.shared.utils.set_seed(args.random_seed)

    _make_directories(args.output_dir)

    num_gpus = torch.cuda.device_count()

    # Save logger writes to file
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    train_examples_len = len(train_text)

    # label_id_len = len(train_labels)
    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: {sorted(label_set)}"
    )

    if len(train_labels) != len(train_text):
        raise ValueError(
            f"Number of train examples ({len(train_text)}) does not match number of labels ({len(train_labels)})"
        )
    if len(eval_labels) != len(eval_text):
        raise ValueError(
            f"Number of teste xamples ({len(eval_text)}) does not match number of labels ({len(eval_labels)})"
        )

    if args.model == "gru":
        textattack.shared.logger.info(
            "Loading textattack model: GRUForClassification")
        model = BiGRU()
        model.to(device)
    elif args.model == "lstm":
        textattack.shared.logger.info(
            "Loading textattack model: LSTMForClassification")
        model = BiLSTM()
        model.to(device)

    # attack_class = attack_from_args(args)
    # We are adversarial training if the user specified an attack along with
    # the training args.
    # adversarial_training = (attack_class is not None) and (not args.check_robustness)

    # multi-gpu training
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
        logger.info("Using torch.nn.DataParallel.")
    logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps) *
        args.num_train_epochs)

    if args.model == "lstm" or args.model == "cnn" or args.model == "gru":

        def need_grad(x):
            return x.requires_grad

        optimizer = torch.optim.Adam(filter(need_grad, model.parameters()),
                                     lr=args.learning_rate)
        scheduler = None
    else:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        optimizer = transformers.optimization.AdamW(
            optimizer_grouped_parameters, lr=args.learning_rate)

        scheduler = transformers.optimization.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion,
            num_training_steps=num_train_optimization_steps,
        )

    # Start Tensorboard and log hyperparams.
    from torch.utils.tensorboard import SummaryWriter

    tb_writer = SummaryWriter(args.output_dir)

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        global wandb
        wandb = textattack.shared.utils.LazyLoader("wandb", globals(), "wandb")
        wandb.init(sync_tensorboard=True)

    # Save original args to file
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    _save_args(args, args_save_path)
    logger.info(f"Wrote original training args to {args_save_path}.")

    tb_writer.add_hparams(
        {k: v
         for k, v in vars(args).items() if _is_writable_type(v)}, {})

    # Start training
    logger.info("***** Running training *****")
    # if augmenter:
    #     logger.info(f"\tNum original examples = {train_examples_len}")
    #     logger.info(f"\tNum examples after augmentation = {len(train_text)}")
    # else:
    #     logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    eval_dataloader = _make_dataloader(tokenizer, eval_text, eval_labels,
                                       args.batch_size)
    train_dataloader = _make_dataloader(tokenizer, train_text, train_labels,
                                        args.batch_size)

    global_step = 0
    tr_loss = 0

    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    def loss_backward(loss):
        if num_gpus > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    # if args.do_regression:
    #     # TODO integrate with textattack `metrics` package
    #     loss_fct = torch.nn.MSELoss()
    # else:
    #     loss_fct = torch.nn.CrossEntropyLoss()
    loss_fct = torch.nn.CrossEntropyLoss()

    for epoch in tqdm.trange(int(args.num_train_epochs),
                             desc="Epoch",
                             position=0,
                             leave=True):
        # if adversarial_training:
        #     if epoch >= args.num_clean_epochs:
        #         if (epoch - args.num_clean_epochs) % args.attack_period == 0:
        #             # only generate a new adversarial training set every args.attack_period epochs
        #             # after the clean epochs
        #             logger.info("Attacking model to generate new training set...")

        #             adv_attack_results = _generate_adversarial_examples(
        #                 model_wrapper, attack_class, list(zip(train_text, train_labels))
        #             )
        #             adv_train_text = [r.perturbed_text() for r in adv_attack_results]
        #             train_dataloader = _make_dataloader(
        #                 tokenizer, adv_train_text, train_labels, args.batch_size
        #             )
        #     else:
        #         logger.info(f"Running clean epoch {epoch+1}/{args.num_clean_epochs}")

        prog_bar = tqdm.tqdm(train_dataloader,
                             desc="Iteration",
                             position=0,
                             leave=True)

        # Use these variables to track training accuracy during classification.
        correct_predictions = 0
        total_predictions = 0
        for step, batch in enumerate(prog_bar):
            ids1, ids2, msk1, msk2, labels = batch
            # input_ids, labels = batch
            labels = labels.to(device)
            # if isinstance(input_ids, dict):
            #     ## dataloader collates dict backwards. This is a workaround to get
            #     # ids in the right shape for HuggingFace models
            #     input_ids = {
            #         k: torch.stack(v).T.to(device) for k, v in input_ids.items()
            #     }
            #     logits = model(**input_ids)[0]
            # else:

            ids1 = ids1.to(device)
            ids2 = ids2.to(device)
            msk1 = msk1.to(device)
            msk2 = msk2.to(device)
            logits = model(ids1, ids2, msk1, msk2)

            # if args.do_regression:
            #     # TODO integrate with textattack `metrics` package
            #     loss = loss_fct(logits.squeeze(), labels.squeeze())
            # else:
            loss = loss_fct(logits, labels)
            pred_labels = logits.argmax(dim=-1)
            correct_predictions += (pred_labels == labels).sum().item()
            total_predictions += len(pred_labels)

            loss = loss_backward(loss)
            tr_loss += loss.item()

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                if scheduler is not None:
                    tb_writer.add_scalar("lr",
                                         scheduler.get_last_lr()[0],
                                         global_step)
                else:
                    tb_writer.add_scalar("lr", args.learning_rate, global_step)
            if global_step > 0:
                prog_bar.set_description(f"Loss {tr_loss/global_step}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                _save_model_checkpoint(model, args.output_dir, global_step)

            # Inc step counter.
            global_step += 1

        # Print training accuracy, if we're tracking it.
        if total_predictions > 0:
            train_acc = correct_predictions / total_predictions
            logger.info(f"Train accuracy: {train_acc*100}%")
            tb_writer.add_scalar("epoch_train_score", train_acc, epoch)

        # Check accuracy after each epoch.
        # skip args.num_clean_epochs during adversarial training
        # if (not adversarial_training) or (epoch >= args.num_clean_epochs):
        if (epoch >= args.num_clean_epochs):
            eval_score = _get_eval_score(model, eval_dataloader, False)
            tb_writer.add_scalar("epoch_eval_score", eval_score, epoch)

            if args.checkpoint_every_epoch:
                _save_model_checkpoint(model, args.output_dir,
                                       args.global_step)

            logger.info(
                f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
            )
            if eval_score > args.best_eval_score:
                args.best_eval_score = eval_score
                args.best_eval_score_epoch = epoch
                args.epochs_since_best_eval_score = 0
                _save_model(model, args.output_dir, args.weights_name,
                            args.config_name)
                logger.info(
                    f"Best acc found. Saved model to {args.output_dir}.")
                _save_args(args, args_save_path)
                logger.info(f"Saved updated args to {args_save_path}")
            else:
                args.epochs_since_best_eval_score += 1
                if (args.early_stopping_epochs >
                        0) and (args.epochs_since_best_eval_score >
                                args.early_stopping_epochs):
                    logger.info(
                        f"Stopping early since it's been {args.early_stopping_epochs} steps since validation acc increased"
                    )
                    break

        if args.check_robustness:
            samples_to_attack = list(zip(eval_text, eval_labels))
            samples_to_attack = random.sample(samples_to_attack, 1000)
            adv_attack_results = _generate_adversarial_examples(
                model_wrapper, attack_class, samples_to_attack)
            attack_types = [r.__class__.__name__ for r in adv_attack_results]
            attack_types = collections.Counter(attack_types)

            adv_acc = 1 - (attack_types["SkippedAttackResult"] /
                           len(adv_attack_results))
            total_attacks = (attack_types["SuccessfulAttackResult"] +
                             attack_types["FailedAttackResult"])
            adv_succ_rate = attack_types[
                "SuccessfulAttackResult"] / total_attacks
            after_attack_acc = attack_types["FailedAttackResult"] / len(
                adv_attack_results)

            tb_writer.add_scalar("robustness_test_acc", adv_acc, global_step)
            tb_writer.add_scalar("robustness_total_attacks", total_attacks,
                                 global_step)
            tb_writer.add_scalar("robustness_attack_succ_rate", adv_succ_rate,
                                 global_step)
            tb_writer.add_scalar("robustness_after_attack_acc",
                                 after_attack_acc, global_step)

            logger.info(f"Eval after-attack accuracy: {100*after_attack_acc}%")

    # read the saved model and report its eval performance
    logger.info(
        "Finished training. Re-loading and evaluating model from disk.")
    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    model.load_state_dict(
        torch.load(os.path.join(args.output_dir, args.weights_name)))
    eval_score = _get_eval_score(model, eval_dataloader, args.do_regression)
    logger.info(
        f"Saved model {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
    )

    if args.save_last:
        _save_model(model, args.output_dir, args.weights_name,
                    args.config_name)

    # end of training, save tokenizer
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        logger.warning(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}."
        )

    # Save a little readme with model info
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)

    _save_args(args, args_save_path)
    tb_writer.close()
    logger.info(f"Wrote final training args to {args_save_path}.")
Example #10
def main(options):

    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    train, dev, test, vocab = torch.load(open(options.data_file, 'rb'),
                                         pickle_module=dill)

    batched_train, batched_train_mask, _ = utils.tensor.advanced_batchize(
        train, options.batch_size, vocab.stoi["<pad>"])
    batched_dev, batched_dev_mask, _ = utils.tensor.advanced_batchize(
        dev, options.batch_size, vocab.stoi["<pad>"])

    vocab_size = len(vocab)

    rnnlm = BiGRU(vocab_size, use_cuda=use_cuda)
    if use_cuda > 0:
        rnnlm.cuda()
    else:
        rnnlm.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer = eval("torch.optim." + options.optimizer)(rnnlm.parameters(),
                                                         options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    rnnlm.train()
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train))):
            train_batch = Variable(
                batched_train[batch_i])  # of size (seq_len, batch_size)
            train_mask = Variable(batched_train_mask[batch_i])
            train_in_mask = train_mask.view(-1)
            train_out_mask = train_mask.view(-1)
            if use_cuda:
                train_batch = train_batch.cuda()
                train_mask = train_mask.cuda()
                train_in_mask = train_in_mask.cuda()
                train_out_mask = train_out_mask.cuda()

            sys_out_batch = rnnlm(
                train_batch
            )  # (seq_len, batch_size, vocab_size) # TODO: substitute this with your module
            train_in_mask = train_in_mask.unsqueeze(1).expand(
                len(train_in_mask), vocab_size)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            train_out_batch = train_batch.view(-1)
            sys_out_batch = sys_out_batch.masked_select(train_in_mask).view(
                -1, vocab_size)
            train_out_batch = train_out_batch.masked_select(train_out_mask)
            loss = criterion(sys_out_batch, train_out_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude estimation because there might be some padding at the end
        dev_loss = 0.0
        rnnlm.eval()
        for batch_i in range(len(batched_dev)):
            dev_batch = Variable(batched_dev[batch_i], volatile=True)
            dev_mask = Variable(batched_dev_mask[batch_i], volatile=True)
            dev_in_mask = dev_mask.view(-1)
            dev_out_batch = dev_batch.view(-1)
            if use_cuda:
                dev_batch = dev_batch.cuda()
                dev_mask = dev_mask.cuda()
                dev_in_mask = dev_in_mask.cuda()
                dev_out_batch = dev_out_batch.cuda()

            sys_out_batch = rnnlm(dev_batch)
            dev_in_mask = dev_in_mask.unsqueeze(1).expand(
                len(dev_in_mask), vocab_size)
            dev_out_mask = dev_mask.view(-1)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_in_mask).view(
                -1, vocab_size)
            dev_out_batch = dev_out_batch.masked_select(dev_out_mask)
            loss = criterion(sys_out_batch, dev_out_batch)
            dev_loss += loss
        dev_avg_loss = dev_loss / len(batched_dev)
        logging.info(
            "Average loss value per instance is {0} at the end of epoch {1}".
            format(dev_avg_loss.data[0], epoch_i))

        if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
            logging.info(
                "Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})"
                .format(options.estop, last_dev_avg_loss.data[0],
                        dev_avg_loss.data[0]))
            break
        torch.save(
            rnnlm,
            open(
                options.model_file +
                ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i),
                'wb'),
            pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
Example #11
class Translate(object):
    def __init__(self,
                 enc_nhids=1000,
                 dec_nhids=1000,
                 enc_embed=620,
                 dec_embed=620,
                 src_vocab_size=30000,
                 trg_vocab_size=30000,
                 **kwargs):
        self.lr_in = kwargs.get('n_out', dec_nhids)

        self.src_lookup_table = Lookup_table(enc_embed,
                                             src_vocab_size,
                                             prefix='src_lookup_table')
        self.trg_lookup_table = Lookup_table(dec_embed,
                                             trg_vocab_size,
                                             prefix='trg_lookup_table')
        self.encoder = BiGRU(enc_embed, enc_nhids, **kwargs)
        # enc_nhids*2 corresponds to the last dimension of the encoded state
        self.decoder = Decoder(dec_embed,
                               dec_nhids,
                               c_hids=enc_nhids * 2,
                               **kwargs)
        # the output size of the decoder should equal lr_in if n_out is not defined
        self.logistic = LogisticRegression(self.lr_in,
                                           trg_vocab_size,
                                           prefix='logistic',
                                           **kwargs)
        self.params = self.src_lookup_table.params + self.trg_lookup_table.params + \
            self.encoder.params + self.decoder.params + self.logistic.params
        self.tparams = OrderedDict([(param.name, param)
                                    for param in self.params])
        self.use_mv = kwargs.get('use_mv', 0)

    def apply(self,
              source,
              source_mask,
              target,
              target_mask,
              v_part=None,
              v_true=None,
              **kwargs):
        # sbelow and tbelow are 3-D tensors; sbelow[i][j] (tbelow[i][j]) is the embedding of the i-th word in the j-th sentence of the batch
        # here, source and source_mask: shape(src_sent_len * batch_size)
        # target and target_mask: shape(trg_sent_len * batch_size)
        # their types are all theano.tensor.var.TensorVariable (wrapping numpy.ndarray)
        # (40,28,620) = (src_sent_len, batch_size, srcw_embsz)
        sbelow = self.src_lookup_table.apply(source)
        # the shape is different from source, (trg_sent_len-1, batch_size,
        # trgw_embsz)
        tbelow = self.trg_lookup_table.apply_zero_pad(target)

        # (src_sent_len, batch_size, src_nhids*2): bidirectional encode source sentence
        s_rep = self.encoder.apply(sbelow, source_mask)
        # remove the last word ('</S>') of each sentence in the batch; the padding words are also </S> (index 29999)
        # tbelow[:-1] -> shape(trg_sent_len-1, batch_size, trgw_embsz)
        # target_mask[:-1] -> shape(trg_sent_len-1, batch_size)
        # hiddens, s, a, ss, als = self.decoder.apply(tbelow[:-1], target_mask[:-1], s_rep, source_mask)
        hiddens = self.decoder.apply(tbelow, target_mask, s_rep, source_mask)
        # hiddens from decoder: shape(trg_sent_len-1, batch_size, n_out)
        # (padding words all 0)
        self.mean_cost, self.mean_abs_log_norm = self.logistic.cost(
            hiddens, target, target_mask, v_part, v_true)

        # cost_matrix: shape((trg_sent_len-1), batch_size), here the trg_sent_len corresponds to this batch,
        # trg_sent_len may differ between different batches
        # cost_matrix.sum(): sum of all the elements in cost_matrix
        # target_mask.shape[1]: the number of sentences in a batch
        # so, cost_matrix.sum()/target_mask.shape[1] is actually the average cross
        # entropy per sentence in a batch

    '''
    y_emb_im1: (trgw_embsz,)
    t_stat_im1: (batch_size, trg_nhids)
    ctx: (src_sent_len, batch_size, src_nhids*2)
    c_x: (src_sent_len, batch_size, trg_nhids)
    '''

    def build_sample(self):
        x = T.matrix('x', dtype='int64')
        sbelow = self.src_lookup_table.apply(x)
        mask = T.alloc(numpy.float32(1.), sbelow.shape[0], sbelow.shape[1])
        # (src_sent_len, batch_size, src_nhids*2) batch_size == 1 for decoding
        ctx = self.encoder.apply(sbelow, mask)
        # self.decoder.Ws: (src_nhids*2, trg_nhids)
        # self.decocer.bs: (trg_nhids, )
        # (src_sent_len, batch_size, trg_nhids) (-1 ~ 1)
        # as long as ctx is passed as a parameter, c_x is not strictly needed;
        # it can be recomputed from ctx
        c_x = T.dot(ctx, self.decoder.Ws) + self.decoder.bs
        # init_state: (batch_size, trg_nhids)
        init_state = self.decoder.init_state(
            ctx)  # no mask here, because no batch
        f_init = theano.function([x], [init_state, ctx, c_x], name='f_init')

        #--------------------------------------------------------------
        y_im1 = T.vector('y_sampler', dtype='int64')
        y_emb_im1 = self.trg_lookup_table.index(y_im1)
        f_emb = theano.function([y_im1], y_emb_im1, name='f_emb')

        #t_yemb_im1 = T.tensor3('t_yemb_im1', dtype='float32')
        t_yemb_im1 = T.matrix('t_yemb_im1', dtype='float32')
        t_stat_im1 = T.matrix('t_stat_im1', dtype='float32')

        #--------------------------------------------------------------
        # get next state h1: h_i = rnn(y_{i-1}, s_{i-1})
        # y_emb_im1: embedding of one target word, shape(1, trgw_embsz)
        hi = self.decoder._step_forward(x_t=t_yemb_im1,
                                        x_m=None,
                                        h_tm1=t_stat_im1)
        f_nh = theano.function([t_yemb_im1, t_stat_im1], hi, name='f_nh')

        #--------------------------------------------------------------
        t_hi = T.matrix('t_hi', dtype='float32')
        t_ctx = T.tensor3('t_ctx', dtype='float32')
        t_c_x = T.tensor3('t_c_x', dtype='float32')
        # next attention: a_i = a(h_i, c_i); c_i does not actually change ...
        pi, ai = self.decoder.attention_layer.apply(source_ctx=t_ctx,
                                                    source_mask=None,
                                                    source_x=t_c_x,
                                                    cur_hidden=t_hi)
        f_na = theano.function([t_ctx, t_c_x, t_hi], [pi, ai], name='f_na')

        #--------------------------------------------------------------
        # get next final state, s_i = f(h_i<=(y_{i-1} and s_{i-1}), y_{i-1},
        # c_i)
        t_ai = T.matrix('t_ai', dtype='float32')
        ns = self.decoder.state_with_attend(h1=t_hi, attended=t_ai)
        f_ns = theano.function([t_hi, t_ai], ns, name='f_ns')

        #--------------------------------------------------------------
        # merge_out = g(y_{i-1}, s_i, a_i)
        t_si = T.matrix('t_si', dtype='float32')
        merge_out = self.decoder.merge_out(y_emb_im1=t_yemb_im1,
                                           s_i=t_si,
                                           a_i=t_ai)
        f_mo = theano.function([t_yemb_im1, t_ai, t_si],
                               merge_out,
                               name='f_mo')

        #--------------------------------------------------------------
        # get model score of the whole vocab: nonlinear(merge_out)
        t_mo = T.matrix('t_mo', dtype='float32')
        if self.use_mv:
            ptv = T.vector('ptv', dtype='int64')
            ptv_ins = [t_mo, ptv]
            ptv_ous = self.logistic.apply_score(t_mo, ptv, drop=True)
        else:
            ptv_ins = [t_mo]
            ptv_ous = self.logistic.apply_score(t_mo, drop=True)
        f_pws = theano.function(ptv_ins, ptv_ous, name='f_pws')

        #--------------------------------------------------------------
        # no need to use the whole vocabulary, vocabulary manipulation
        # if T.ivector() is used, this slice is very slow on CPU (reason unknown)
        y = T.wscalar('y')
        # get part model score slice: nonlinear(merge_out)[part]
        f_one = theano.function([t_mo, y],
                                self.logistic.apply_score_one(t_mo, y),
                                name='f_one')

        #--------------------------------------------------------------
        # distribution over target vocab: softmax(energy)
        t_pws = T.matrix('t_pws', dtype='float32')
        #self.logistic.apply_softmax(t_pws)
        #self.logistic.softmax(t_pws)
        f_ce = theano.function([t_pws], T.nnet.softmax(t_pws), name='f_ce')
        # next_w(y_emb_im1):    (k-dead_k,)  the last word id of each translation candidate in the beam
        # ctx:  (src_sent_len, live_k, src_nhids*2)
        # t_stat_im1:           shape(k-dead_k, trg_nhids)
        # probs:  shape(k-dead_k, trg_vocab_size)

        # f_next .................
        next_probs, next_state = self.next_prob_state(y_emb_im1, t_stat_im1,
                                                      ctx, c_x)

        inps = [y_im1, ctx, t_stat_im1]
        outs = [next_probs, next_state]
        f_next = theano.function(inps, outs, name='f_next')

        return [
            f_init, f_nh, f_na, f_ns, f_mo, f_pws, f_one, f_ce, f_next, f_emb
        ]

    def next_prob_state(self, y_emb_im1, s_im1, ctx, c_x):
        next_state, merge_out = self.decoder.next_state_mout(
            y_emb_im1, s_im1, ctx, c_x)
        prob = self.logistic.apply(merge_out)
        return prob, next_state

    def savez(self, filename):
        params_value = OrderedDict([(kk, value.get_value())
                                    for kk, value in self.tparams.iteritems()])
        numpy.savez(filename, **params_value)

    def load(self, filename):  # change all weights by file
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        for key, value in self.tparams.iteritems():
            # type(value) theano.tensor.sharedvar.TensorSharedVariable
            # params_value[key] is numpy.ndarray
            # we set the shared variable as the numpy array
            value.set_value(params_value[key])
        '''
        type(params_value['logistic_W0']: numpy.ndarray (512, 30000)
array([[-0.00096034, -0.0392303 , -0.07458289, ..., -0.00285031,
         0.03942127, -0.03161906],
       [-0.03706803, -0.06445373, -0.00836279, ..., -0.01915432,
        -0.00247126,  0.17407075],
       [-0.00102945,  0.03983303, -0.00801838, ..., -0.02834764,
         0.02834882, -0.07769781],
       ...,
       [ 0.01267207,  0.07802714, -0.02748013, ...,  0.0485581 ,
        -0.00657458,  0.07204553],
       [ 0.01089897,  0.06406539, -0.04804269, ..., -0.03247456,
         0.04343275, -0.14596273],
       [ 0.01474529,  0.02925147,  0.01569422, ...,  0.01673588,
        -0.02202134,  0.19972666]], dtype=float32)
        '''

    def load2numpy(self, filename):  # change all weights by file
        params_value = numpy.load(filename)
        assert len(params_value.files) == len(self.tparams)
        return params_value
Example #12
def model_train_validate_test(train_df,
                              dev_df,
                              test_df,
                              embeddings_file,
                              vocab_file,
                              target_dir,
                              mode,
                              num_labels=2,
                              max_length=50,
                              epochs=50,
                              batch_size=128,
                              lr=0.0005,
                              patience=5,
                              max_grad_norm=10.0,
                              gpu_index=0,
                              if_save_model=False,
                              checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model will be saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if (embeddings_file is not None):
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = BiGRU(embeddings, num_labels=num_labels, device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} trainable parameters.')
    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Filter out the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _, = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training BiGRU model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _, = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0

            if (if_save_model):
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    }, os.path.join(target_dir, "best.pth.tar"))

                print("save model succesfully!\n")

            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(
                model, test_loader, criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
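A hedged invocation sketch for model_train_validate_test; the DataFrames and file paths are placeholders, and the column layout expected by My_Dataset is not shown in the snippet:

import pandas as pd

# Placeholder inputs; adjust paths and the mode string to the actual dataset.
train_df = pd.read_csv("data/train.csv")
dev_df = pd.read_csv("data/dev.csv")
test_df = pd.read_csv("data/test.csv")
model_train_validate_test(train_df,
                          dev_df,
                          test_df,
                          embeddings_file="data/embeddings.pkl",
                          vocab_file="data/vocab.txt",
                          target_dir="models",
                          mode="char",  # assumed; depends on My_Dataset
                          num_labels=2,
                          epochs=50,
                          batch_size=128,
                          lr=0.0005,
                          if_save_model=True)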