Code Example #1
File: pretrain_lm.py Project: stepgazaille/meansum
    def run_epoch(self,
                  data_iter,
                  nbatches,
                  epoch,
                  split,
                  optimizer=None,
                  tb_writer=None):
        """

        Args:
            data_iter: Pytorch DataLoader
            nbatches: int (number of batches in data_iter)
            epoch: int
            split: str ('train', 'val')
            optimizer: Wrapped optim (e.g. OptWrapper, NoamOpt)
            tb_writer: Tensorboard SummaryWriter

        Returns:
            loss_avg: running average loss (scalar) over all forward passes in data_iter
        """

        loss_avg = 0
        n_fwds = 0
        for s_idx, (texts, ratings, metadata) in enumerate(data_iter):

            start = time.time()

            # Add special tokens to texts
            x, lengths, labels = self.dataset.prepare_batch(
                texts, ratings, doc_append_id=EDOC_ID)
            lm_iter = create_lm_data_iter(x, self.hp.lm_seq_len)
            for b_idx, batch_obj in enumerate(lm_iter):
                if optimizer:
                    optimizer.optimizer.zero_grad()

                #
                # Forward pass
                #
                if self.hp.model_type == 'mlstm':
                    # Note: lm_iter creates a sequence of length hp.lm_seq_len + 1, and batch_obj.trg is all but the
                    # last token, while batch_obj.trg_y is all but the first token. They're named as such because
                    # the Batch class was originally designed for the Encoder-Decoder version of the Transformer, and
                    # the trg variables correspond to inputs to the Decoder.
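                    # For example, given a chunk [t_0, t_1, ..., t_L] of length lm_seq_len + 1:
                    #   trg   = [t_0, ..., t_{L-1}]  (inputs fed to the language model)
                    #   trg_y = [t_1, ..., t_L]      (next-token targets)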
                    batch = move_to_cuda(
                        batch_obj.trg
                    )  # it's trg because doesn't include last token
                    batch_trg = move_to_cuda(batch_obj.trg_y)
                    batch_size, seq_len = batch.size()

                    if b_idx == 0:
                        h_init, c_init = self.model.module.rnn.state0(batch_size) if self.ngpus > 1 \
                            else self.model.rnn.state0(batch_size)
                        h_init = move_to_cuda(h_init)
                        c_init = move_to_cuda(c_init)

                    # Forward steps for lstm
                    result = self.model(batch, h_init, c_init)
                    hiddens, cells, outputs = zip(
                        *result) if self.ngpus > 1 else result

                    # Calculate loss
                    loss = 0
                    batch_trg = batch_trg.transpose(
                        0, 1).contiguous()  # [seq_len, batch]
                    if self.ngpus > 1:
                        for t in range(len(outputs[0])):
                            # length ngpus list of outputs at that time step
                            loss += self.loss_fn(
                                [outputs[i][t] for i in range(len(outputs))],
                                batch_trg[t])
                    else:
                        for t in range(len(outputs)):
                            loss += self.loss_fn(outputs[t], batch_trg[t])
                    loss_value = loss.item() / self.hp.lm_seq_len

                    # We only do bptt until lm_seq_len. Copy the hidden states so that we can continue the sequence
                    if self.ngpus > 1:
                        h_init = torch.cat([
                            copy_state(hiddens[i][-1])
                            for i in range(self.ngpus)
                        ],
                                           dim=0)
                        c_init = torch.cat([
                            copy_state(cells[i][-1]) for i in range(self.ngpus)
                        ],
                                           dim=0)
                    else:
                        h_init = copy_state(hiddens[-1])
                        c_init = copy_state(cells[-1])

                elif self.hp.model_type == 'transformer':
                    # This is the decoder only version now
                    logits = self.model(move_to_cuda(batch_obj.trg),
                                        move_to_cuda(batch_obj.trg_mask))
                    # logits: [batch, seq_len, vocab]
                    loss = self.loss_fn(logits, move_to_cuda(batch_obj.trg_y))
                    loss /= move_to_cuda(batch_obj.ntokens.float(
                    ))  # normalize by number of non-pad tokens
                    loss_value = loss.item()
                    if self.ngpus > 1:
                        # With the custom DataParallel, there is no gather() and the loss is calculated per
                        # minibatch split on each GPU (see DataParallelCriterion's forward() -- the return
                        # value is divided by the number of GPUs). We simply undo that operation here.
                        # Also, note that the KLDivLoss in LabelSmoothing is already normalized by both
                        # batch and seq_len: we use size_average=False to disable automatic normalization and
                        # then normalize manually using batch.ntokens. This oddity is because
                        # KLDivLoss does not support ignore_index=PAD_ID as CrossEntropyLoss does.
                        loss_value *= len(self.opt.gpus.split(','))

                #
                # Backward pass
                #
                gn = -1.0  # dummy for val (norm can't be < 0 anyway)
                if optimizer:
                    loss.backward()
                    gn = calc_grad_norm(
                        self.model
                    )  # not actually using this, just for printing
                    optimizer.step()
                loss_avg = update_moving_avg(loss_avg, loss_value, n_fwds + 1)
                n_fwds += 1

            # Print
            print_str = 'Epoch={}, batch={}/{}, split={}, time={:.4f} --- ' \
                        'loss={:.4f}, loss_avg_so_far={:.4f}, grad_norm={:.4f}'
            if s_idx % self.opt.print_every_nbatches == 0:
                print(
                    print_str.format(epoch, s_idx, nbatches, split,
                                     time.time() - start, loss_value, loss_avg,
                                     gn))
                if tb_writer:
                    # Step for tensorboard: global steps in terms of number of reviews
                    # This accounts for runs with different batch sizes
                    step = (epoch * nbatches *
                            self.hp.batch_size) + (s_idx * self.hp.batch_size)
                    tb_writer.add_scalar('stats/loss', loss_value, step)

            # Save periodically so we don't have to wait for epoch to finish
            save_every = nbatches // 10
            if save_every != 0 and s_idx % save_every == 0:
                save_model(self.save_dir, self.model, self.optimizer, epoch,
                           self.opt, 'intermediate')

        print('Epoch={}, split={}, --- '
              'loss_avg={:.4f}'.format(epoch, split, loss_avg))

        return loss_avg
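Both run_epoch variants accumulate loss_avg (and, in the classifier version below, acc_avg) through an update_moving_avg helper that is not shown on this page. A minimal sketch of an incremental running mean consistent with how it is called above, where n is the 1-indexed count of values seen so far (the project's actual helper may differ):

def update_moving_avg(avg_so_far, new_value, n):
    # Incremental running mean: after n values, avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n
    return avg_so_far + (new_value - avg_so_far) / n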
Code Example #2
    def run_epoch(self, data_iter, nbatches, epoch, split, optimizer=None, tb_writer=None, save_intermediate=True):
        """

        Args:
            data_iter: iterable providing minibatches
            nbatches: int (number of batches in data_iter)
            epoch: int
            split: str ('train', 'val')
            optimizer: Wrapped optim (e.g. OptWrapper)
            tb_writer: Tensorboard SummaryWriter
            save_intermediate: boolean (save intermediate checkpoints)

        Returns:
            loss_avg: running average loss (scalar) over data_iter
            acc_avg: running average classification accuracy
            rating_diff_avg: running average (true - predicted) rating difference (only updated when clf_mse)
            per_rating_acc: dict mapping each rating to its running accuracy
        """

        loss_avg = 0
        acc_avg = 0
        rating_diff_avg = 0

        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)

        for s, batch in enumerate(data_iter):
            start = time.time()
            if optimizer:
                optimizer.optimizer.zero_grad()

            texts, ratings, metadata = batch
            batch_size = len(texts)
            x, lengths, labels = self.dataset.prepare_batch(texts, ratings)

            #
            # Forward pass
            #
            logits = self.model(x)
            if self.hp.clf_mse:
                logits = logits.squeeze(1)  # [batch, 1] -> [batch]
                loss = self.loss_fn(logits, labels.float())
            else:
                loss = self.loss_fn(logits, labels)
            loss_value = loss.item()
            acc = calc_clf_acc(logits, labels).item()

            #
            # Backward pass
            #
            gn = -1.0  # dummy for val (norm can't be < 0 anyway)
            if optimizer:
                loss.backward()
                gn = calc_grad_norm(self.model)  # not actually using this, just for printing
                optimizer.step()

            #
            # Print etc.
            #
            loss_avg = update_moving_avg(loss_avg, loss_value, s + 1)
            acc_avg = update_moving_avg(acc_avg, acc, s + 1)
            print_str = 'Epoch={}, batch={}/{}, split={}, time={:.4f} --- ' \
                        'loss={:.4f}, loss_avg_so_far={:.4f}, acc={:.4f}, acc_avg_so_far={:.4f}, grad_norm={:.4f}'

            if self.hp.clf_mse:
                rating_diff = (labels - logits.round().long()).float().mean()
                rating_diff_avg = update_moving_avg(rating_diff_avg, rating_diff, s + 1)
                print_str += ', rating_diff={:.4f}, rating_diff_avg_so_far={:.4f}'.format(rating_diff, rating_diff_avg)

                true_ratings = labels + 1
                pred_ratings = logits.round() + 1
                probs = torch.ones(batch_size)  # dummy
                per_rating_counts, per_rating_acc = calc_per_rating_acc(pred_ratings, true_ratings,
                                                                        per_rating_counts, per_rating_acc)
            else:
                true_ratings = labels + 1
                probs, max_idxs = torch.max(F.softmax(logits, dim=1), dim=1)
                pred_ratings = max_idxs + 1
                per_rating_counts, per_rating_acc = calc_per_rating_acc(pred_ratings, true_ratings,
                                                                        per_rating_counts, per_rating_acc)

            if s % self.opt.print_every_nbatches == 0:
                print(print_str.format(
                    epoch, s, nbatches, split, time.time() - start,
                    loss_value, loss_avg, acc, acc_avg, gn
                ))
                print('Review: {}'.format(texts[0]))
                print('True rating: {}'.format(true_ratings[0]))
                print('Predicted rating: {}'.format(pred_ratings[0]))
                print('Predicted rating probability: {:.4f}'.format(probs[0]))
                print('Per rating accuracy: {}'.format(dict(per_rating_acc)))

                if tb_writer:
                    # Global steps in terms of number of items
                    # This accounts for runs with different batch sizes
                    step = (epoch * nbatches * self.hp.batch_size) + (s * self.hp.batch_size)
                    tb_writer.add_scalar('loss/batch_loss', loss_value, step)
                    tb_writer.add_scalar('loss/avg_loss', loss_avg, step)
                    tb_writer.add_scalar('acc/batch_acc', acc, step)
                    tb_writer.add_scalar('acc/avg_acc', acc_avg, step)
                    if self.hp.clf_mse:
                        tb_writer.add_scalar('rating_diff/batch_diff', rating_diff, step)
                        tb_writer.add_scalar('rating_diff/avg_diff', rating_diff_avg, step)

                    tb_writer.add_text('predictions/review', texts[0], step)
                    tb_writer.add_text('predictions/true_pred_prob',
                                       'True={}, Pred={}, Prob={:.4f}'.format(
                                           true_ratings[0], pred_ratings[0], probs[0]),
                                       step)
                    for r, acc in per_rating_acc.items():
                        tb_writer.add_scalar('acc/curavg_per_rating_acc_{}'.format(r), acc, step)


            # Save periodically so we don't have to wait for epoch to finish
            if save_intermediate:
                save_every = nbatches // 10
                if save_every != 0 and s % save_every == 0:
                    model_to_save = self.model.module if len(self.opt.gpus) > 1 else self.model
                    save_model(self.save_dir, model_to_save, self.optimizer, epoch, self.opt, 'intermediate')

        print_str = 'Epoch={}, split={}, --- ' \
                    'loss_avg={:.4f}, acc_avg={:.4f}, per_rating_acc={}'.format(
                        epoch, split, loss_avg, acc_avg, dict(per_rating_acc))
        if self.hp.clf_mse:
            print_str += ', rating_diff_avg={:.4f}'.format(rating_diff_avg)
        print(print_str)

        return loss_avg, acc_avg, rating_diff_avg, per_rating_acc
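The classifier epoch above also relies on a calc_clf_acc helper that is not reproduced here. A plausible sketch for the cross-entropy case, assuming logits of shape [batch, n_classes] and integer class labels (the clf_mse branch passes 1-D logits, so the project's real helper presumably handles that case as well):

def calc_clf_acc(logits, labels):
    # Fraction of examples whose argmax prediction matches the label; returns a 0-d tensor
    preds = logits.argmax(dim=1)
    return (preds == labels).float().mean()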
Code Example #3
File: pretrain_lm.py Project: stepgazaille/meansum
    def train(self):
        """
        Main train loop
        """
        #
        # Get data, setup
        #

        self.dataset = SummDatasetFactory.get(self.opt.dataset,
                                              '../datasets/yelp_dataset/')
        subwordenc = self.dataset.subwordenc
        train_iter = self.dataset.get_data_loader(
            split='train',
            n_docs=self.hp.n_docs,
            sample_reviews=True,
            batch_size=self.hp.batch_size,
            shuffle=True)
        train_nbatches = len(train_iter)
        val_iter = self.dataset.get_data_loader(split='val',
                                                n_docs=self.hp.n_docs,
                                                sample_reviews=False,
                                                batch_size=self.hp.batch_size,
                                                shuffle=False)
        val_nbatches = len(val_iter)

        tb_path = os.path.join(self.save_dir, 'tensorboard/')
        print('Tensorboard events will be logged to: {}'.format(tb_path))
        os.mkdir(tb_path)
        os.mkdir(tb_path + 'train/')
        os.mkdir(tb_path + 'val/')
        self.tb_tr_writer = SummaryWriter(tb_path + 'train/')
        self.tb_val_writer = SummaryWriter(tb_path + 'val/')

        #
        # Get model and loss
        #
        if len(self.opt.load_model) > 0:
            raise NotImplementedError(
                'Need to save run to same directory, handle changes in hp, etc.'
            )
            # checkpoint = torch.load(opt.load_model)
            # self.model = checkpoint['model']
        else:
            if self.hp.model_type == 'mlstm':
                embed = nn.Embedding(subwordenc.vocab_size, self.hp.emb_size)
                lstm = StackedLSTM(mLSTM,
                                   self.hp.lstm_layers,
                                   self.hp.emb_size,
                                   self.hp.hidden_size,
                                   subwordenc.vocab_size,
                                   self.hp.lstm_dropout,
                                   layer_norm=self.hp.lstm_ln)
                self.model = StackedLSTMEncoder(embed, lstm)
                self.loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)
            elif self.hp.model_type == 'transformer':
                self.model = make_model(subwordenc.vocab_size,
                                        subwordenc.vocab_size,
                                        N=self.hp.tsfr_blocks,
                                        d_model=self.hp.hidden_size,
                                        d_ff=self.hp.tsfr_ff_size,
                                        dropout=self.hp.tsfr_dropout,
                                        tie_embs=self.hp.tsfr_tie_embs,
                                        decoder_only=True)
                self.loss_fn = LabelSmoothing(
                    size=subwordenc.vocab_size,
                    smoothing=self.hp.tsfr_label_smooth)
        if torch.cuda.is_available():
            self.model.cuda()
        self.ngpus = 1
        if len(self.opt.gpus) > 1:
            self.ngpus = len(self.opt.gpus.split(','))
            self.model = DataParallelModel(self.model)
            self.loss_fn = DataParallelCriterion(self.loss_fn)

        n_params = sum([p.nelement() for p in self.model.parameters()])
        print('Number of parameters: {}'.format(n_params))

        #
        # Get optimizer
        #
        if self.hp.optim == 'normal':
            self.optimizer = OptWrapper(
                self.model, self.hp.lm_clip,
                optim.Adam(self.model.parameters(), lr=self.hp.lm_lr))
        elif self.hp.optim == 'noam':
            d_model = self.model.module.tgt_embed[0].d_model if self.ngpus > 1 else \
                self.model.tgt_embed[0].d_model
            self.optimizer = NoamOpt(
                d_model, 2, self.hp.noam_warmup,
                torch.optim.Adam(self.model.parameters(),
                                 lr=0,
                                 betas=(0.9, 0.98),
                                 eps=1e-9))

        #
        # Train epochs
        #
        for e in range(self.hp.max_nepochs):
            try:
                self.model.train()
                loss_avg = self.run_epoch(train_iter,
                                          train_nbatches,
                                          e,
                                          'train',
                                          optimizer=self.optimizer,
                                          tb_writer=self.tb_tr_writer)
                self.tb_tr_writer.add_scalar('overall_stats/loss_avg',
                                             loss_avg, e)

            except KeyboardInterrupt:
                print('Exiting from training early')

            self.model.eval()
            loss_avg = self.run_epoch(val_iter,
                                      val_nbatches,
                                      e,
                                      'val',
                                      optimizer=None)
            self.tb_val_writer.add_scalar('overall_stats/loss_avg', loss_avg,
                                          e)
            save_model(self.save_dir, self.model, self.optimizer, e, self.opt,
                       loss_avg)
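When hp.optim == 'noam', the Adam optimizer is wrapped in a NoamOpt schedule in the style of the Annotated Transformer: the learning rate rises linearly for noam_warmup steps, then decays as the inverse square root of the step count. A sketch of that rate computation, where d_model and factor correspond to the first two NoamOpt arguments above (the project's actual class may differ in detail):

def noam_rate(step, d_model, factor, warmup):
    # step is 1-indexed; lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
    return factor * (d_model ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)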
Code Example #4
    def train(self):
        """
        Main train loop
        """
        #
        # Get data, setup
        #

        # NOTE: Use n_docs=1 so we can classify one review
        self.dataset = SummDatasetFactory.get(self.opt.dataset, self.opt.dir_path)
        train_iter = self.dataset.get_data_loader(split='train', sample_reviews=True, n_docs=1,
                                                  batch_size=self.hp.batch_size, shuffle=True)
        val_iter = self.dataset.get_data_loader(split='val', sample_reviews=False, n_docs=1,
                                                batch_size=self.hp.batch_size, shuffle=False)

        self.tb_tr_writer = None
        self.tb_val_writer = None
        tb_path = os.path.join(self.save_dir, 'tensorboard/')
        print('Tensorboard events will be logged to: {}'.format(tb_path))
        os.mkdir(tb_path)
        os.mkdir(tb_path + 'train/')
        os.mkdir(tb_path + 'val/')
        self.tb_tr_writer = SummaryWriter(tb_path + 'train/')
        self.tb_val_writer = SummaryWriter(tb_path + 'val/')

        #
        # Get model and loss
        #
        if len(self.opt.load_train_model) > 0:
            raise NotImplementedError('Need to save run to same directory, handle changes in hp, etc.')
            # checkpoint = torch.load(opt.load_model)
            # self.model = checkpoint['model']
        else:
            if self.hp.model_type == 'cnn':
                cnn_output_size = self.hp.cnn_n_feat_maps * len(self.hp.cnn_filter_sizes)
                self.model = TextClassifier(self.dataset.subwordenc.vocab_size, self.hp.emb_size,
                                            self.hp.cnn_filter_sizes, self.hp.cnn_n_feat_maps, self.hp.cnn_dropout,
                                            cnn_output_size, self.dataset.n_ratings_labels,
                                            onehot_inputs=self.hp.clf_onehot, mse=self.hp.clf_mse)

        if self.hp.clf_mse:
            self.loss_fn = nn.MSELoss()
        else:
            self.loss_fn = nn.CrossEntropyLoss()
        if torch.cuda.is_available():
            self.model.cuda()
        if len(self.opt.gpus) > 1:
            self.model = nn.DataParallel(self.model)

        n_params = sum([p.nelement() for p in self.model.parameters()])
        print('Number of parameters: {}'.format(n_params))

        #
        # Get optimizer
        #
        self.optimizer = OptWrapper(
            self.model,
            self.hp.clf_clip,
            optim.Adam(self.model.parameters(), lr=self.hp.clf_lr))

        #
        # Train epochs
        #
        for e in range(self.hp.max_nepochs):
            try:
                self.model.train()
                loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch(
                    train_iter, len(train_iter), e, 'train',
                    optimizer=self.optimizer, tb_writer=self.tb_tr_writer)
                self.tb_tr_writer.add_scalar('overall/loss', loss_avg, e)
                self.tb_tr_writer.add_scalar('overall/acc', acc_avg, e)
                self.tb_tr_writer.add_scalar('overall/rating_diff', rating_diff_avg, e)
                for r, acc in per_rating_acc.items():
                    self.tb_tr_writer.add_scalar('overall/per_rating_acc_{}_stars'.format(r), acc, e)
            except KeyboardInterrupt:
                print('Exiting from training early')

            self.model.eval()
            loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch(
                val_iter, len(val_iter), e, 'val', optimizer=None)
            self.tb_val_writer.add_scalar('overall/loss', loss_avg, e)
            self.tb_val_writer.add_scalar('overall/acc', acc_avg, e)
            self.tb_val_writer.add_scalar('overall/rating_diff', rating_diff_avg, e)
            for r, acc in per_rating_acc.items():
                self.tb_val_writer.add_scalar('overall/per_rating_acc_{}'.format(r), acc, e)
            fn_str = 'l{:.4f}_a{:.4f}_d{:.4f}'.format(loss_avg, acc_avg, rating_diff_avg)
            model_to_save = self.model.module if len(self.opt.gpus) > 1 else self.model
            save_model(self.save_dir, model_to_save, self.optimizer, e, self.opt, fn_str)
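Finally, both trainers build an OptWrapper(model, clip, optimizer) and, inside run_epoch, call optimizer.optimizer.zero_grad() and optimizer.step(). A minimal sketch consistent with that usage, assuming the wrapper simply clips gradients before delegating to the wrapped optimizer (the project's real implementation may do more, e.g. learning-rate bookkeeping):

import torch.nn as nn


class OptWrapper:
    def __init__(self, model, clip, optimizer):
        self.model = model          # module whose gradients are clipped
        self.clip = clip            # maximum global gradient norm
        self.optimizer = optimizer  # underlying torch.optim optimizer

    def step(self):
        # Clip gradients to a maximum global norm, then take an optimization step
        nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
        self.optimizer.step()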