Example 1
def calc_per_rating_acc(pred_ratings, true_ratings, per_rating_counts,
                        per_rating_acc):
    """
    Calculate the accuracy of each star rating

    Args:
        pred_ratings: 1D Tensor (e.g. batch_size)
        true_ratings: 1D Tensor (e.g. batch_size)
        per_rating_counts: dict: rating to int
        per_rating_acc: dict: rating to float

    Returns:
        Updated per_rating_counts and per_rating_acc
    """
    for b_idx in range(true_ratings.size(0)):
        true_rating = true_ratings[b_idx].item()
        pred_rating = pred_ratings[b_idx].item()
        per_rating_counts[true_rating] += 1
        avg_so_far = per_rating_acc[true_rating]
        item_acc = true_rating == pred_rating
        rating_count = per_rating_counts[true_rating]
        per_rating_acc[true_rating] = update_moving_avg(
            avg_so_far, item_acc, rating_count)

    return per_rating_counts, per_rating_acc
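Every example in this listing relies on update_moving_avg, which is not shown here. Below is a minimal sketch, assuming it implements the standard incremental mean where the third argument is the 1-indexed count of values including the new one; the real helper may differ.

def update_moving_avg(avg_so_far, new_value, count):
    """Hypothetical reconstruction: running mean after folding in `new_value`,
    where `count` is the number of values averaged so far (including this one)."""
    return avg_so_far + (new_value - avg_so_far) / count

if __name__ == '__main__':
    # Averaging 2.0, 4.0, 9.0 one value at a time gives the batch mean, 5.0
    avg = 0.0
    for n, value in enumerate([2.0, 4.0, 9.0], start=1):
        avg = update_moving_avg(avg, value, n)
    assert avg == 5.0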
Example 2
    def extractive_baseline(self, data_iter, clf_model=None):
        """
        Run an extractive method
        """
        evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                use_stemmer=self.hp.use_stemmer,
                                store_all=True)
        summarizer = CentroidW2VSummarizer(WORD2VEC_PATH,
                                           length_limit=2,
                                           topic_threshold=0.3,
                                           sim_threshold=0.95,
                                           reordering=True,
                                           subtract_centroid=False,
                                           keep_first=False,
                                           bow_param=0,
                                           length_param=0,
                                           position_param=0,
                                           debug=False)

        summaries = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        for i, (texts, ratings, metadata) in enumerate(data_iter):
            for j, text in enumerate(texts):
                # texts is a list of length batch_size
                # each item in texts is a str, i.e. n_docs documents concatenated together
                src_docs = SummDataset.split_docs(text)
                # limit is the number of words
                # concatenate documents without the edok token
                summary = summarizer.summarize(
                    SummDataset.concat_docs(src_docs, edok_token=False),
                    limit=self.dataset.conf.extractive_max_len)
                evaluator.batch_update_avg_rouge([summary], [src_docs])
                acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                    classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                        per_rating_counts, per_rating_acc)

                if acc is None:
                    print('Summary was too short to classify')
                    pred_rating, pred_prob = None, None
                else:
                    pred_rating = pred_ratings[j].item()
                    pred_prob = pred_probs[j].item()
                    accuracy = update_moving_avg(accuracy, acc,
                                                 i * len(texts) + j + 1)

                dic = {
                    'docs': text,
                    'summary': summary,
                    'rating': ratings[j].item(),
                    'pred_rating': pred_rating,
                    'pred_prob': pred_prob
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                summaries.append(dic)

        return evaluator, summaries, accuracy.item(), per_rating_acc
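classify_summ_batch is called throughout these baselines but is not part of this listing. The sketch below is a hypothetical reconstruction inferred only from the call sites: it is assumed to tensorize the summaries with dataset.prepare_batch, classify them with clf_model, update the per-rating statistics via calc_per_rating_acc (Example 1), and return None for the accuracy when a summary is too short to classify. The min_len guard is an assumption standing in for whatever "too short" check the real helper performs.

import torch
import torch.nn.functional as F

def classify_summ_batch(clf_model, summaries, ratings, dataset,
                        per_rating_counts, per_rating_acc, min_len=5):
    """Hypothetical sketch of the helper assumed by the examples on this page."""
    # Tensorize the summaries the same way the examples prepare review batches
    x, lengths, labels = dataset.prepare_batch(summaries, ratings)
    if x.size(1) < min_len:  # assumed guard: too short to classify
        return None, per_rating_counts, per_rating_acc, None, None

    with torch.no_grad():
        logits = clf_model(x)  # [batch, n_classes]
    pred_probs, pred_idxs = torch.max(F.softmax(logits, dim=1), dim=1)
    pred_ratings = pred_idxs + 1  # class index -> star rating
    true_ratings = labels + 1
    acc = (pred_idxs == labels).float().mean()

    per_rating_counts, per_rating_acc = calc_per_rating_acc(
        pred_ratings, true_ratings, per_rating_counts, per_rating_acc)
    return acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs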
Example 3
    def update_with_evaluator(self, evaluator):
        """
        Use another EvalMetrics object to update the self.* rouge dicts. This is used
        by best_review_baseline() in run_evaluations.

        Args:
            evaluator: EvalMetrics instance
        """
        self._updates += 1  # global count

        # Update moving averages
        for stat, rouge_dict in self.get_avg_stats_dicts().items():
            src_rouge_dict = getattr(evaluator, 'avg_{}_rouges'.format(stat))
            for rouge_name, d in src_rouge_dict.items():
                for metric, score in d.items():
                    cur_score = rouge_dict[rouge_name][metric]
                    rouge_dict[rouge_name][metric] = update_moving_avg(cur_score, score, self._updates)

        # Add to lists
        for stat, rouge_dict in self.get_list_stats_dicts().items():
            src_rouge_dict = getattr(evaluator, '{}_rouges'.format(stat))
            for rouge_name, d in src_rouge_dict.items():
                for metric, scores in d.items():
                    rouge_dict[rouge_name][metric].extend(scores)
Example 4
    def run_epoch(self,
                  data_iter,
                  nbatches,
                  epoch,
                  split,
                  optimizer=None,
                  tb_writer=None):
        """

        Args:
            data_iter: Pytorch DataLoader
            nbatches: int (number of batches in data_iter)
            epoch: int
            split: str ('train', 'val')
            optimizer: Wrapped optim (e.g. OptWrapper, NoamOpt)
            tb_writer: Tensorboard SummaryWriter

        Returns:
            1D tensor containing average loss across all items in data_iter
        """

        loss_avg = 0
        n_fwds = 0
        for s_idx, (texts, ratings, metadata) in enumerate(data_iter):

            start = time.time()

            # Add special tokens to texts
            x, lengths, labels = self.dataset.prepare_batch(
                texts, ratings, doc_append_id=EDOC_ID)
            lm_iter = create_lm_data_iter(x, self.hp.lm_seq_len)
            for b_idx, batch_obj in enumerate(lm_iter):
                if optimizer:
                    optimizer.optimizer.zero_grad()

                #
                # Forward pass
                #
                if self.hp.model_type == 'mlstm':
                    # Note: create_lm_data_iter yields sequences of length hp.lm_seq_len + 1; batch_obj.trg is all but the
                    # last token, while batch_obj.trg_y is all but the first token. They're named as such because
                    # the Batch class was originally designed for the Encoder-Decoder version of the Transformer, and
                    # the trg variables correspond to inputs to the Decoder.
                    batch = move_to_cuda(batch_obj.trg)  # trg because it doesn't include the last token
                    batch_trg = move_to_cuda(batch_obj.trg_y)
                    batch_size, seq_len = batch.size()

                    if b_idx == 0:
                        h_init, c_init = self.model.module.rnn.state0(batch_size) if self.ngpus > 1 \
                            else self.model.rnn.state0(batch_size)
                        h_init = move_to_cuda(h_init)
                        c_init = move_to_cuda(c_init)

                    # Forward steps for lstm
                    result = self.model(batch, h_init, c_init)
                    hiddens, cells, outputs = zip(*result) if self.ngpus > 1 else result

                    # Calculate loss
                    loss = 0
                    batch_trg = batch_trg.transpose(0, 1).contiguous()  # [seq_len, batch]
                    if self.ngpus > 1:
                        for t in range(len(outputs[0])):
                            # length ngpus list of outputs at that time step
                            loss += self.loss_fn(
                                [outputs[i][t] for i in range(len(outputs))],
                                batch_trg[t])
                    else:
                        for t in range(len(outputs)):
                            loss += self.loss_fn(outputs[t], batch_trg[t])
                    loss_value = loss.item() / self.hp.lm_seq_len

                    # We only do bptt until lm_seq_len. Copy the hidden states so that we can continue the sequence
                    if self.ngpus > 1:
                        h_init = torch.cat(
                            [copy_state(hiddens[i][-1]) for i in range(self.ngpus)], dim=0)
                        c_init = torch.cat(
                            [copy_state(cells[i][-1]) for i in range(self.ngpus)], dim=0)
                    else:
                        h_init = copy_state(hiddens[-1])
                        c_init = copy_state(cells[-1])

                elif self.hp.model_type == 'transformer':
                    # This is the decoder only version now
                    logits = self.model(move_to_cuda(batch_obj.trg),
                                        move_to_cuda(batch_obj.trg_mask))
                    # logits: [batch, seq_len, vocab]
                    loss = self.loss_fn(logits, move_to_cuda(batch_obj.trg_y))
                    # normalize by number of non-pad tokens
                    loss /= move_to_cuda(batch_obj.ntokens.float())
                    loss_value = loss.item()
                    if self.ngpus > 1:
                        # With the custom DataParallel, there is no gather() and the loss is calculated per
                        # minibatch split on each GPU (see DataParallelCriterion's forward() -- the return
                        # value is divided by the number of GPUs). We simply undo that operation here.
                        # Also, note that the KLDivLoss in LabelSmoothing is already normalized by both
                        # batch and seq_len, as we use size_average=False to prevent any normalization followed
                        # by a manual normalization using the batch.ntokens. This oddity is because
                        # KLDivLoss does not support ignore_index=PAD_ID as CrossEntropyLoss does.
                        loss_value *= len(self.opt.gpus.split(','))

                #
                # Backward pass
                #
                gn = -1.0  # dummy for val (norm can't be < 0 anyway)
                if optimizer:
                    loss.backward()
                    gn = calc_grad_norm(self.model)  # not actually using this, just for printing
                    optimizer.step()
                loss_avg = update_moving_avg(loss_avg, loss_value, n_fwds + 1)
                n_fwds += 1

            # Print
            print_str = 'Epoch={}, batch={}/{}, split={}, time={:.4f} --- ' \
                        'loss={:.4f}, loss_avg_so_far={:.4f}, grad_norm={:.4f}'
            if s_idx % self.opt.print_every_nbatches == 0:
                print(
                    print_str.format(epoch, s_idx, nbatches, split,
                                     time.time() - start, loss_value, loss_avg,
                                     gn))
                if tb_writer:
                    # Step for tensorboard: global steps in terms of number of reviews
                    # This accounts for runs with different batch sizes
                    step = (epoch * nbatches *
                            self.hp.batch_size) + (s_idx * self.hp.batch_size)
                    tb_writer.add_scalar('stats/loss', loss_value, step)

            # Save periodically so we don't have to wait for epoch to finish
            save_every = nbatches // 10
            if save_every != 0 and s_idx % save_every == 0:
                save_model(self.save_dir, self.model, self.optimizer, epoch,
                           self.opt, 'intermediate')

        print('Epoch={}, split={}, --- '
              'loss_avg={:.4f}'.format(epoch, split, loss_avg))

        return loss_avg
Example 5
    def run_epoch(self, data_iter, nbatches, epoch, split, optimizer=None, tb_writer=None, save_intermediate=True):
        """

        Args:
            data_iter: iterable providing minibatches
            nbatches: int (number of batches in data_iter)
            epoch: int
            split: str ('train', 'val')
            optimizer: Wrapped optim (e.g. OptWrapper)
            tb_writer: Tensorboard SummaryWriter
            save_intermediate: boolean (save intermediate checkpoints)

        Returns:
            1D tensor containing average loss across all items in data_iter
        """

        loss_avg = 0
        acc_avg = 0
        rating_diff_avg = 0

        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)

        for s, batch in enumerate(data_iter):
            start = time.time()
            if optimizer:
                optimizer.optimizer.zero_grad()

            texts, ratings, metadata = batch
            batch_size = len(texts)
            x, lengths, labels = self.dataset.prepare_batch(texts, ratings)

            #
            # Forward pass
            #
            logits = self.model(x)
            if self.hp.clf_mse:
                logits = logits.squeeze(1)  # [batch, 1] -> [batch]
                loss = self.loss_fn(logits, labels.float())
            else:
                loss = self.loss_fn(logits, labels)
            loss_value = loss.item()
            acc = calc_clf_acc(logits, labels).item()

            #
            # Backward pass
            #
            gn = -1.0  # dummy for val (norm can't be < 0 anyway)
            if optimizer:
                loss.backward()
                gn = calc_grad_norm(self.model)  # not actually using this, just for printing
                optimizer.step()

            #
            # Print etc.
            #
            loss_avg = update_moving_avg(loss_avg, loss_value, s + 1)
            acc_avg = update_moving_avg(acc_avg, acc, s + 1)
            print_str = 'Epoch={}, batch={}/{}, split={}, time={:.4f} --- ' \
                        'loss={:.4f}, loss_avg_so_far={:.4f}, acc={:.4f}, acc_avg_so_far={:.4f}, grad_norm={:.4f}'

            if self.hp.clf_mse:
                rating_diff = (labels - logits.round().long()).float().mean()
                rating_diff_avg = update_moving_avg(rating_diff_avg, rating_diff, s + 1)
                print_str += ', rating_diff={:.4f}, rating_diff_avg_so_far={:.4f}'.format(rating_diff, rating_diff_avg)

                true_ratings = labels + 1
                pred_ratings = logits.round() + 1
                probs = torch.ones(batch_size)  # dummy
                per_rating_counts, per_rating_acc = calc_per_rating_acc(pred_ratings, true_ratings,
                                                                        per_rating_counts, per_rating_acc)
            else:
                true_ratings = labels + 1
                probs, max_idxs = torch.max(F.softmax(logits, dim=1), dim=1)
                pred_ratings = max_idxs + 1
                per_rating_counts, per_rating_acc = calc_per_rating_acc(pred_ratings, true_ratings,
                                                                        per_rating_counts, per_rating_acc)

            if s % self.opt.print_every_nbatches == 0:
                print(print_str.format(
                    epoch, s, nbatches, split, time.time() - start,
                    loss_value, loss_avg, acc, acc_avg, gn
                ))
                print('Review: {}'.format(texts[0]))
                print('True rating: {}'.format(true_ratings[0]))
                print('Predicted rating: {}'.format(pred_ratings[0]))
                print('Predicted rating probability: {:.4f}'.format(probs[0]))
                print('Per rating accuracy: {}'.format(dict(per_rating_acc)))

                if tb_writer:
                    # Global steps in terms of number of items
                    # This accounts for runs with different batch sizes
                    step = (epoch * nbatches * self.hp.batch_size) + (s * self.hp.batch_size)
                    tb_writer.add_scalar('loss/batch_loss', loss_value, step)
                    tb_writer.add_scalar('loss/avg_loss', loss_avg, step)
                    tb_writer.add_scalar('acc/batch_acc', acc, step)
                    tb_writer.add_scalar('acc/avg_acc', acc_avg, step)
                    if self.hp.clf_mse:
                        tb_writer.add_scalar('rating_diff/batch_diff', rating_diff, step)
                        tb_writer.add_scalar('rating_diff/avg_diff', rating_diff_avg, step)

                    tb_writer.add_text('predictions/review', texts[0], step)
                    tb_writer.add_text('predictions/true_pred_prob',
                                       'True={}, Pred={}, Prob={:.4f}'.format(
                                           true_ratings[0], pred_ratings[0], probs[0]),
                                       step)
                    for r, acc in per_rating_acc.items():
                        tb_writer.add_scalar('acc/curavg_per_rating_acc_{}'.format(r), acc, step)


            # Save periodically so we don't have to wait for epoch to finish
            if save_intermediate:
                save_every = nbatches // 10
                if save_every != 0 and s % save_every == 0:
                    model_to_save = self.model.module if len(self.opt.gpus) > 1 else self.model
                    save_model(self.save_dir, model_to_save, self.optimizer, epoch, self.opt, 'intermediate')

        print_str = 'Epoch={}, split={}, --- ' \
              'loss_avg={:.4f}, acc_avg={:.4f}, per_rating_acc={}'.format(
            epoch, split, loss_avg, acc_avg, dict(per_rating_acc))
        if self.hp.clf_mse:
            print_str += ', rating_diff_avg={:.4f}'.format(rating_diff_avg)
        print(print_str)

        return loss_avg, acc_avg, rating_diff_avg, per_rating_acc
Example 6
    def run_summarization_baseline(self, method):
        """
        Args:
            method: str ('extractive', 'ledes-<n>', 'best_review', 'lm_autoenc')

        Saves outputs to: outputs/eval/<dataset>/<n_docs>/<method>
        """
        batch_size = self.hp.batch_size if method == 'lm_autoenc' else 1
        dl = self.get_test_set_data_iter(batch_size=batch_size)

        if torch.cuda.is_available():
            clf_model = torch.load(self.opt.load_clf)['model']
        else:
            raise Exception(
                'You should run on a cuda machine to load and use the classification model'
            )

        print('\n', '=' * 50)
        print('Running {} baseline'.format(method))
        if method == 'extractive':
            evaluator, summaries, acc, per_rating_acc = self.extractive_baseline(
                dl, clf_model)
        elif 'ledes' in method:  # e.g. ledes-2
            n = int(method.split('-')[1])
            evaluator, summaries, acc, per_rating_acc = self.ledes_baseline(
                dl, n, clf_model)
        elif method == 'best_review':
            evaluator, summaries, acc, per_rating_acc = self.best_or_worst_review_baseline(
                dl, 'best', clf_model)
        elif method == 'worst_review':
            evaluator, summaries, acc, per_rating_acc = self.best_or_worst_review_baseline(
                dl, 'worst', clf_model)
        elif method == 'lm_autoenc':
            evaluator, summaries, acc, per_rating_acc = self.lm_autoenc_baseline(
                dl, clf_model)

        # Calculate NLL of summaries using fixed, pretrained LM
        pretrained_lm = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
        pretrained_lm = pretrained_lm.module if isinstance(pretrained_lm, nn.DataParallel) \
            else pretrained_lm
        avg_nll = 0.0
        loop_idx = 0
        for i in range(0, len(summaries), batch_size):
            batch_summs = summaries[i:i + batch_size]
            batch_texts = [d['summary'] for d in batch_summs]
            dummy_ratings = [
                torch.LongTensor([0]) for _ in range(len(batch_texts))
            ]
            try:
                batch_x, _, _ = self.dataset.prepare_batch(
                    batch_texts, dummy_ratings)
                nll = calc_lm_nll(pretrained_lm, batch_x)
                if not np.isnan(nll.detach().cpu().numpy()):
                    avg_nll = update_moving_avg(avg_nll, nll.item(),
                                                loop_idx + 1)
                    loop_idx += 1
                else:
                    # lm_autoenc baseline has a rare edge case where a nan is produced
                    continue
            except Exception as e:
                # worst_review in the Amazon dataset has a rare edge case
                # where the worst review is an empty string.
                # No reviews should be empty, but it appears to just be one or two reviews
                print(e)
                continue

        # Save summaries, stats, rouge scores, etc.
        dataset_dir = self.opt.dataset if self.opt.az_cat is None \
            else 'amazon_{}'.format(self.opt.az_cat)
        out_dir = os.path.join(OUTPUTS_EVAL_DIR, dataset_dir,
                               'n_docs_{}'.format(self.hp.n_docs), method)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        summs_out_fp = os.path.join(out_dir, 'summaries.json')
        save_file(summaries, summs_out_fp)
        out_fp = os.path.join(out_dir, 'stats.json')
        save_file(
            {
                'acc': acc,
                'per_rating_acc': per_rating_acc,
                'nll': avg_nll
            }, out_fp)

        print('-' * 50)
        print('Rating accuracy: ', acc)
        print('NLL: ', avg_nll)
        print('Per rating accuracy: ', dict(per_rating_acc))
        for stat, rouge_dict in evaluator.get_avg_stats_dicts().items():
            print('-' * 50)
            print(stat.upper())
            print(evaluator.to_str(rouge_dict))

            out_fp = os.path.join(out_dir, 'avg_{}-rouges.json'.format(stat))
            save_file(rouge_dict, out_fp)
            out_fp = os.path.join(out_dir, 'avg_{}-rouges.csv'.format(stat))
            evaluator.to_csv(rouge_dict, out_fp)

        out_fp = os.path.join(out_dir, '{}-rouges.pdf')
        evaluator.plot_rouge_distributions(show=self.opt.show_figs,
                                           out_fp=out_fp)
Example 7
    def run_clf_baseline(self):
        """
        Calculate the classification accuracy when the input is all the reviews concatenated together. This provides
        a sort of ceiling on how well each of the summarization methods can do, as the classification model
        is not perfect either.
        """
        print('\n', '=' * 50)
        print('Running classifier baseline')

        # Load classifier
        clf_model = torch.load(self.opt.load_clf)['model']
        clf_model = clf_model.module if isinstance(clf_model, nn.DataParallel) \
            else clf_model
        if torch.cuda.is_available():
            clf_model.cuda()
        if len(self.opt.gpus) > 1:
            clf_model = nn.DataParallel(clf_model)

        summaries = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        dl = self.get_test_set_data_iter(self.hp.batch_size)
        for i, (texts, ratings_batch, metadata) in enumerate(dl):
            summaries_batch = []
            for j, text in enumerate(texts):
                # texts is a list of length batch_size
                # each item in texts is a str, i.e. n_docs documents concatenated together
                # concatenate documents without the edok token
                src_docs = SummDataset.split_docs(text)
                summary = SummDataset.concat_docs(src_docs, edok_token=False)
                summaries_batch.append(summary)

            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, summaries_batch, ratings_batch, self.dataset,
                                    per_rating_counts, per_rating_acc)
            accuracy = update_moving_avg(accuracy, acc, i + 1)

            for j in range(len(summaries_batch)):
                dic = {
                    'docs': summaries_batch[j],
                    'rating': ratings_batch[j].item(),
                    'pred_rating': pred_ratings[j].item(),
                    'pred_prob': pred_probs[j].item()
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                summaries.append(dic)

        # Calculate NLL of summaries using fixed, pretrained LM
        pretrained_lm = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
        pretrained_lm = pretrained_lm.module if isinstance(pretrained_lm, nn.DataParallel) \
            else pretrained_lm
        avg_nll = 0.0
        batch_size = self.hp.batch_size
        for i in range(0, len(summaries), batch_size):
            batch_summs = summaries[i:i + batch_size]
            batch_texts = [d['docs'] for d in batch_summs]
            dummy_ratings = [
                torch.LongTensor([0]) for _ in range(len(batch_texts))
            ]
            batch_x, _, _ = self.dataset.prepare_batch(batch_texts,
                                                       dummy_ratings)
            nll = calc_lm_nll(pretrained_lm, batch_x)
            # i advances by batch_size, so i // batch_size + 1 is the number of batches averaged so far
            avg_nll = update_moving_avg(avg_nll, nll.item(), i // batch_size + 1)

        # Print and save accuracies, summaries, etc.
        print('NLL: ', avg_nll)
        print('Accuracy: ', accuracy.item())
        print('Per rating accuracy: ', per_rating_acc)

        dataset_dir = self.opt.dataset if self.opt.az_cat is None \
            else 'amazon_{}'.format(self.opt.az_cat)
        out_dir = os.path.join(OUTPUTS_EVAL_DIR, dataset_dir,
                               'n_docs_{}'.format(self.hp.n_docs),
                               'clf_baseline')
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_fp = os.path.join(out_dir, 'summaries.json')
        save_file(summaries, out_fp)
        out_fp = os.path.join(out_dir, 'stats.json')
        save_file(
            {
                'acc': accuracy.item(),
                'per_rating_acc': per_rating_acc,
                'nll': avg_nll
            }, out_fp)
Example 8
    def lm_autoenc_baseline(self, data_iter, clf_model=None):
        """
        Use the pretrained language model to initialize an encoder-decoder model. This is basically the
        unsupervised abstractive summarization model without training.
        """

        # Load the encoder-decoder by initializing it with the language model
        docs_enc = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
        docs_enc = docs_enc.module if isinstance(docs_enc, nn.DataParallel) else docs_enc
        summ_dec = StackedLSTMDecoder(copy.deepcopy(docs_enc.embed),
                                      copy.deepcopy(docs_enc.rnn))

        # Create Summarizer so that we can use run_epoch()
        # Copy hp and opt as we're modifying some params. This way there won't be any unexpected errors
        # if it's used by another method
        hp = copy.deepcopy(self.hp)
        hp.sum_cycle = False
        hp.autoenc_docs = False
        hp.sum_clf = False
        opt = copy.deepcopy(self.opt)
        opt.print_every_nbatches = float('inf')

        summarizer = Summarizer(hp, opt, '/tmp/')
        summarizer.tb_val_sub_writer = None
        summarizer.tau = self.hp.tau
        summarizer.ngpus = 1 if len(self.opt.gpus) == 1 else len(self.opt.gpus.split(','))
        summarizer.sum_model = torch.load(self.opt.load_lm)
        summarizer.dataset = self.dataset

        summarizer.fixed_lm = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
        summarizer.fixed_lm = summarizer.fixed_lm.module if isinstance(summarizer.fixed_lm, nn.DataParallel) \
            else summarizer.fixed_lm

        # Create SummarizationModel
        docs_autodec, combine_encs_h_net, combine_encs_c_net = None, None, None
        summ_enc, docs_dec, discrim_model, clf_model_arg, fixed_lm = None, None, None, None, None
        summarizer.sum_model = SummarizationModel(docs_enc, docs_autodec,
                                                  combine_encs_h_net,
                                                  combine_encs_c_net, summ_dec,
                                                  summ_enc, docs_dec,
                                                  discrim_model, clf_model_arg,
                                                  fixed_lm, hp, self.dataset)
        if torch.cuda.is_available():
            summarizer.sum_model.cuda()
        if summarizer.ngpus > 1:
            summarizer.sum_model = DataParallelModel(summarizer.sum_model)
        summarizer.sum_model.eval()
        with torch.no_grad():
            stats_avgs, evaluator, summaries = summarizer.run_epoch(
                data_iter,
                data_iter.__len__(),
                0,
                'test',
                store_all_rouges=True,
                store_all_summaries=True,
                save_intermediate=False,
                run_val_subset=False)

        #
        # Pass summaries through classifier
        #
        # Note: the SummarizationModel already calculates the classification accuracy
        # when sum_clf=True, so technically I could refactor it to compute everything I'd
        # like in the forward pass and add it to stats(). However, I think it's cleaner /
        # easier to just do everything I want here, especially if I add more things like
        # per-rating counts and accuracy. Plus, it's just one pass through the test set,
        # which I'll run infrequently to evaluate a trained model, so the extra time is fine.
        #
        results = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        for i, (texts, ratings_batch, metadata) in enumerate(data_iter):
            summaries_batch = summaries[i * self.hp.batch_size:
                                        i * self.hp.batch_size + len(texts)]
            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, summaries_batch, ratings_batch, self.dataset,
                                    per_rating_counts, per_rating_acc)

            if acc is None:
                print('Summary was too short to classify')
                pred_ratings = [None for _ in range(len(summaries_batch))]
                pred_probs = [None for _ in range(len(summaries_batch))]
            else:
                accuracy = update_moving_avg(accuracy, acc, i + 1)

            for j in range(len(summaries_batch)):
                dic = {
                    'docs': texts[j],
                    'summary': summaries_batch[j],
                    'rating': ratings_batch[j].item(),
                    'pred_rating': None if pred_ratings[j] is None else pred_ratings[j].item(),
                    'pred_prob': None if pred_probs[j] is None else pred_probs[j].item()
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                results.append(dic)

        return evaluator, results, accuracy.item(), per_rating_acc
Example 9
    def best_or_worst_review_baseline(self,
                                      data_iter,
                                      method='best',
                                      clf_model=None):
        """
        When summarizing n_docs reviews, calculate the average ROUGE1-F for each review as if it were the summary.
        Choose the document with the best / worst score.

        Note: it'd be far more efficient to calculate best and worst at the same time as all the rouges
        are already calculated...
        """
        evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                use_stemmer=self.hp.use_stemmer,
                                store_all=True)
        summaries = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        for i, (texts, ratings, metadata) in enumerate(data_iter):
            # texts is a list of length batch_size
            # each item in texts is a str, i.e. n_docs documents concatenated together
            for j, text in enumerate(texts):
                bw_evaluator = None
                bw_rouge1_f = 0.0 if method == 'best' else 1.0
                bw_doc = None

                # Set each document as the summary and find the best one
                src_docs = SummDataset.split_docs(text)
                for doc in src_docs:
                    cur_evaluator = EvalMetrics(
                        remove_stopwords=self.hp.remove_stopwords,
                        use_stemmer=self.hp.use_stemmer,
                        store_all=True)
                    avg_rouges, _, _, _ = cur_evaluator.batch_update_avg_rouge(
                        [doc], [src_docs])
                    is_better_worse = (method == 'best' and (avg_rouges['rouge1']['f'] >= bw_rouge1_f)) or \
                                      (method == 'worst' and (avg_rouges['rouge1']['f'] <= bw_rouge1_f))
                    if is_better_worse:
                        bw_evaluator = cur_evaluator
                        bw_rouge1_f = avg_rouges['rouge1']['f']
                        bw_doc = doc

                evaluator.update_with_evaluator(bw_evaluator)

                try:
                    acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                        classify_summ_batch(clf_model, [bw_doc], [ratings[j]], self.dataset,
                                            per_rating_counts, per_rating_acc)
                except Exception:
                    # worst_review in the Amazon dataset has a rare edge case
                    # where the worst review is an empty string.
                    # No reviews should be empty, but it appears to just be one or two reviews
                    acc, pred_ratings, pred_probs = None, None, None

                if acc is None:
                    print('Summary was too short to classify')
                    pred_rating, pred_prob = None, None
                else:
                    pred_rating = pred_ratings[j].item()
                    pred_prob = pred_probs[j].item()
                    accuracy = update_moving_avg(accuracy, acc,
                                                 i * len(texts) + j + 1)

                dic = {
                    'docs': text,
                    'summary': bw_doc,
                    'rating': ratings[j].item(),
                    'pred_rating': pred_rating,
                    'pred_prob': pred_prob
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                summaries.append(dic)

        return evaluator, summaries, accuracy.item(), per_rating_acc
Example 10
    def ledes_baseline(self, data_iter, n=1, clf_model=None):
        """
        Add up to the first n sentences from each review, stopping once the maximum review length would be exceeded
        """
        evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                use_stemmer=self.hp.use_stemmer,
                                store_all=True)
        summaries = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        for i, (texts, ratings, metadata) in enumerate(data_iter):
            # texts is a list of length batch_size
            # each item in texts is a str, i.e. n_docs documents concatenated together
            for j, text in enumerate(texts):
                src_docs = SummDataset.split_docs(text)

                summary = []
                doc_sents = [nltk.sent_tokenize(doc) for doc in src_docs]
                summary_len = 0
                doc_idx, sent_idx = 0, 0

                # Keep adding sentences as long as summary isn't over maximum length and
                # there are still sentences to add
                while (summary_len <
                       self.dataset.conf.review_max_len) and (sent_idx < n):
                    # Current document has this many sentences
                    if sent_idx < len(doc_sents[doc_idx]):
                        sent = doc_sents[doc_idx][sent_idx]
                        sent_tok_len = len(nltk.word_tokenize(sent))

                        # Adding sentence won't exceed maximum length
                        if summary_len + sent_tok_len <= self.dataset.conf.review_max_len:
                            summary.append(sent)
                            summary_len += sent_tok_len

                    # Move on to next document
                    doc_idx = (doc_idx + 1) % len(src_docs)
                    if doc_idx == 0:  # back to the first doc, all first sentences have been added
                        sent_idx += 1

                summary = ' '.join(summary)
                evaluator.batch_update_avg_rouge([summary], [src_docs])
                acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                    classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                        per_rating_counts, per_rating_acc)

                if acc is None:
                    print('Summary was too short to classify')
                    pred_rating, pred_prob = None, None
                else:
                    pred_rating = pred_ratings[j].item()
                    pred_prob = pred_probs[j].item()
                    accuracy = update_moving_avg(accuracy, acc,
                                                 i * len(texts) + j + 1)

                dic = {
                    'docs': text,
                    'summary': summary,
                    'rating': ratings[j].item(),
                    'pred_rating': pred_rating,
                    'pred_prob': pred_prob
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                summaries.append(dic)

        return evaluator, summaries, accuracy.item(), per_rating_acc
Example 11
    def batch_update_avg_rouge(self, summaries, source_docs):
        """
        Args:
            summaries: list of strs
            source_docs: list of lists of strs
        Returns: 4 (avg, min, max, std) rouge dicts for this batch
        """
        # Store average of the four statistics for this batch
        batch_avg_avg_rouges = self.get_rouge_defaultdict()
        batch_avg_min_rouges = self.get_rouge_defaultdict()
        batch_avg_max_rouges = self.get_rouge_defaultdict()
        batch_avg_std_rouges = self.get_rouge_defaultdict()

        for i, summary in enumerate(summaries):
            docs = source_docs[i]

            # Compute rouges between summary and each document
            rouges = self.get_rouge_defaultdict(list)
            for doc in docs:
                scores = self.calc_rouges(doc, summary)
                for rouge_name, rouge_obj in scores.items():  # rouge_name = rouge1, rouge2, rougeL
                    for metric in ['precision', 'recall', 'fmeasure']:
                        score = getattr(rouge_obj, metric)
                        rouges[rouge_name][metric[0]].append(score)  # [0] for first letter

            # Compute statistics and update batch and global averages
            avg_rouges = self.get_rouge_defaultdict()
            min_rouges = self.get_rouge_defaultdict()
            max_rouges = self.get_rouge_defaultdict()
            std_rouges = self.get_rouge_defaultdict()
            self._updates += 1  # global count
            for rouge_name, rouge_obj in rouges.items():
                for metric in ['precision', 'recall', 'fmeasure']:
                    scores = rouges[rouge_name][metric[0]]

                    avg, min, max, std = np.mean(scores), np.min(scores), np.max(scores), np.std(scores)
                    avg_rouges[rouge_name][metric[0]] = avg
                    min_rouges[rouge_name][metric[0]] = min
                    max_rouges[rouge_name][metric[0]] = max
                    std_rouges[rouge_name][metric[0]] = std

                    # update batch averages
                    cur_avg_avg = batch_avg_avg_rouges[rouge_name][metric[0]]
                    cur_avg_min = batch_avg_min_rouges[rouge_name][metric[0]]
                    cur_avg_max = batch_avg_max_rouges[rouge_name][metric[0]]
                    cur_avg_std = batch_avg_std_rouges[rouge_name][metric[0]]
                    batch_avg_avg_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_avg, avg, i + 1)
                    batch_avg_min_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_min, min, i + 1)
                    batch_avg_max_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_max, max, i + 1)
                    batch_avg_std_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_std, std, i + 1)

                    # update global averages
                    cur_avg_avg = self.avg_avg_rouges[rouge_name][metric[0]]
                    cur_avg_min = self.avg_min_rouges[rouge_name][metric[0]]
                    cur_avg_max = self.avg_max_rouges[rouge_name][metric[0]]
                    cur_avg_std = self.avg_std_rouges[rouge_name][metric[0]]
                    self.avg_avg_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_avg, avg, self._updates)
                    self.avg_min_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_min, min, self._updates)
                    self.avg_max_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_max, max, self._updates)
                    self.avg_std_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_std, std, self._updates)

                    # Add to dictionary storing all stats
                    if self.store_all:
                        self.avg_rouges[rouge_name][metric[0]].append(avg)
                        self.min_rouges[rouge_name][metric[0]].append(min)
                        self.max_rouges[rouge_name][metric[0]].append(max)
                        self.std_rouges[rouge_name][metric[0]].append(std)

        return batch_avg_avg_rouges, batch_avg_min_rouges, batch_avg_max_rouges, batch_avg_std_rouges