def calc_per_rating_acc(pred_ratings, true_ratings, per_rating_counts, per_rating_acc):
    """
    Calculate the accuracy for each star rating.

    Args:
        pred_ratings: 1D Tensor (e.g. batch_size)
        true_ratings: 1D Tensor (e.g. batch_size)
        per_rating_counts: dict: rating to int
        per_rating_acc: dict: rating to float

    Returns:
        Updated per_rating_counts and per_rating_acc
    """
    for b_idx in range(true_ratings.size(0)):
        true_rating, pred_rating = true_ratings[b_idx].item(), pred_ratings[b_idx].item()
        per_rating_counts[true_rating] += 1
        avg_so_far = per_rating_acc[true_rating]
        item_acc = true_rating == pred_rating
        rating_count = per_rating_counts[true_rating]
        per_rating_acc[true_rating] = update_moving_avg(avg_so_far, item_acc, rating_count)
    return per_rating_counts, per_rating_acc
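# update_moving_avg() (imported from elsewhere in the project) is used throughout to keep
# running means without storing all the individual values. It is assumed to be the standard
# incremental-mean update; a minimal sketch under that assumption, using a hypothetical name
# so it does not shadow the real helper:
def _update_moving_avg_sketch(avg_so_far, new_val, n):
    """Return the running mean after folding in the n-th value (n is 1-indexed)."""
    return avg_so_far + (new_val - avg_so_far) / float(n)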
def extractive_baseline(self, data_iter, clf_model=None):
    """
    Run an extractive summarization baseline.
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)
    summarizer = CentroidW2VSummarizer(WORD2VEC_PATH, length_limit=2,
                                       topic_threshold=0.3, sim_threshold=0.95,
                                       reordering=True, subtract_centroid=False, keep_first=False,
                                       bow_param=0, length_param=0, position_param=0,
                                       debug=False)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        for j, text in enumerate(texts):
            # texts is a list of length batch_size
            # each item in texts is a str, i.e. n_docs documents concatenated together
            src_docs = SummDataset.split_docs(text)

            # limit is the number of words
            # concatenate documents without the edok token
            summary = summarizer.summarize(
                SummDataset.concat_docs(src_docs, edok_token=False),
                limit=self.dataset.conf.extractive_max_len)
            evaluator.batch_update_avg_rouge([summary], [src_docs])

            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                    per_rating_counts, per_rating_acc)
            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                pred_rating, pred_prob = pred_ratings[j].item(), pred_probs[j].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': summary, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
def update_with_evaluator(self, evaluator):
    """
    Use another EvalMetrics object to update the self.* rouge dicts.
    This is used by best_review_baseline() in run_evaluations.

    Args:
        evaluator: EvalMetrics instance
    """
    self._updates += 1  # global count

    # Update moving averages
    for stat, rouge_dict in self.get_avg_stats_dicts().items():
        src_rouge_dict = getattr(evaluator, 'avg_{}_rouges'.format(stat))
        for rouge_name, d in src_rouge_dict.items():
            for metric, score in d.items():
                cur_score = rouge_dict[rouge_name][metric]
                rouge_dict[rouge_name][metric] = update_moving_avg(cur_score, score, self._updates)

    # Add to lists
    for stat, rouge_dict in self.get_list_stats_dicts().items():
        src_rouge_dict = getattr(evaluator, '{}_rouges'.format(stat))
        for rouge_name, d in src_rouge_dict.items():
            for metric, scores in d.items():
                rouge_dict[rouge_name][metric].extend(scores)
def run_epoch(self, data_iter, nbatches, epoch, split, optimizer=None, tb_writer=None):
    """
    Args:
        data_iter: Pytorch DataLoader
        nbatches: int (number of batches in data_iter)
        epoch: int
        split: str ('train', 'val')
        optimizer: Wrapped optim (e.g. OptWrapper, NoamOpt)
        tb_writer: Tensorboard SummaryWriter

    Returns:
        Average loss across all forward passes in data_iter
    """
    loss_avg = 0
    n_fwds = 0
    for s_idx, (texts, ratings, metadata) in enumerate(data_iter):
        start = time.time()

        # Add special tokens to texts
        x, lengths, labels = self.dataset.prepare_batch(texts, ratings, doc_append_id=EDOC_ID)
        iter = create_lm_data_iter(x, self.hp.lm_seq_len)
        for b_idx, batch_obj in enumerate(iter):
            if optimizer:
                optimizer.optimizer.zero_grad()

            #
            # Forward pass
            #
            if self.hp.model_type == 'mlstm':
                # Note: iter creates a sequence of length hp.lm_seq_len + 1, and batch_obj.trg is all but the
                # last token, while batch_obj.trg_y is all but the first token. They're named as such because
                # the Batch class was originally designed for the Encoder-Decoder version of the Transformer,
                # and the trg variables correspond to inputs to the Decoder.
                batch = move_to_cuda(batch_obj.trg)  # it's trg because it doesn't include the last token
                batch_trg = move_to_cuda(batch_obj.trg_y)
                batch_size, seq_len = batch.size()

                if b_idx == 0:
                    h_init, c_init = self.model.module.rnn.state0(batch_size) if self.ngpus > 1 \
                        else self.model.rnn.state0(batch_size)
                    h_init = move_to_cuda(h_init)
                    c_init = move_to_cuda(c_init)

                # Forward steps for lstm
                result = self.model(batch, h_init, c_init)
                hiddens, cells, outputs = zip(*result) if self.ngpus > 1 else result

                # Calculate loss
                loss = 0
                batch_trg = batch_trg.transpose(0, 1).contiguous()  # [seq_len, batch]
                if self.ngpus > 1:
                    for t in range(len(outputs[0])):
                        # length-ngpus list of outputs at that time step
                        loss += self.loss_fn([outputs[i][t] for i in range(len(outputs))], batch_trg[t])
                else:
                    for t in range(len(outputs)):
                        loss += self.loss_fn(outputs[t], batch_trg[t])
                loss_value = loss.item() / self.hp.lm_seq_len

                # We only do bptt until lm_seq_len. Copy the hidden states so that we can continue the sequence
                if self.ngpus > 1:
                    h_init = torch.cat([copy_state(hiddens[i][-1]) for i in range(self.ngpus)], dim=0)
                    c_init = torch.cat([copy_state(cells[i][-1]) for i in range(self.ngpus)], dim=0)
                else:
                    h_init = copy_state(hiddens[-1])
                    c_init = copy_state(cells[-1])

            elif self.hp.model_type == 'transformer':
                # This is the decoder-only version now
                logits = self.model(move_to_cuda(batch_obj.trg), move_to_cuda(batch_obj.trg_mask))
                # logits: [batch, seq_len, vocab]
                loss = self.loss_fn(logits, move_to_cuda(batch_obj.trg_y))
                loss /= move_to_cuda(batch_obj.ntokens.float())  # normalize by number of non-pad tokens
                loss_value = loss.item()
                if self.ngpus > 1:
                    # With the custom DataParallel, there is no gather() and the loss is calculated per
                    # minibatch split on each GPU (see DataParallelCriterion's forward() -- the return
                    # value is divided by the number of GPUs). We simply undo that operation here.
                    # Also, note that the KLDivLoss in LabelSmoothing is already normalized by both
                    # batch and seq_len, as we use size_average=False to prevent any normalization, followed
                    # by a manual normalization using batch.ntokens. This oddity is because
                    # KLDivLoss does not support ignore_index=PAD_ID as CrossEntropyLoss does.
                    loss_value *= len(self.opt.gpus.split(','))

            #
            # Backward pass
            #
            gn = -1.0  # dummy for val (norm can't be < 0 anyway)
            if optimizer:
                loss.backward()
                gn = calc_grad_norm(self.model)  # not actually using this, just for printing
                optimizer.step()

            loss_avg = update_moving_avg(loss_avg, loss_value, n_fwds + 1)
            n_fwds += 1

        # Print
        print_str = 'Epoch={}, batch={}/{}, split={}, time={:.4f} --- ' \
                    'loss={:.4f}, loss_avg_so_far={:.4f}, grad_norm={:.4f}'
        if s_idx % self.opt.print_every_nbatches == 0:
            print(print_str.format(epoch, s_idx, nbatches, split,
                                   time.time() - start, loss_value, loss_avg, gn))
            if tb_writer:
                # Step for tensorboard: global steps in terms of number of reviews
                # This accounts for runs with different batch sizes
                step = (epoch * nbatches * self.hp.batch_size) + (s_idx * self.hp.batch_size)
                tb_writer.add_scalar('stats/loss', loss_value, step)

        # Save periodically so we don't have to wait for the epoch to finish
        save_every = nbatches // 10
        if save_every != 0 and s_idx % save_every == 0:
            save_model(self.save_dir, self.model, self.optimizer, epoch, self.opt, 'intermediate')

    print('Epoch={}, split={}, --- loss_avg={:.4f}'.format(epoch, split, loss_avg))

    return loss_avg
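# copy_state() above is assumed to detach the hidden/cell states from the autograd graph so that
# backpropagation-through-time is truncated at lm_seq_len while the sequence itself continues.
# A minimal sketch under that assumption (hypothetical name so it does not shadow the project's helper):
def _copy_state_sketch(state):
    """Return a detached copy of an RNN state tensor, cutting the gradient history."""
    return state.detach().clone()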
def run_epoch(self, data_iter, nbatches, epoch, split,
              optimizer=None, tb_writer=None, save_intermediate=True):
    """
    Args:
        data_iter: iterable providing minibatches
        nbatches: int (number of batches in data_iter)
        epoch: int
        split: str ('train', 'val')
        optimizer: Wrapped optim (e.g. OptWrapper)
        tb_writer: Tensorboard SummaryWriter
        save_intermediate: boolean (save intermediate checkpoints)

    Returns:
        Average loss, average accuracy, average rating difference, and per-rating accuracy over data_iter
    """
    loss_avg = 0
    acc_avg = 0
    rating_diff_avg = 0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for s, batch in enumerate(data_iter):
        start = time.time()
        if optimizer:
            optimizer.optimizer.zero_grad()

        texts, ratings, metadata = batch
        batch_size = len(texts)
        x, lengths, labels = self.dataset.prepare_batch(texts, ratings)

        #
        # Forward pass
        #
        logits = self.model(x)
        if self.hp.clf_mse:
            logits = logits.squeeze(1)  # [batch, 1] -> [batch]
            loss = self.loss_fn(logits, labels.float())
        else:
            loss = self.loss_fn(logits, labels)
        loss_value = loss.item()
        acc = calc_clf_acc(logits, labels).item()

        #
        # Backward pass
        #
        gn = -1.0  # dummy for val (norm can't be < 0 anyway)
        if optimizer:
            loss.backward()
            gn = calc_grad_norm(self.model)  # not actually using this, just for printing
            optimizer.step()

        #
        # Print etc.
        #
        loss_avg = update_moving_avg(loss_avg, loss_value, s + 1)
        acc_avg = update_moving_avg(acc_avg, acc, s + 1)
        print_str = 'Epoch={}, batch={}/{}, split={}, time={:.4f} --- ' \
                    'loss={:.4f}, loss_avg_so_far={:.4f}, acc={:.4f}, acc_avg_so_far={:.4f}, grad_norm={:.4f}'

        if self.hp.clf_mse:
            rating_diff = (labels - logits.round().long()).float().mean()
            rating_diff_avg = update_moving_avg(rating_diff_avg, rating_diff, s + 1)
            print_str += ', rating_diff={:.4f}, rating_diff_avg_so_far={:.4f}'.format(rating_diff, rating_diff_avg)

            true_ratings = labels + 1
            pred_ratings = logits.round() + 1
            probs = torch.ones(batch_size)  # dummy
            per_rating_counts, per_rating_acc = calc_per_rating_acc(pred_ratings, true_ratings,
                                                                    per_rating_counts, per_rating_acc)
        else:
            true_ratings = labels + 1
            probs, max_idxs = torch.max(F.softmax(logits, dim=1), dim=1)
            pred_ratings = max_idxs + 1
            per_rating_counts, per_rating_acc = calc_per_rating_acc(pred_ratings, true_ratings,
                                                                    per_rating_counts, per_rating_acc)

        if s % self.opt.print_every_nbatches == 0:
            print(print_str.format(epoch, s, nbatches, split, time.time() - start,
                                   loss_value, loss_avg, acc, acc_avg, gn))
            print('Review: {}'.format(texts[0]))
            print('True rating: {}'.format(true_ratings[0]))
            print('Predicted rating: {}'.format(pred_ratings[0]))
            print('Predicted rating probability: {:.4f}'.format(probs[0]))
            print('Per rating accuracy: {}'.format(dict(per_rating_acc)))

            if tb_writer:
                # Global steps in terms of number of items
                # This accounts for runs with different batch sizes
                step = (epoch * nbatches * self.hp.batch_size) + (s * self.hp.batch_size)
                tb_writer.add_scalar('loss/batch_loss', loss_value, step)
                tb_writer.add_scalar('loss/avg_loss', loss_avg, step)
                tb_writer.add_scalar('acc/batch_acc', acc, step)
                tb_writer.add_scalar('acc/avg_acc', acc_avg, step)
                if self.hp.clf_mse:
                    tb_writer.add_scalar('rating_diff/batch_diff', rating_diff, step)
                    tb_writer.add_scalar('rating_diff/avg_diff', rating_diff_avg, step)
                tb_writer.add_text('predictions/review', texts[0], step)
                tb_writer.add_text('predictions/true_pred_prob',
                                   'True={}, Pred={}, Prob={:.4f}'.format(
                                       true_ratings[0], pred_ratings[0], probs[0]), step)
                for r, acc in per_rating_acc.items():
                    tb_writer.add_scalar('acc/curavg_per_rating_acc_{}'.format(r), acc, step)

        # Save periodically so we don't have to wait for the epoch to finish
        if save_intermediate:
            save_every = nbatches // 10
            if save_every != 0 and s % save_every == 0:
                model_to_save = self.model.module if len(self.opt.gpus) > 1 else self.model
                save_model(self.save_dir, model_to_save, self.optimizer, epoch, self.opt, 'intermediate')

    print_str = 'Epoch={}, split={}, --- ' \
                'loss_avg={:.4f}, acc_avg={:.4f}, per_rating_acc={}'.format(
                    epoch, split, loss_avg, acc_avg, dict(per_rating_acc))
    if self.hp.clf_mse:
        print_str += ', rating_diff_avg={:.4f}'.format(rating_diff_avg)
    print(print_str)

    return loss_avg, acc_avg, rating_diff_avg, per_rating_acc
def run_summarization_baseline(self, method):
    """
    Args:
        method: str ('extractive', 'ledes-<n>', 'best_review', 'lm_autoenc')

    Saves outputs to: outputs/eval/<dataset>/<n_docs>/<method>
    """
    batch_size = self.hp.batch_size if method == 'lm_autoenc' else 1
    dl = self.get_test_set_data_iter(batch_size=batch_size)

    if torch.cuda.is_available():
        clf_model = torch.load(self.opt.load_clf)['model']
    else:
        raise Exception('You should run on a cuda machine to load and use the classification model')

    print('\n', '=' * 50)
    print('Running {} baseline'.format(method))
    if method == 'extractive':
        evaluator, summaries, acc, per_rating_acc = self.extractive_baseline(dl, clf_model)
    elif 'ledes' in method:
        # e.g. ledes-2
        n = int(method.split('-')[1])
        evaluator, summaries, acc, per_rating_acc = self.ledes_baseline(dl, n, clf_model)
    elif method == 'best_review':
        evaluator, summaries, acc, per_rating_acc = self.best_or_worst_review_baseline(dl, 'best', clf_model)
    elif method == 'worst_review':
        evaluator, summaries, acc, per_rating_acc = self.best_or_worst_review_baseline(dl, 'worst', clf_model)
    elif method == 'lm_autoenc':
        evaluator, summaries, acc, per_rating_acc = self.lm_autoenc_baseline(dl, clf_model)

    # Calculate NLL of summaries using a fixed, pretrained LM
    pretrained_lm = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
    pretrained_lm = pretrained_lm.module if isinstance(pretrained_lm, nn.DataParallel) else pretrained_lm
    avg_nll = 0.0
    loop_idx = 0
    for i in range(0, len(summaries), batch_size):
        batch_summs = summaries[i:i + batch_size]
        batch_texts = [d['summary'] for d in batch_summs]
        dummy_ratings = [torch.LongTensor([0]) for _ in range(len(batch_texts))]
        try:
            batch_x, _, _ = self.dataset.prepare_batch(batch_texts, dummy_ratings)
            nll = calc_lm_nll(pretrained_lm, batch_x)
            if not np.isnan(nll.detach().cpu().numpy()):
                avg_nll = update_moving_avg(avg_nll, nll.item(), loop_idx + 1)
                loop_idx += 1
            else:
                # lm_autoenc baseline has a rare edge case where a nan is produced
                continue
        except Exception as e:
            # worst_review in the Amazon dataset has a rare edge case where the worst review is an
            # empty string. No reviews should be empty, but it appears to be just one or two reviews.
            print(e)
            continue

    # Save summaries, stats, rouge scores, etc.
    dataset_dir = self.opt.dataset if self.opt.az_cat is None else 'amazon_{}'.format(self.opt.az_cat)
    out_dir = os.path.join(OUTPUTS_EVAL_DIR, dataset_dir, 'n_docs_{}'.format(self.hp.n_docs), method)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    summs_out_fp = os.path.join(out_dir, 'summaries.json')
    save_file(summaries, summs_out_fp)

    out_fp = os.path.join(out_dir, 'stats.json')
    save_file({'acc': acc, 'per_rating_acc': per_rating_acc, 'nll': avg_nll}, out_fp)

    print('-' * 50)
    print('Rating accuracy: ', acc)
    print('NLL: ', avg_nll)
    print('Per rating accuracy: ', dict(per_rating_acc))

    for stat, rouge_dict in evaluator.get_avg_stats_dicts().items():
        print('-' * 50)
        print(stat.upper())
        print(evaluator.to_str(rouge_dict))

        out_fp = os.path.join(out_dir, 'avg_{}-rouges.json'.format(stat))
        save_file(rouge_dict, out_fp)
        out_fp = os.path.join(out_dir, 'avg_{}-rouges.csv'.format(stat))
        evaluator.to_csv(rouge_dict, out_fp)

    out_fp = os.path.join(out_dir, '{}-rouges.pdf')
    evaluator.plot_rouge_distributions(show=self.opt.show_figs, out_fp=out_fp)
def run_clf_baseline(self):
    """
    Calculate the classification accuracy when the input is all the reviews concatenated together.

    This provides a sort of ceiling on how well each of the summarization methods can do, as the
    classification model is not perfect either.
    """
    print('\n', '=' * 50)
    print('Running classifier baseline')

    # Load classifier
    clf_model = torch.load(self.opt.load_clf)['model']
    clf_model = clf_model.module if isinstance(clf_model, nn.DataParallel) else clf_model
    if torch.cuda.is_available():
        clf_model.cuda()
    if len(self.opt.gpus) > 1:
        clf_model = nn.DataParallel(clf_model)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    dl = self.get_test_set_data_iter(self.hp.batch_size)
    for i, (texts, ratings_batch, metadata) in enumerate(dl):
        summaries_batch = []
        for j, text in enumerate(texts):
            # texts is a list of length batch_size
            # each item in texts is a str, i.e. n_docs documents concatenated together
            # concatenate documents without the edok token
            src_docs = SummDataset.split_docs(text)
            summary = SummDataset.concat_docs(src_docs, edok_token=False)
            summaries_batch.append(summary)

        acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
            classify_summ_batch(clf_model, summaries_batch, ratings_batch, self.dataset,
                                per_rating_counts, per_rating_acc)
        accuracy = update_moving_avg(accuracy, acc, i + 1)

        for j in range(len(summaries_batch)):
            dic = {'docs': summaries_batch[j],
                   'rating': ratings_batch[j].item(),
                   'pred_rating': pred_ratings[j].item(),
                   'pred_prob': pred_probs[j].item()}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    # Calculate NLL of summaries using a fixed, pretrained LM
    pretrained_lm = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
    pretrained_lm = pretrained_lm.module if isinstance(pretrained_lm, nn.DataParallel) else pretrained_lm
    avg_nll = 0.0
    batch_size = self.hp.batch_size
    for i in range(0, len(summaries), batch_size):
        batch_summs = summaries[i:i + batch_size]
        batch_texts = [d['docs'] for d in batch_summs]
        dummy_ratings = [torch.LongTensor([0]) for _ in range(len(batch_texts))]
        batch_x, _, _ = self.dataset.prepare_batch(batch_texts, dummy_ratings)
        nll = calc_lm_nll(pretrained_lm, batch_x)
        # i advances by batch_size, so convert it to a 1-indexed batch count for the moving average
        avg_nll = update_moving_avg(avg_nll, nll.item(), i // batch_size + 1)

    # Print and save accuracies, summaries, etc.
    print('NLL: ', avg_nll)
    print('Accuracy: ', accuracy.item())
    print('Per rating accuracy: ', per_rating_acc)

    dataset_dir = self.opt.dataset if self.opt.az_cat is None else 'amazon_{}'.format(self.opt.az_cat)
    out_dir = os.path.join(OUTPUTS_EVAL_DIR, dataset_dir, 'n_docs_{}'.format(self.hp.n_docs), 'clf_baseline')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    out_fp = os.path.join(out_dir, 'summaries.json')
    save_file(summaries, out_fp)
    out_fp = os.path.join(out_dir, 'stats.json')
    save_file({'acc': accuracy.item(), 'per_rating_acc': per_rating_acc, 'nll': avg_nll}, out_fp)
def lm_autoenc_baseline(self, data_iter, clf_model=None):
    """
    Use the pretrained language model to initialize an encoder-decoder model.
    This is basically the unsupervised abstractive summarization model without training.
    """
    # Load encoder-decoder by initializing it with the language model
    docs_enc = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
    docs_enc = docs_enc.module if isinstance(docs_enc, nn.DataParallel) else docs_enc
    summ_dec = StackedLSTMDecoder(copy.deepcopy(docs_enc.embed), copy.deepcopy(docs_enc.rnn))

    # Create Summarizer so that we can use run_epoch()
    # Copy hp and opt as we're modifying some params. This way there won't be any unexpected errors
    # if it's used by another method
    hp = copy.deepcopy(self.hp)
    hp.sum_cycle = False
    hp.autoenc_docs = False
    hp.sum_clf = False
    opt = copy.deepcopy(self.opt)
    opt.print_every_nbatches = float('inf')
    summarizer = Summarizer(hp, opt, '/tmp/')
    summarizer.tb_val_sub_writer = None
    summarizer.tau = self.hp.tau
    summarizer.ngpus = 1 if len(self.opt.gpus) == 1 else len(self.opt.gpus.split(','))
    summarizer.sum_model = torch.load(self.opt.load_lm)
    summarizer.dataset = self.dataset
    summarizer.fixed_lm = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
    summarizer.fixed_lm = summarizer.fixed_lm.module if isinstance(summarizer.fixed_lm, nn.DataParallel) \
        else summarizer.fixed_lm

    # Create SummarizationModel
    docs_autodec, combine_encs_h_net, combine_encs_c_net = None, None, None
    summ_enc, docs_dec, discrim_model, clf_model_arg, fixed_lm = None, None, None, None, None
    summarizer.sum_model = SummarizationModel(docs_enc, docs_autodec, combine_encs_h_net, combine_encs_c_net,
                                              summ_dec, summ_enc, docs_dec, discrim_model, clf_model_arg,
                                              fixed_lm, hp, self.dataset)
    if torch.cuda.is_available():
        summarizer.sum_model.cuda()
    if summarizer.ngpus > 1:
        summarizer.sum_model = DataParallelModel(summarizer.sum_model)

    summarizer.sum_model.eval()
    with torch.no_grad():
        stats_avgs, evaluator, summaries = summarizer.run_epoch(
            data_iter, len(data_iter), 0, 'test',
            store_all_rouges=True, store_all_summaries=True,
            save_intermediate=False, run_val_subset=False)

    #
    # Pass summaries through classifier
    #
    # Note: the SummarizationModel already calculates the classification accuracy when sum_clf=True.
    # Technically, everything computed here could be refactored into the forward pass and added to stats().
    # However, it's cleaner / easier to just do everything here, especially when adding more things like
    # per-rating counts and accuracy. Plus, it's just one pass through the test set -- which is run
    # infrequently to evaluate a trained model -- so the extra time is fine.
    #
    results = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings_batch, metadata) in enumerate(data_iter):
        summaries_batch = summaries[i * self.hp.batch_size:i * self.hp.batch_size + len(texts)]
        acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
            classify_summ_batch(clf_model, summaries_batch, ratings_batch, self.dataset,
                                per_rating_counts, per_rating_acc)
        if acc is None:
            print('Summary was too short to classify')
            pred_ratings = [None for _ in range(len(summaries_batch))]
            pred_probs = [None for _ in range(len(summaries_batch))]
        else:
            accuracy = update_moving_avg(accuracy, acc, i + 1)

        for j in range(len(summaries_batch)):
            dic = {'docs': texts[j],
                   'summary': summaries_batch[j],
                   'rating': ratings_batch[j].item(),
                   'pred_rating': pred_ratings[j].item() if pred_ratings[j] is not None else None,
                   'pred_prob': pred_probs[j].item() if pred_probs[j] is not None else None}
            for k, values in metadata.items():
                dic[k] = values[j]
            results.append(dic)

    return evaluator, results, accuracy.item(), per_rating_acc
def best_or_worst_review_baseline(self, data_iter, method='best', clf_model=None):
    """
    When summarizing n_docs reviews, calculate the average ROUGE1-F for each review as if it was the
    summary. Choose the document with the best / worst score.

    Note: it would be far more efficient to calculate best and worst at the same time, as all the
    rouges are already calculated...
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        # texts is a list of length batch_size
        # each item in texts is a str, i.e. n_docs documents concatenated together
        for j, text in enumerate(texts):
            bw_evaluator = None
            bw_rouge1_f = 0.0 if method == 'best' else 1.0
            bw_doc = None

            # Set each document as the summary and find the best one
            src_docs = SummDataset.split_docs(text)
            for doc in src_docs:
                cur_evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                            use_stemmer=self.hp.use_stemmer, store_all=True)
                avg_rouges, _, _, _ = cur_evaluator.batch_update_avg_rouge([doc], [src_docs])
                is_better_worse = (method == 'best' and (avg_rouges['rouge1']['f'] >= bw_rouge1_f)) or \
                                  (method == 'worst' and (avg_rouges['rouge1']['f'] <= bw_rouge1_f))
                if is_better_worse:
                    bw_evaluator = cur_evaluator
                    bw_rouge1_f = avg_rouges['rouge1']['f']
                    bw_doc = doc
            evaluator.update_with_evaluator(bw_evaluator)

            try:
                acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                    classify_summ_batch(clf_model, [bw_doc], [ratings[j]], self.dataset,
                                        per_rating_counts, per_rating_acc)
            except Exception as e:
                # worst_review in the Amazon dataset has a rare edge case where the worst review is an
                # empty string. No reviews should be empty, but it appears to be just one or two reviews.
                pass

            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                pred_rating, pred_prob = pred_ratings[j].item(), pred_probs[j].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': bw_doc, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
def ledes_baseline(self, data_iter, n=1, clf_model=None):
    """
    Add up to the first n sentences from each review (round-robin across reviews),
    stopping once the maximum review length would be exceeded.
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        # texts is a list of length batch_size
        # each item in texts is a str, i.e. n_docs documents concatenated together
        for j, text in enumerate(texts):
            src_docs = SummDataset.split_docs(text)
            summary = []
            doc_sents = [nltk.sent_tokenize(doc) for doc in src_docs]
            summary_len = 0
            doc_idx, sent_idx = 0, 0

            # Keep adding sentences as long as the summary isn't over the maximum length and
            # there are still sentences to add
            while (summary_len < self.dataset.conf.review_max_len) and (sent_idx < n):
                # Current document still has a sent_idx-th sentence
                if sent_idx < len(doc_sents[doc_idx]):
                    sent = doc_sents[doc_idx][sent_idx]
                    sent_tok_len = len(nltk.word_tokenize(sent))
                    # Adding the sentence won't exceed the maximum length
                    if summary_len + sent_tok_len <= self.dataset.conf.review_max_len:
                        summary.append(sent)
                        summary_len += sent_tok_len
                # Move on to the next document
                doc_idx = (doc_idx + 1) % len(src_docs)
                if doc_idx == 0:  # back to the first doc: the sent_idx-th sentence of every doc has been considered
                    sent_idx += 1
            summary = ' '.join(summary)

            evaluator.batch_update_avg_rouge([summary], [src_docs])

            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                    per_rating_counts, per_rating_acc)
            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                pred_rating, pred_prob = pred_ratings[j].item(), pred_probs[j].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': summary, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
def batch_update_avg_rouge(self, summaries, source_docs):
    """
    Args:
        summaries: list of strs
        source_docs: list of lists of strs

    Returns:
        4 (avg, min, max, std) rouge dicts for this batch
    """
    # Store average of the four statistics for this batch
    batch_avg_avg_rouges = self.get_rouge_defaultdict()
    batch_avg_min_rouges = self.get_rouge_defaultdict()
    batch_avg_max_rouges = self.get_rouge_defaultdict()
    batch_avg_std_rouges = self.get_rouge_defaultdict()

    for i, summary in enumerate(summaries):
        docs = source_docs[i]

        # Compute rouges between summary and each document
        rouges = self.get_rouge_defaultdict(list)
        for doc in docs:
            scores = self.calc_rouges(doc, summary)
            for rouge_name, rouge_obj in scores.items():  # rouge_name = rouge1, rouge2, rougeL
                for metric in ['precision', 'recall', 'fmeasure']:
                    score = getattr(rouge_obj, metric)
                    rouges[rouge_name][metric[0]].append(score)  # [0] for first letter

        # Compute statistics and update batch and global averages
        avg_rouges = self.get_rouge_defaultdict()
        min_rouges = self.get_rouge_defaultdict()
        max_rouges = self.get_rouge_defaultdict()
        std_rouges = self.get_rouge_defaultdict()
        self._updates += 1  # global count
        for rouge_name, rouge_obj in rouges.items():
            for metric in ['precision', 'recall', 'fmeasure']:
                scores = rouges[rouge_name][metric[0]]
                avg, min, max, std = np.mean(scores), np.min(scores), np.max(scores), np.std(scores)
                avg_rouges[rouge_name][metric[0]] = avg
                min_rouges[rouge_name][metric[0]] = min
                max_rouges[rouge_name][metric[0]] = max
                std_rouges[rouge_name][metric[0]] = std

                # Update batch averages
                cur_avg_avg = batch_avg_avg_rouges[rouge_name][metric[0]]
                cur_avg_min = batch_avg_min_rouges[rouge_name][metric[0]]
                cur_avg_max = batch_avg_max_rouges[rouge_name][metric[0]]
                cur_avg_std = batch_avg_std_rouges[rouge_name][metric[0]]
                batch_avg_avg_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_avg, avg, i + 1)
                batch_avg_min_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_min, min, i + 1)
                batch_avg_max_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_max, max, i + 1)
                batch_avg_std_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_std, std, i + 1)

                # Update global averages
                cur_avg_avg = self.avg_avg_rouges[rouge_name][metric[0]]
                cur_avg_min = self.avg_min_rouges[rouge_name][metric[0]]
                cur_avg_max = self.avg_max_rouges[rouge_name][metric[0]]
                cur_avg_std = self.avg_std_rouges[rouge_name][metric[0]]
                self.avg_avg_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_avg, avg, self._updates)
                self.avg_min_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_min, min, self._updates)
                self.avg_max_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_max, max, self._updates)
                self.avg_std_rouges[rouge_name][metric[0]] = update_moving_avg(cur_avg_std, std, self._updates)

                # Add to dictionaries storing all stats
                if self.store_all:
                    self.avg_rouges[rouge_name][metric[0]].append(avg)
                    self.min_rouges[rouge_name][metric[0]].append(min)
                    self.max_rouges[rouge_name][metric[0]].append(max)
                    self.std_rouges[rouge_name][metric[0]].append(std)

    return batch_avg_avg_rouges, batch_avg_min_rouges, batch_avg_max_rouges, batch_avg_std_rouges