def extractive_baseline(self, data_iter, clf_model=None):
    """
    Run an extractive method.
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)
    summarizer = CentroidW2VSummarizer(WORD2VEC_PATH, length_limit=2,
                                       topic_threshold=0.3, sim_threshold=0.95,
                                       reordering=True, subtract_centroid=False, keep_first=False,
                                       bow_param=0, length_param=0, position_param=0,
                                       debug=False)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        # texts is a list of length batch_size;
        # each item in texts is a str, i.e. n_docs documents concatenated together
        for j, text in enumerate(texts):
            src_docs = SummDataset.split_docs(text)
            # limit is the number of words; concatenate documents without the edok token
            summary = summarizer.summarize(
                SummDataset.concat_docs(src_docs, edok_token=False),
                limit=self.dataset.conf.extractive_max_len)
            evaluator.batch_update_avg_rouge([summary], [src_docs])

            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                    per_rating_counts, per_rating_acc)

            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                # classify_summ_batch was called on a batch of size 1, so index 0
                pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': summary, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
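# The baselines in this module rely on update_moving_avg to keep a running mean of
# classification accuracy. A minimal sketch of such a helper is below, assuming it implements
# an incremental mean; the actual utility in this repo may differ, and _running_mean_sketch is
# a hypothetical name used only for illustration.
def _running_mean_sketch(avg_so_far, new_val, n):
    """Return the running mean after folding in the n-th value (n is 1-indexed)."""
    return avg_so_far + (new_val - avg_so_far) / n

# e.g. folding in 1.0, 0.0, 1.0 yields 1.0 -> 0.5 -> 0.666...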
def __getitem__(self, idx):
    # Map idx to item and load reviews
    item = self.idx_to_item[idx]
    fp = os.path.join(self.ds_conf.processed_path,
                      '{}/{}_reviews.json'.format(self.split, item))
    reviews = load_file(fp)

    # Get reviews for this item
    if self.sample_reviews:
        # Sample with replacement only when there are fewer reviews than requested
        if len(reviews) < self.n_docs:
            reviews = np.random.choice(reviews, size=self.n_docs, replace=True)
        else:
            reviews = np.random.choice(reviews, size=self.n_docs, replace=False)
    else:
        start_idx = self.idx_to_item_startidx[idx]
        reviews = reviews[start_idx:start_idx + self.n_docs]

    # Collect data to be returned
    texts, ratings = zip(*[(s['reviewText'], s['overall']) for s in reviews])
    texts = SummDataset.concat_docs(texts, edok_token=True)
    avg_rating = int(np.round(np.mean(ratings)))
    # All reviews are for the same item and share the same category, so use the 0-th review's
    metadata = {'item': item, 'category': reviews[0]['category']}

    return texts, avg_rating, metadata
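# SummDataset.concat_docs and SummDataset.split_docs act as inverses: the reviews for one item
# are joined into a single string (optionally with an end-of-document token) and split back
# into a list of documents by the baselines. A minimal sketch, assuming a hypothetical
# ' <EDOC> ' delimiter; the actual token and helpers live on SummDataset and may differ.
_EDOC_SKETCH = ' <EDOC> '

def _concat_docs_sketch(docs, edok_token=True):
    """Join documents into one string, with or without the document delimiter."""
    return _EDOC_SKETCH.join(docs) if edok_token else ' '.join(docs)

def _split_docs_sketch(text):
    """Recover the list of documents from a delimiter-joined string."""
    return text.split(_EDOC_SKETCH)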
def __getitem__(self, idx):
    # Map idx to item (its id) and load reviews
    item = self.idx_to_item[idx]
    fp = os.path.join(self.ds_conf.processed_path,
                      '{}/{}_reviews.json'.format(self.split, item))
    reviews = load_file(fp)

    # Get reviews for this item
    if self.sample_reviews:
        if self.n_reviews_min and self.n_reviews_max:
            review_idxs = self.idx_to_item_idxs[idx]
            reviews = [reviews[r_idx] for r_idx in review_idxs]
        else:
            # Sample with replacement only when there are fewer reviews than requested
            if len(reviews) < self.n_reviews:
                reviews = np.random.choice(reviews, size=self.n_reviews, replace=True)
            else:
                reviews = np.random.choice(reviews, size=self.n_reviews, replace=False)
    else:
        start_idx = self.idx_to_item_startidx[idx]
        reviews = reviews[start_idx:start_idx + self.n_reviews]

    # Collect data for this item
    texts, ratings = zip(*[(s['text'], s['stars']) for s in reviews])
    texts = SummDataset.concat_docs(texts, edok_token=True)
    avg_rating = int(np.round(np.mean(ratings)))
    try:
        categories = '---'.join(self.items[item]['categories'])
    except Exception as e:
        print(e)
        categories = '---'
    metadata = {'item': item,
                'city': self.items[item]['city'],
                'categories': categories}

    return texts, avg_rating, metadata
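# Usage sketch: since __getitem__ returns (str, int, dict), the dataset can in principle be
# iterated with a standard PyTorch DataLoader, whose default collate turns the texts into a
# list of length batch_size, the ratings into a LongTensor, and the metadata into a dict of
# lists -- matching how (texts, ratings, metadata) batches are consumed by the baselines.
# The repo's own data iterator may use a custom loader; this is only an illustration.
from torch.utils.data import DataLoader

def _iterate_batches_sketch(dataset, batch_size=4):
    dl = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for texts, ratings, metadata in dl:
        # texts: list[str]; ratings: LongTensor of shape [batch_size]; metadata: dict[str, list]
        yield texts, ratings, metadata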
def run_clf_baseline(self):
    """
    Calculate the classification accuracy when the input is all the reviews concatenated
    together. This provides a rough ceiling on how well each of the summarization methods
    can do, as the classification model is not perfect either.
    """
    print('\n', '=' * 50)
    print('Running classifier baseline')

    # Load classifier
    clf_model = torch.load(self.opt.load_clf)['model']
    clf_model = clf_model.module if isinstance(clf_model, nn.DataParallel) else clf_model
    if torch.cuda.is_available():
        clf_model.cuda()
    if len(self.opt.gpus) > 1:
        clf_model = nn.DataParallel(clf_model)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)

    dl = self.get_test_set_data_iter(self.hp.batch_size)
    for i, (texts, ratings_batch, metadata) in enumerate(dl):
        summaries_batch = []
        # texts is a list of length batch_size;
        # each item in texts is a str, i.e. n_docs documents concatenated together
        for j, text in enumerate(texts):
            # Concatenate the documents without the edok token and treat that as the "summary"
            src_docs = SummDataset.split_docs(text)
            summary = SummDataset.concat_docs(src_docs, edok_token=False)
            summaries_batch.append(summary)

        acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
            classify_summ_batch(clf_model, summaries_batch, ratings_batch, self.dataset,
                                per_rating_counts, per_rating_acc)
        accuracy = update_moving_avg(accuracy, acc, i + 1)

        for j in range(len(summaries_batch)):
            dic = {'docs': summaries_batch[j],
                   'rating': ratings_batch[j].item(),
                   'pred_rating': pred_ratings[j].item(),
                   'pred_prob': pred_probs[j].item()}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    # Calculate NLL of the summaries using a fixed, pretrained LM (a StackedLSTMEncoder)
    pretrained_lm = torch.load(self.opt.load_lm)['model']
    pretrained_lm = pretrained_lm.module if isinstance(pretrained_lm, nn.DataParallel) else pretrained_lm
    avg_nll = 0.0
    batch_size = self.hp.batch_size
    for i in range(0, len(summaries), batch_size):
        batch_summs = summaries[i:i + batch_size]
        batch_texts = [d['docs'] for d in batch_summs]
        dummy_ratings = [torch.LongTensor([0]) for _ in range(len(batch_texts))]
        batch_x, _, _ = self.dataset.prepare_batch(batch_texts, dummy_ratings)
        nll = calc_lm_nll(pretrained_lm, batch_x)
        # i steps by batch_size, so pass the batch count (not the item index) to the moving average
        avg_nll = update_moving_avg(avg_nll, nll.item(), i // batch_size + 1)

    # Print and save accuracies, summaries, etc.
    print('NLL: ', avg_nll)
    print('Accuracy: ', accuracy.item())
    print('Per rating accuracy: ', per_rating_acc)

    dataset_dir = self.opt.dataset if self.opt.az_cat is None else 'amazon_{}'.format(self.opt.az_cat)
    out_dir = os.path.join(OUTPUTS_EVAL_DIR, dataset_dir,
                           'n_docs_{}'.format(self.hp.n_docs), 'clf_baseline')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_fp = os.path.join(out_dir, 'summaries.json')
    save_file(summaries, out_fp)
    out_fp = os.path.join(out_dir, 'stats.json')
    save_file({'acc': accuracy.item(),
               'per_rating_acc': per_rating_acc,
               'nll': avg_nll}, out_fp)
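# calc_lm_nll above scores the concatenated reviews under a fixed pretrained language model.
# A minimal sketch of a token-level NLL computation is below, assuming the model maps token ids
# of shape [batch, seq_len] to logits of shape [batch, seq_len, vocab] and that a pad index
# should be ignored; the repo's StackedLSTMEncoder interface may differ.
import torch
import torch.nn.functional as F

def _lm_nll_sketch(lm, batch_x, pad_idx=0):
    """Average negative log-likelihood of each next token given its prefix."""
    with torch.no_grad():
        logits = lm(batch_x)                                 # assumed: [batch, seq_len, vocab]
    preds, targets = logits[:, :-1, :], batch_x[:, 1:]       # predict token t+1 from prefix <= t
    return F.cross_entropy(preds.reshape(-1, preds.size(-1)),
                           targets.reshape(-1),
                           ignore_index=pad_idx)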
def best_or_worst_review_baseline(self, data_iter, method='best', clf_model=None):
    """
    When summarizing n_docs reviews, calculate the average ROUGE1-F for each review as if it
    were the summary, and choose the review with the best / worst score.

    Note: it would be far more efficient to calculate best and worst in the same pass, since
    all the ROUGE scores are already computed.
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        # texts is a list of length batch_size;
        # each item in texts is a str, i.e. n_docs documents concatenated together
        for j, text in enumerate(texts):
            bw_evaluator = None
            bw_rouge1_f = 0.0 if method == 'best' else 1.0
            bw_doc = None

            # Set each document as the summary and find the best (or worst) one
            src_docs = SummDataset.split_docs(text)
            for doc in src_docs:
                cur_evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                            use_stemmer=self.hp.use_stemmer, store_all=True)
                avg_rouges, _, _, _ = cur_evaluator.batch_update_avg_rouge([doc], [src_docs])
                is_better_worse = (method == 'best' and (avg_rouges['rouge1']['f'] >= bw_rouge1_f)) or \
                                  (method == 'worst' and (avg_rouges['rouge1']['f'] <= bw_rouge1_f))
                if is_better_worse:
                    bw_evaluator = cur_evaluator
                    bw_rouge1_f = avg_rouges['rouge1']['f']
                    bw_doc = doc
            evaluator.update_with_evaluator(bw_evaluator)

            try:
                acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                    classify_summ_batch(clf_model, [bw_doc], [ratings[j]], self.dataset,
                                        per_rating_counts, per_rating_acc)
            except Exception:
                # worst_review in the Amazon dataset has a rare edge case where the worst review
                # is an empty string. No reviews should be empty, but it appears to affect only
                # one or two reviews, so fall back to the "too short to classify" path.
                acc, pred_ratings, pred_probs = None, None, None

            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                # classify_summ_batch was called on a batch of size 1, so index 0
                pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': bw_doc, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
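# As the docstring notes, best and worst could be found in a single pass, since every
# per-document ROUGE is computed anyway. A sketch of that selection, assuming a scoring
# callable rouge1_f(doc, src_docs) that returns the ROUGE-1 F score of a candidate document
# against the full set of source documents:
def _best_and_worst_sketch(src_docs, rouge1_f):
    """Return ((best_doc, best_score), (worst_doc, worst_score)) in one pass."""
    scored = [(doc, rouge1_f(doc, src_docs)) for doc in src_docs]
    best = max(scored, key=lambda pair: pair[1])
    worst = min(scored, key=lambda pair: pair[1])
    return best, worst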
def ledes_baseline(self, data_iter, n=1, clf_model=None):
    """
    Add up to the first n sentences from each review, stopping once the maximum review length
    would be exceeded.
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        # texts is a list of length batch_size;
        # each item in texts is a str, i.e. n_docs documents concatenated together
        for j, text in enumerate(texts):
            src_docs = SummDataset.split_docs(text)
            doc_sents = [nltk.sent_tokenize(doc) for doc in src_docs]

            summary = []
            summary_len = 0
            doc_idx, sent_idx = 0, 0
            # Keep adding sentences as long as the summary isn't over the maximum length and
            # there are still sentences to add
            while (summary_len < self.dataset.conf.review_max_len) and (sent_idx < n):
                # Current document has at least sent_idx + 1 sentences
                if sent_idx < len(doc_sents[doc_idx]):
                    sent = doc_sents[doc_idx][sent_idx]
                    sent_tok_len = len(nltk.word_tokenize(sent))
                    # Only add the sentence if it won't exceed the maximum length
                    if summary_len + sent_tok_len <= self.dataset.conf.review_max_len:
                        summary.append(sent)
                        summary_len += sent_tok_len

                # Move on to the next document
                doc_idx = (doc_idx + 1) % len(src_docs)
                if doc_idx == 0:
                    # Back at the first doc: sentences at this position have been added for all docs
                    sent_idx += 1
            summary = ' '.join(summary)

            evaluator.batch_update_avg_rouge([summary], [src_docs])
            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                    per_rating_counts, per_rating_acc)

            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                # classify_summ_batch was called on a batch of size 1, so index 0
                pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': summary, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
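# The lede selection above interleaves sentences round-robin: the first sentence of every
# review, then the second sentence of every review, and so on, up to n sentences per review or
# until the length budget is reached. The same greedy interleaving as a standalone sketch
# (minor edge-case behaviour may differ from the method above; nltk tokenizers assumed):
import nltk

def _round_robin_ledes_sketch(src_docs, n, max_len):
    doc_sents = [nltk.sent_tokenize(doc) for doc in src_docs]
    summary, summary_len = [], 0
    for sent_idx in range(n):              # sentence position within each review
        for sents in doc_sents:            # cycle over the reviews
            if sent_idx >= len(sents):
                continue
            sent_len = len(nltk.word_tokenize(sents[sent_idx]))
            if summary_len + sent_len <= max_len:
                summary.append(sents[sent_idx])
                summary_len += sent_len
    return ' '.join(summary)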