    def extractive_baseline(self, data_iter, clf_model=None):
        """
        Run an extractive method
        """
        evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                use_stemmer=self.hp.use_stemmer,
                                store_all=True)
        summarizer = CentroidW2VSummarizer(WORD2VEC_PATH,
                                           length_limit=2,
                                           topic_threshold=0.3,
                                           sim_threshold=0.95,
                                           reordering=True,
                                           subtract_centroid=False,
                                           keep_first=False,
                                           bow_param=0,
                                           length_param=0,
                                           position_param=0,
                                           debug=False)

        summaries = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        for i, (texts, ratings, metadata) in enumerate(data_iter):
            for j, text in enumerate(texts):
                # texts is a list of length batch_size
                # each item in texts is a str, i.e. n_docs documents concatenated together
                src_docs = SummDataset.split_docs(text)
                # limit is the maximum number of words in the summary
                # concatenate the documents without the edok token
                summary = summarizer.summarize(
                    SummDataset.concat_docs(src_docs, edok_token=False),
                    limit=self.dataset.conf.extractive_max_len)
                evaluator.batch_update_avg_rouge([summary], [src_docs])
                acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                    classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                        per_rating_counts, per_rating_acc)

                if acc is None:
                    print('Summary was too short to classify')
                    pred_rating, pred_prob = None, None
                else:
                    # classify_summ_batch was given a single-summary batch, so index 0
                    pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                    accuracy = update_moving_avg(accuracy, acc,
                                                 i * len(texts) + j + 1)

                dic = {
                    'docs': text,
                    'summary': summary,
                    'rating': ratings[j].item(),
                    'pred_rating': pred_rating,
                    'pred_prob': pred_prob
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                summaries.append(dic)

        return evaluator, summaries, accuracy.item(), per_rating_acc
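
# Illustrative sketch (not from the original repo): the incremental-mean update
# assumed by the repeated calls to update_moving_avg above, i.e.
# avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n. The real helper may differ in
# signature or behaviour; this only documents the assumption.
def update_moving_avg_sketch(avg, value, n):
    """Return the running mean after incorporating the n-th value (1-indexed)."""
    return avg + (value - avg) / n
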
    def __getitem__(self, idx):
        # Map idx to item and load reviews
        item = self.idx_to_item[idx]
        fp = os.path.join(self.ds_conf.processed_path,
                          '{}/{}_reviews.json'.format(self.split, item))
        reviews = load_file(fp)

        # Get reviews from item
        if self.sample_reviews:
            if len(reviews) < self.n_docs:
                reviews = np.random.choice(reviews,
                                           size=self.n_docs,
                                           replace=True)
            else:
                reviews = np.random.choice(reviews,
                                           size=self.n_docs,
                                           replace=False)
        else:
            start_idx = self.idx_to_item_startidx[idx]
            reviews = reviews[start_idx:start_idx + self.n_docs]

        # Collect data to be returned
        texts, ratings = zip(*[(s['reviewText'], s['overall'])
                               for s in reviews])
        texts = SummDataset.concat_docs(texts, edok_token=True)
        avg_rating = int(np.round(np.mean(ratings)))
        # All reviews are for the same item, so each review has the same category; use the 0-th
        metadata = {'item': item, 'category': reviews[0]['category']}

        return texts, avg_rating, metadata
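
# Illustrative usage sketch (an assumption, not the repo's actual loader code):
# wrapping a dataset with a __getitem__ like the one above in a PyTorch
# DataLoader. With the default collate_fn, a batch comes back as
# (texts, ratings, metadata), where texts is a tuple of strings, ratings is a
# LongTensor, and metadata is a dict of lists -- which is why the baseline
# methods index ratings[j].item() and metadata[k][j].
from torch.utils.data import DataLoader

def make_data_iter_sketch(dataset, batch_size=4):
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)
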
    def __getitem__(self, idx):
        # Map idx to item and load reviews
        item = self.idx_to_item[idx]  # id
        fp = os.path.join(self.ds_conf.processed_path,
                          '{}/{}_reviews.json'.format(self.split, item))
        reviews = load_file(fp)

        # Get reviews from item
        if self.sample_reviews:
            if self.n_reviews_min and self.n_reviews_max:
                review_idxs = self.idx_to_item_idxs[idx]
                reviews = [reviews[r_idx] for r_idx in review_idxs]
            else:
                if len(reviews) < self.n_reviews:
                    reviews = np.random.choice(reviews,
                                               size=self.n_reviews,
                                               replace=True)
                else:
                    reviews = np.random.choice(reviews,
                                               size=self.n_reviews,
                                               replace=False)
        else:
            start_idx = self.idx_to_item_startidx[idx]
            reviews = reviews[start_idx:start_idx + self.n_reviews]

        # Collect data for this item
        texts, ratings = zip(*[(s['text'], s['stars']) for s in reviews])
        texts = SummDataset.concat_docs(texts, edok_token=True)
        avg_rating = int(np.round(np.mean(ratings)))

        try:
            categories = '---'.join(self.items[item]['categories'])
        except Exception as e:
            print(e)
            categories = '---'
        metadata = {
            'item': item,
            'city': self.items[item]['city'],
            'categories': categories
        }

        return texts, avg_rating, metadata
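
# Hypothetical illustration of the concat/split contract that the code above
# relies on: SummDataset.concat_docs joins documents with an end-of-document
# token when edok_token=True, and SummDataset.split_docs recovers the list.
# The token string and exact behaviour here are assumptions for illustration,
# not the repo's implementation.
EDOK_TOKEN = '<edok>'

def concat_docs_sketch(docs, edok_token=True):
    sep = ' {} '.format(EDOK_TOKEN) if edok_token else ' '
    return sep.join(docs)

def split_docs_sketch(text):
    return [doc.strip() for doc in text.split(EDOK_TOKEN)]
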
    def run_clf_baseline(self):
        """
        Calculate the classification accuracy when the input is all of the reviews concatenated together. This provides
        a rough ceiling on how well each of the summarization methods can do, as the classification model
        is not perfect either.
        """
        print('\n', '=' * 50)
        print('Running classifier baseline')

        # Load classifier
        clf_model = torch.load(self.opt.load_clf)['model']
        clf_model = clf_model.module if isinstance(
            clf_model, nn.DataParallel) else clf_model
        if torch.cuda.is_available():
            clf_model.cuda()
        if len(self.opt.gpus) > 1:
            clf_model = nn.DataParallel(clf_model)

        summaries = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        dl = self.get_test_set_data_iter(self.hp.batch_size)
        for i, (texts, ratings_batch, metadata) in enumerate(dl):
            summaries_batch = []
            for j, text in enumerate(texts):
                # texts is a list of length batch_size
                # each item in texts is a str, i.e. n_docs documents concatenated together
                # re-concatenate the documents without the edok token
                src_docs = SummDataset.split_docs(text)
                summary = SummDataset.concat_docs(src_docs, edok_token=False)
                summaries_batch.append(summary)

            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, summaries_batch, ratings_batch, self.dataset,
                                    per_rating_counts, per_rating_acc)
            accuracy = update_moving_avg(accuracy, acc, i + 1)

            for j in range(len(summaries_batch)):
                dic = {
                    'docs': summaries_batch[j],
                    'rating': ratings_batch[j].item(),
                    'pred_rating': pred_ratings[j].item(),
                    'pred_prob': pred_probs[j].item()
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                summaries.append(dic)

        # Calculate NLL of summaries using fixed, pretrained LM
        pretrained_lm = torch.load(
            self.opt.load_lm)['model']  # StackedLSTMEncoder
        pretrained_lm = pretrained_lm.module if isinstance(
            pretrained_lm, nn.DataParallel) else pretrained_lm
        avg_nll = 0.0
        batch_size = self.hp.batch_size
        for i in range(0, len(summaries), batch_size):
            batch_summs = summaries[i:i + batch_size]
            batch_texts = [d['docs'] for d in batch_summs]
            dummy_ratings = [
                torch.LongTensor([0]) for _ in range(len(batch_texts))
            ]
            batch_x, _, _ = self.dataset.prepare_batch(batch_texts,
                                                       dummy_ratings)
            nll = calc_lm_nll(pretrained_lm, batch_x)
            avg_nll = update_moving_avg(avg_nll, nll.item(), i + 1)

        # Print and save accuracies, summaries, etc.
        print('NLL: ', avg_nll)
        print('Accuracy: ', accuracy.item())
        print('Per rating accuracy: ', per_rating_acc)

        dataset_dir = self.opt.dataset if self.opt.az_cat is None else 'amazon_{}'.format(
            self.opt.az_cat)
        out_dir = os.path.join(OUTPUTS_EVAL_DIR, dataset_dir,
                               'n_docs_{}'.format(self.hp.n_docs),
                               'clf_baseline')
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_fp = os.path.join(out_dir, 'summaries.json')
        save_file(summaries, out_fp)
        out_fp = os.path.join(out_dir, 'stats.json')
        save_file(
            {
                'acc': accuracy.item(),
                'per_rating_acc': per_rating_acc,
                'nll': avg_nll
            }, out_fp)
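
# Hypothetical sketch of the per-rating bookkeeping assumed for
# per_rating_counts / per_rating_acc above: a running accuracy keyed by the
# true rating. classify_summ_batch's real implementation may differ.
from collections import defaultdict

def update_per_rating_acc_sketch(true_ratings, pred_ratings,
                                 per_rating_counts=None, per_rating_acc=None):
    if per_rating_counts is None:
        per_rating_counts = defaultdict(int)
    if per_rating_acc is None:
        per_rating_acc = defaultdict(float)
    for true_r, pred_r in zip(true_ratings, pred_ratings):
        per_rating_counts[true_r] += 1
        n = per_rating_counts[true_r]
        correct = float(true_r == pred_r)
        # Incremental mean of per-rating correctness
        per_rating_acc[true_r] += (correct - per_rating_acc[true_r]) / n
    return per_rating_counts, per_rating_acc
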
    def best_or_worst_review_baseline(self,
                                      data_iter,
                                      method='best',
                                      clf_model=None):
        """
        When summarizing n_docs reviews, calculate the average ROUGE1-F for each review as if it was the summary.
        Choose the document with the best / worst score.

        Note: it'd be far more efficient to calculate best and worst at the same time as all the rouges
        are already calculated...
        """
        evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                use_stemmer=self.hp.use_stemmer,
                                store_all=True)
        summaries = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        for i, (texts, ratings, metadata) in enumerate(data_iter):
            # texts is a list of length batch_size
            # each item in texts is a str, i.e. n_docs documents concatenated together
            for j, text in enumerate(texts):
                bw_evaluator = None
                bw_rouge1_f = 0.0 if method == 'best' else 1.0
                bw_doc = None

                # Set each document as the summary and find the best one
                src_docs = SummDataset.split_docs(text)
                for doc in src_docs:
                    cur_evaluator = EvalMetrics(
                        remove_stopwords=self.hp.remove_stopwords,
                        use_stemmer=self.hp.use_stemmer,
                        store_all=True)
                    avg_rouges, _, _, _ = cur_evaluator.batch_update_avg_rouge(
                        [doc], [src_docs])
                    is_better_worse = (method == 'best' and (avg_rouges['rouge1']['f'] >= bw_rouge1_f)) or \
                                      (method == 'worst' and (avg_rouges['rouge1']['f'] <= bw_rouge1_f))
                    if is_better_worse:
                        bw_evaluator = cur_evaluator
                        bw_rouge1_f = avg_rouges['rouge1']['f']
                        bw_doc = doc

                evaluator.update_with_evaluator(bw_evaluator)

                # Initialize in case classification fails below
                acc, pred_ratings, pred_probs = None, None, None
                try:
                    acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                        classify_summ_batch(clf_model, [bw_doc], [ratings[j]], self.dataset,
                                            per_rating_counts, per_rating_acc)
                except Exception:
                    # Rare edge case in the Amazon dataset: the worst review can be an empty string.
                    # No reviews should be empty, but it appears to be just one or two reviews.
                    pass

                if acc is None:
                    print('Summary was too short to classify')
                    pred_rating, pred_prob = None, None
                else:
                    # classify_summ_batch was given a single-summary batch, so index 0
                    pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                    accuracy = update_moving_avg(accuracy, acc,
                                                 i * len(texts) + j + 1)

                dic = {
                    'docs': text,
                    'summary': bw_doc,
                    'rating': ratings[j].item(),
                    'pred_rating': pred_rating,
                    'pred_prob': pred_prob
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                summaries.append(dic)

        return evaluator, summaries, accuracy.item(), per_rating_acc
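
# Sketch of the efficiency note in the docstring above: score every candidate
# review once and select both the best and the worst in a single pass, rather
# than re-running the baseline twice. rouge1_f is a caller-supplied scoring
# function standing in for EvalMetrics' ROUGE-1 F computation (an assumption,
# not the repo's API).
def best_and_worst_review_sketch(src_docs, rouge1_f):
    scored = [(rouge1_f(doc, src_docs), doc) for doc in src_docs]
    best_doc = max(scored, key=lambda pair: pair[0])[1]
    worst_doc = min(scored, key=lambda pair: pair[0])[1]
    return best_doc, worst_doc
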
    def ledes_baseline(self, data_iter, n=1, clf_model=None):
        """
        Add up until the first n sentences from each review, or until the maximum review length is exceeded
        """
        evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                use_stemmer=self.hp.use_stemmer,
                                store_all=True)
        summaries = []
        accuracy = 0.0
        per_rating_counts = defaultdict(int)
        per_rating_acc = defaultdict(int)
        for i, (texts, ratings, metadata) in enumerate(data_iter):
            # texts is a list of length batch_size
            # each item in texts is a str, i.e. n_docs documents concatenated together
            for j, text in enumerate(texts):
                src_docs = SummDataset.split_docs(text)

                summary = []
                doc_sents = [nltk.sent_tokenize(doc) for doc in src_docs]
                summary_len = 0
                doc_idx, sent_idx = 0, 0

                # Keep adding sentences as long as summary isn't over maximum length and
                # there are still sentences to add
                while (summary_len <
                       self.dataset.conf.review_max_len) and (sent_idx < n):
                    # The current document still has a sentence at this index
                    if sent_idx < len(doc_sents[doc_idx]):
                        sent = doc_sents[doc_idx][sent_idx]
                        sent_tok_len = len(nltk.word_tokenize(sent))

                        # Adding sentence won't exceed maximum length
                        if summary_len + sent_tok_len <= self.dataset.conf.review_max_len:
                            summary.append(sent)
                            summary_len += sent_tok_len

                    # Move on to next document
                    doc_idx = (doc_idx + 1) % len(src_docs)
                    if doc_idx == 0:  # back to the first doc, all first sentences have been added
                        sent_idx += 1

                summary = ' '.join(summary)
                evaluator.batch_update_avg_rouge([summary], [src_docs])
                acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                    classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                        per_rating_counts, per_rating_acc)

                if acc is None:
                    print('Summary was too short to classify')
                    pred_rating, pred_prob = None, None
                else:
                    # classify_summ_batch was given a single-summary batch, so index 0
                    pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                    accuracy = update_moving_avg(accuracy, acc,
                                                 i * len(texts) + j + 1)

                dic = {
                    'docs': text,
                    'summary': summary,
                    'rating': ratings[j].item(),
                    'pred_rating': pred_rating,
                    'pred_prob': pred_prob
                }
                for k, values in metadata.items():
                    dic[k] = values[j]
                summaries.append(dic)

        return evaluator, summaries, accuracy.item(), per_rating_acc
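
# Self-contained sketch of the round-robin lede selection implemented above:
# take the first sentence of each review, then the second of each, and so on,
# up to n sentences per review, skipping sentences that would exceed the word
# budget. Uses nltk like the method above; max_words stands in for
# self.dataset.conf.review_max_len.
import nltk

def ledes_summary_sketch(src_docs, n=1, max_words=150):
    doc_sents = [nltk.sent_tokenize(doc) for doc in src_docs]
    summary, summary_len = [], 0
    for sent_idx in range(n):
        for sents in doc_sents:
            if sent_idx >= len(sents):
                continue
            sent = sents[sent_idx]
            sent_len = len(nltk.word_tokenize(sent))
            if summary_len + sent_len <= max_words:
                summary.append(sent)
                summary_len += sent_len
    return ' '.join(summary)
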