def get_cluster_centroid(self, documents, idf_array, threshold_calc):
        """
        The centroid for the cluster is the vector for
        the pseudo-document for the cluster, cf. MEAD paper
        :param: documents, idf_array, threshold (optional)
        :return: numpy array
        """
        word_sentence_matrix = Vectors().get_topic_matrix(documents).toarray()
        total_words_in_cluster = word_sentence_matrix.sum(0)
        sentences_per_word = np.count_nonzero(word_sentence_matrix, axis=0)  # across the cluster
        average_count = np.divide(total_words_in_cluster, sentences_per_word + 1)  # +1 avoids division by zero

        if len(average_count) != len(idf_array):
            raise ValueError("Cluster centroid arrays must be the same length; "
                             "word array length {}, IDF array length {}"
                             .format(len(average_count), len(idf_array)))

        centroid_cluster = np.multiply(average_count, idf_array)

        if threshold_calc == 'min':
            threshold = self.min_mean_threshold(centroid_cluster)
        elif threshold_calc == 'mean':
            threshold = self.mean_threshold(centroid_cluster)
        elif threshold_calc == 'max':
            threshold = self.max_mean_threshold(centroid_cluster)
        else:
            threshold = 0

        centroid_cluster[centroid_cluster < threshold] = 0  # set all centroid word values below threshold to zero
        return centroid_cluster
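A minimal standalone sketch of the centroid arithmetic above, using a toy word-by-sentence count matrix in place of the project's Vectors class (all values hypothetical):

import numpy as np

# Toy cluster: 3 sentences x 4 vocabulary words (hypothetical counts).
word_sentence_matrix = np.array([[2, 0, 1, 0],
                                 [1, 1, 0, 0],
                                 [0, 1, 0, 3]])
idf_array = np.array([0.5, 1.2, 2.0, 0.1])

total_words_in_cluster = word_sentence_matrix.sum(0)                 # [3 2 1 3]
sentences_per_word = np.count_nonzero(word_sentence_matrix, axis=0)  # [2 2 1 1]
average_count = total_words_in_cluster / (sentences_per_word + 1)

centroid = average_count * idf_array
threshold = centroid.mean()          # the 'mean' strategy from the method above
centroid[centroid < threshold] = 0   # zero out words below the threshold
print(centroid)                      # [0.  0.8 1.  0. ]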
Example #2
    def test_generate_summary(self):
        topics = {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }
        WordMap.create_mapping()
        vec = Vectors()
        vec.create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        for topic_id, documents in topics.items():
            summarizer = MeadSummaryGenerator(documents, MeadContentSelector(),
                                              self.args)
            summary = summarizer.generate_summary(idf)
            self.assertIsNotNone(summary)
Example #3
    def test_melda_generate_summary(self):
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        for topic_id, documents in self.topics.items():
            summarizer = MeldaSummaryGenerator(documents,
                                               MeldaContentSelector(),
                                               self.args)
            summary = summarizer.generate_summary(self.idf)
            self.assertIsNotNone(summary)
    def test_document_topics(self):
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)
        testtok = ['puppy', 'soldier', 'war', 'fetch']
        testsen = Vectors().create_term_sen_freq(testtok)
        document_topics = lda_model.get_document_topics(testsen, minimum_probability=0)
        topic_dist = [prob[1] for prob in document_topics]

        self.assertEqual(len(topic_dist), self.args.lda_topics)
        self.assertAlmostEqual(sum(topic_dist), 1, 2)
    def test_get_top_n(self):
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        sentences = selector.calculate_lda_scores(sentences, lda_model)
        sentences = selector.calculate_melda_scores(sentences)
        selector.select_top_n(sentences, self.args.lda_topics, 1)

        self.assertEqual(len(selector.selected_content), self.args.lda_topics)
Example #6
    def test_melda_info_ordering(self):
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        summarizer = MeldaSummaryGenerator(self.doc_list,
                                           MeldaContentSelector(), self.args)
        content_selector = summarizer.select_content(self.idf)
        expected_len = len(content_selector)
        summarizer.order_information()

        actual_len = len(content_selector)

        self.assertEqual(expected_len, actual_len)
Example #7
    def test_realize_content(self):
        """
        Test applying redundancy penalty during realize_content
        :return:
        """
        expected_content = "I took my small puppy to the dog park today.\n" \
                           "In a park somewhere, a bunch of puppies played fetch with their owners today.\n" \
                           "There were many bigger puppies but he didn't get in a fight with any of them, " \
                           "they just played together with their toys and chased each other.\n" \
                           "They all ran around with their tails wagging and their tongues hanging out having " \
                           "loads of fun in the sun.\n" \
                           "He loves playing so he liked to run around with the other dogs playing fetch.\n" \
                           "Puppies love playing fetch."

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                         self.args)
        generator.select_content(self.idf)
        generator.order_information()
        realized_content = generator.realize_content()
        self.assertEqual(expected_content, realized_content)
Example #8
    def test_order_information(self):
        """
        Test ordering Sentences by MEAD score
        :return:
        """
        doc_id_1 = 'TST_ENG_20190101.0001'
        sentence_1 = 'Puppies love playing fetch.'
        sentence_2 = 'They all ran around with their tails wagging ' \
                     'and their tongues hanging out having loads of fun in the sun.'
        sentence_3 = "He loves playing so he liked to run around with the other dogs playing fetch."
        expected_info = [
            Sentence(sentence_1, 1, doc_id_1),
            Sentence(sentence_3, 3, doc_id_1),
            Sentence(sentence_2, 2, doc_id_1)
        ]

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                         self.args)
        generator.select_content(self.idf)
        generator.order_information()

        first_sentences = generator.content_selector.selected_content[:3]

        self.assertListEqual(expected_info, first_sentences)
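order_information itself is not reproduced on this page; since the test expects sentences in descending MEAD score order, a plausible one-line implementation (a sketch, assuming selected_content holds Sentence objects with a mead_score attribute) is:

    def order_information(self):
        # Sort selected sentences from highest to lowest MEAD score (sketch;
        # the project's actual implementation may differ).
        self.content_selector.selected_content.sort(
            key=lambda s: s.mead_score, reverse=True)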
Example #9
    def get_idf_array(self):
        """
        Use external corpus to get IDF scores
        for cluster centroid calculations
        :return: numpy array of idf values
        """
        corpus = brown
        if self.args.corpus == 'R':
            corpus = reuters
        num_words = Vectors().num_unique_words
        n = len(corpus.fileids())  # number of documents in corpus
        docs_word_matrix = np.zeros([n, num_words])
        for doc_idx, doc_id in enumerate(corpus.fileids()):
            sentences = list(corpus.sents(doc_id))
            words_in_doc = set()
            for s in sentences:
                s = ' '.join(s)
                proc_s = Preprocessor.get_processed_tokens(Preprocessor.get_processed_sentence(s))
                if proc_s:
                    words_in_doc = words_in_doc.union(proc_s)
            for word in words_in_doc:
                word_idx = WordMap.id_of(word)
                if word_idx is not None:  # id 0 is a valid word id
                    docs_word_matrix[doc_idx, word_idx] = 1

        docs_per_word = np.sum(docs_word_matrix, axis=0)
        self.idf_array = np.log10(np.divide(n, docs_per_word + 1))  # add one to avoid divide by zero error

        return self.idf_array
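In other words, each word w gets idf(w) = log10(N / (df(w) + 1)), where N is the number of documents in the external corpus and df(w) is the number of documents containing w. A tiny self-contained check of that formula with made-up counts:

import numpy as np

n = 1000                                    # documents in the corpus
docs_per_word = np.array([0, 9, 99, 999])   # df for four hypothetical words
idf_array = np.log10(n / (docs_per_word + 1))
print(idf_array)  # [3. 2. 1. 0.] -- rarer words score higher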
Example #10
    def test_create_freq_vectors(self):
        Vectors().create_freq_vectors(self.topics)
        for doc_list in self.topics.values():
            for doc in doc_list:
                # check that there's a vector for each sentence
                doc_matrix_shape = doc.vectors.get_shape()
                expected_rows = 3
                self.assertEqual(doc_matrix_shape[0], expected_rows)
    def test_get_lda_scores(self):
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

        sentence = self.doc_list[0].sens[0]
        selector.calculate_lda_scores([sentence], lda_model)
        lda_scores = sentence.lda_scores

        self.assertEqual(len(lda_scores), self.args.lda_topics)
        self.assertAlmostEqual(sum(lda_scores), 1, 2)
    def test_get_cluster_centroid(self):
        selector = MeadContentSelector()
        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                                 self.args.c_threshold)

        actual_non_zero = np.count_nonzero(centroid)
        should_be_non_zero = 29

        self.assertEqual(actual_non_zero, should_be_non_zero)
    def test_select_content(self):
        selector = MeadContentSelector()
        Vectors().create_freq_vectors(self.topics)
        selected = selector.select_content(self.topics['PUP1A'], self.args,
                                           self.idf)
        top_sentence = selected[0]
        expected_top_sentence = 'In a park somewhere, a bunch of ' \
                                'puppies played fetch with their owners today.'

        top_mead_score = round(top_sentence.mead_score, 5)
        expected_top_mead_score = 2.40038

        self.assertEqual(top_sentence.raw_sentence, expected_top_sentence)
        self.assertEqual(top_mead_score, expected_top_mead_score)
Example #14
    def test_mead_summary_length(self):
        """
        Test length of summary is less than 100 words
        :return:
        """
        topics = {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }
        WordMap.create_mapping()
        vec = Vectors()
        vec.create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()
        max_length = 100

        for topic_id, documents in topics.items():
            generator = MeadSummaryGenerator(documents, MeadContentSelector(),
                                             self.args)
            generator.select_content(idf)
            generator.order_information()
            realized_content = generator.realize_content()
            # split on whitespace, dropping empty strings, to count words
            words = realized_content.split()
            content_length = len(words)
            self.assertLessEqual(content_length, max_length)
    def test_get_centroid_score(self):
        selector = MeadContentSelector()
        sent_1 = Sentence("Puppies love playing fetch.", 0)
        self.args.c_threshold = 'mean'

        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                                 self.args.c_threshold)

        expected_centroid_score = 6.3
        c_score = selector.get_centroid_score(sent_1, centroid)

        self.assertAlmostEqual(expected_centroid_score, c_score, 1)
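get_centroid_score is not shown on this page; in the MEAD paper the centroid score of a sentence is the sum of the centroid values of its words, so a sketch consistent with the test above (assuming Sentence exposes a tokens list) could be:

    def get_centroid_score(self, sentence, centroid):
        # Sum the centroid value of every word in the sentence (sketch of the
        # MEAD centroid score; word ids index into the centroid array).
        score = 0.0
        for token in sentence.tokens:
            word_id = WordMap.id_of(token)
            if word_id is not None:
                score += centroid[word_id]
        return score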
    def test_term_topics(self):
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

        puppy_topics = lda_model.get_term_topics(WordMap.id_of('puppy'), minimum_probability=0)
        war_topics = lda_model.get_term_topics(WordMap.id_of('war'), minimum_probability=0)
        puppy_dist = [prob[1] for prob in puppy_topics]
        war_dist = [prob[1] for prob in war_topics]

        puppy_war = puppy_dist[0] > war_dist[0] and puppy_dist[1] < war_dist[1]
        war_puppy = puppy_dist[0] < war_dist[0] and puppy_dist[1] > war_dist[1]

        self.assertTrue(puppy_war or war_puppy)
    def calculate_lda_scores(self, sentences, lda_model):
        """
        Calculate LDA scores for each of the given sentences
        :param sentences: the given set of sentences
        :param lda_model: the LDA model
        :return: list of sentences with lda_scores populated
        """
        for sentence in sentences:
            sen_tdf = Vectors().create_term_sen_freq(sentence.tokens)
            lda_scores = lda_model.get_document_topics(sen_tdf,
                                                       minimum_probability=0)
            lda_arr = np.zeros(len(lda_scores))
            for topic_id, prob in lda_scores:
                lda_arr[topic_id] = prob

            sentence.lda_scores = lda_arr

        return sentences
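The loop above just densifies gensim's sparse (topic_id, probability) pairs into a fixed-length array; a toy illustration of that conversion with made-up probabilities:

import numpy as np

lda_scores = [(0, 0.1), (1, 0.7), (2, 0.2)]  # hypothetical gensim output
lda_arr = np.zeros(len(lda_scores))
for topic_id, prob in lda_scores:
    lda_arr[topic_id] = prob
print(lda_arr)  # [0.1 0.7 0.2]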
    def test_apply_redundancy_penalty(self):
        """
        Test the function to apply the redundancy penalty
        :return:
        """
        selector = MeadContentSelector()

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        selected = selector.select_content(self.doc_list, self.args, self.idf)
        selector.apply_redundancy_penalty(selected[0],
                                          selector.selected_content)
        scores = [s.mead_score for s in selector.selected_content]
        expected_scores = [
            1.9003829413846463, 1.6243717975775935, 0.6522065176000799,
            2.3571461578060453, 1.532600545620478, 1.7661796758000055
        ]

        self.assertEqual(scores, expected_scores)
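apply_redundancy_penalty is also not reproduced here; in MEAD the penalty grows with the word overlap between a sentence and an already-chosen one, so a sketch in that spirit (assuming Sentence exposes tokens and mead_score; the project's exact weighting may differ) might look like:

    def apply_redundancy_penalty(self, chosen, sentences):
        # Reduce each sentence's MEAD score by its normalized token overlap
        # with the chosen sentence (sketch of the MEAD redundancy penalty).
        chosen_tokens = set(chosen.tokens)
        for sentence in sentences:
            tokens = set(sentence.tokens)
            overlap = len(chosen_tokens & tokens)
            longest = max(len(chosen_tokens), len(tokens))
            if longest:
                sentence.mead_score -= overlap / longest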
Example #19
def load_documents_for_topics(topic_soup):
    """
    Load documents for each topic
    :param topic_soup:
    :return:
    """
    topics = {}
    for topic in topic_soup.find_all('topic'):
        documents = load_documents(topic)
        topics[topic['id']] = documents

    # At this point, all docs have been loaded and all unique words are stored in WordMap set
    # Need to trigger creation of mapping and of vectors
    WordMap.create_mapping()
    vec = Vectors()
    vec.create_freq_vectors(topics)  # TODO: only needed when running MEAD-based content selection
    vec.create_term_doc_freq(topics)

    return topics
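A hypothetical invocation, assuming the topic file is XML with <topic id="..."> elements and is parsed with BeautifulSoup (the real schema and file name may differ):

from bs4 import BeautifulSoup

with open('topics.xml') as f:  # hypothetical file name
    topic_soup = BeautifulSoup(f.read(), 'html.parser')

topics = load_documents_for_topics(topic_soup)
for topic_id, documents in topics.items():
    print(topic_id, len(documents))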
Example #20
    def test_get_topic_matrix(self):
        # make sure all sentences from all topic docs make it into the matrix
        topic_one_matrix = Vectors().get_topic_matrix(self.topic_one)
        expected_num_sentences = 6
        self.assertEqual(expected_num_sentences,
                         topic_one_matrix.get_shape()[0])