Esempio n. 1
0
    def test_create_mapping(self):
        """
        Check that WordMap.create_mapping gives every known word its own id
        :return:
        """

        Preprocessor.load_models()

        # start from an empty vocabulary so earlier tests cannot leak words in
        WordMap.word_set = set()
        WordMap.word_to_id = {}

        # NOTE(review): the Document instances are discarded, so constructing
        # them presumably registers their words with WordMap - confirm
        Document("TST_ENG_20190101.0001")
        Document("TST_ENG_20190101.0002")

        WordMap.create_mapping()
        mapping = WordMap.get_mapping()

        # each word in word_set got added to the dictionary
        # NOTE(review): self.word_set is expected to be defined on the test class
        self.assertCountEqual(self.word_set, mapping.keys())

        # each id value in the dict is unique; compare the values, because the
        # (key, value) pairs of a dict are always distinct and would never fail
        ids = list(mapping.values())
        self.assertEqual(len(ids), len(set(ids)))
Esempio n. 2
0
class DocumentTests(unittest.TestCase):
    """
    Unit tests for doc id parsing and headline loading in the Document class
    """

    Preprocessor.load_models()

    def _check_parsed_id(self, raw_id, src, lang, date, art_id):
        """
        Build a Document from raw_id and compare every parsed id component
        :param raw_id: document id handed to Document
        :param src: expected value of Document.src
        :param lang: expected value of Document.lang
        :param date: expected value of Document.date
        :param art_id: expected value of Document.art_id
        :return:
        """
        parsed = Document(raw_id)
        self.assertEqual(parsed.src, src)
        self.assertEqual(parsed.lang, lang)
        self.assertEqual(parsed.date, date)
        self.assertEqual(parsed.art_id, art_id)
        # the full id is kept verbatim
        self.assertEqual(parsed.docid, raw_id)

    def test_parse_doc_id(self):
        """
        Id that spells out the language segment
        :return:
        """
        self._check_parsed_id("TST_ENG_20190101.0001",
                              'TST', '_ENG', '20190101', '0001')

    def test_parse_doc_id2(self):
        """
        Id without a language segment; lang is still expected to be '_ENG'
        :return:
        """
        self._check_parsed_id("TST20190201.0001",
                              'TST', '_ENG', '20190201', '0001')

    def test_document_headline(self):
        """
        Headline of the first test document
        :return:
        """
        headline = Document("TST_ENG_20190101.0001").headline
        self.assertEqual(headline, "Puppies play fetch in the park")

    def test_document_headline2(self):
        """
        Headline of the second test document
        :return:
        """
        headline = Document("TST_ENG_20190101.0002").headline
        self.assertEqual(headline, "Playing in the dog park")
    def test_process_sentence(self):
        """
        Check the features a Sentence exposes after preprocessing
        :return:
        """
        Preprocessor.load_models()
        test_sentence = "In a park somewhere, a bunch of puppies played fetch with their owners today."
        doc_id = "TST_ENG_20190101.0001"
        s = Sentence(test_sentence, 0, doc_id)

        # feature order: tokens, word count, first-sentence flag, position, document id
        features = [
            s.tokenized(),
            s.word_count(),
            s.is_first_sentence(),
            s.position(),
            s.document_id()
        ]
        expected_features = [['park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'owner', 'today'],
                             14, True, 0, 'TST_ENG_20190101.0001']

        # compare position by position: an order-insensitive comparison would
        # accept swapped values, since False == 0 and True == 1 in Python
        self.assertListEqual(features, expected_features)
Esempio n. 4
0
def main():
    """
    Entry point: build one summary per topic and write each to its own output file
    :return:
    """
    args = parse_args(sys.argv[1:])

    # spacy en model, used later for tokenization, stemming and NER
    Preprocessor.load_models()

    # topic file -> {topic id: documents}
    topics = load_documents_for_topics(make_soup(args.topic_file))

    # computed by the first mead/melda summarizer, then reused for later topics
    idf = None

    for topic_id, documents in topics.items():
        version = args.version
        if version == 'lead':
            summarizer = LeadSummaryGenerator(documents, LeadSentenceSelector(), args)
        elif version in ('mead', 'melda'):
            if version == 'mead':
                summarizer = MeadSummaryGenerator(documents, MeadContentSelector(), args)
            else:
                summarizer = MeldaSummaryGenerator(documents, MeldaContentSelector(), args)
            if idf is None:
                idf = summarizer.get_idf_array()
        else:
            # any other version falls back to the base generator
            summarizer = BaseSummaryGenerator(documents, BaseContentSelector(), args)

        # the file is opened before the summary is generated
        with open(get_output_filename(topic_id, args), "w") as out_file:
            print(summarizer.generate_summary(idf), file=out_file)
Esempio n. 5
0
class LeadSentenceSelectorTests(unittest.TestCase):
    """
    Unit tests for the lead-sentence content selector
    """

    Preprocessor.load_models()

    def test_select_content(self):
        """
        The selector should end up holding the expected sentence of each document
        :return:
        """
        # (document id, sentence expected to be selected from that document)
        leads = [
            ('TST_ENG_20190101.0001',
             'In a park somewhere, a bunch of puppies played fetch with their owners today.'),
            ('TST_ENG_20190101.0002',
             'I took my small puppy to the dog park today.'),
        ]

        selector = LeadSentenceSelector()
        documents = [Document(doc_id) for doc_id, _ in leads]
        expected_sentences = [Sentence(text, 1, doc_id) for doc_id, text in leads]

        selector.select_content(documents, [])

        # order of the selected sentences is not part of this check
        self.assertCountEqual(expected_sentences, selector.selected_content)
Esempio n. 6
0
class IOTests(unittest.TestCase):
    """
    Tests for file IO operations
    """

    Preprocessor.load_models()

    def test_get_documents_for_topics(self):
        """
        Every topic in the test topic file maps to the expected documents
        :return:
        """
        topic_soup = make_soup('test_data/test_topics.xml')
        expected_doc_ids = {
            'PUP1A': [
                'TST_ENG_20190101.0001',
                'TST_ENG_20190101.0002',
                'TST20190201.0001',
                'TST20190201.0002'
            ],
            'WAR2A': [
                'TST_ENG_20190301.0001',
                'TST_ENG_20190301.0002',
                'TST20190401.0001',
                'TST20190401.0002'
            ]
        }
        topics = load_documents_for_topics(topic_soup)

        # assertCountEqual on two dicts only looks at their keys, so the
        # documents are checked explicitly, topic by topic, through their ids
        self.assertCountEqual(topics.keys(), expected_doc_ids.keys())
        for topic_id, doc_ids in expected_doc_ids.items():
            loaded_ids = [doc.docid for doc in topics[topic_id]]
            self.assertCountEqual(loaded_ids, doc_ids)

    def test_get_output_filename(self):
        """
        Output filename built from a topic id and the parsed arguments
        :return:
        """
        topic_id = 'PUP1A'
        args = parse_args([
            'test_data/test_topics.xml', 'test', '--output_dir',
            '../outputs/D0/'
        ])
        output_file = get_output_filename(topic_id, args)

        self.assertEqual(output_file,
                         '../outputs/D0/PUP1-A.M.100.A.test-B-max-111')

    def test_argparse(self):
        """
        The parser exposes the expected number of arguments
        :return:
        """
        args = parse_args(['test_data/test_topics.xml', 'test'])

        # vars() is the public way to read the namespace attributes
        self.assertEqual(len(vars(args)), 10)
Esempio n. 7
0
class VectorsTests(unittest.TestCase):
    """
    Unit tests for the Vectors helper
    """

    # fixtures shared by every test; built once when the class body executes
    Preprocessor.load_models()
    topics = {
        1: [
            Document('TST_ENG_20190101.0001'),
            Document('TST_ENG_20190101.0002'),
        ]
    }
    WordMap.create_mapping()
    mapping = WordMap.get_mapping()
    topic_one = topics.get(1)  # list of Documents

    def test_create_freq_vectors(self):
        """
        Each document matrix has one row per sentence
        :return:
        """
        Vectors().create_freq_vectors(self.topics)
        all_docs = (doc for docs in self.topics.values() for doc in docs)
        for doc in all_docs:
            # 3 rows expected for each test document
            self.assertEqual(doc.vectors.get_shape()[0], 3)

    def test_sentence_vector(self):
        """
        A sentence vector has a positive count for every token of the sentence
        :return:
        """
        # NOTE(review): reads sentence.vector, which this test does not create
        # itself - presumably set by create_freq_vectors in an earlier test
        sentence = self.topics.get(1)[1].sens[1]  # a Sentence object
        # sentence text: 'He loves playing so he liked to run around with the other dogs playing fetch.'
        playing_id = WordMap.id_of('playing')
        self.assertEqual(sentence.vector.getcol(playing_id).sum(), 1)
        for token in sentence.tokens:
            token_id = WordMap.id_of(token)
            self.assertGreater(sentence.vector.getcol(token_id).sum(), 0)

    def test_get_topic_matrix(self):
        """
        All sentences from all topic docs make it into the topic matrix
        :return:
        """
        matrix = Vectors().get_topic_matrix(self.topic_one)
        # 6 sentences expected across the two test documents
        self.assertEqual(6, matrix.get_shape()[0])
Esempio n. 8
0
class MeadContentSelectorTests(unittest.TestCase):
    """
    Tests for MeadContentSelector
    """

    # variables used in multiple tests; they are built once, when the class
    # body executes, and the same objects are shared by every test method
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_2 = Document("TST_ENG_20190101.0002")
    doc_list = [doc_1, doc_2]
    topics = {'PUP1A': [doc_1, doc_2]}

    # vocabulary installed into WordMap by the tests below
    w_set = {
        'he', 'owner', 'i', 'play', 'big', 'chase', 'fetch', 'park', 'dog',
        'fun', 'toy', 'tongue', 'take', 'ran', 'in', 'sun', 'love',
        'somewhere', 'many', 'together', 'around', 'puppy', 'today', 'load',
        'fight', 'small', "n't", '-PRON-', 'wag', 'hang', 'loads', 'bunch',
        'get', 'playing', 'they', 'like', 'tail', 'run', 'there'
    }

    # fixed word -> id mapping for the tests that need stable ids
    w_map = {
        'he': 1,
        'owner': 2,
        'i': 3,
        'play': 4,
        'big': 5,
        'chase': 6,
        'fetch': 7,
        'park': 8,
        'dog': 9,
        'fun': 10,
        'toy': 11,
        'tongue': 12,
        'take': 13,
        'ran': 14,
        'in': 15,
        'sun': 16,
        'love': 17,
        'somewhere': 18,
        'many': 19,
        'together': 20,
        'around': 21,
        'puppy': 22,
        'today': 23,
        'load': 24,
        'fight': 25,
        'small': 26,
        "n't": 27,
        '-PRON-': 28,
        'wag': 29,
        'hang': 30,
        'loads': 31,
        'bunch': 32,
        'get': 33,
        'playing': 34,
        'they': 35,
        'like': 36,
        'tail': 37,
        'run': 38,
        'there': 39
    }

    # idf values handed to the selector in place of a computed idf array
    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.c_threshold = 'min'

    def test_get_sentence_position(self):
        """
        Position score of a sentence at index 0 and at index 50 out of 100
        :return:
        """
        selector = MeadContentSelector()
        sentence_1 = Sentence("Here is a test sentence.", 0)
        sentence_2 = Sentence("Here is another one.", 50)

        pos_score_1 = selector.get_sentence_position(sentence_1, 100)
        pos_score_2 = selector.get_sentence_position(sentence_2, 100)

        expected_score_1 = 1
        expected_score_2 = 50 / 100

        self.assertEqual(expected_score_1, pos_score_1)
        self.assertEqual(expected_score_2, pos_score_2)

    def test_get_cluster_centroid(self):
        """
        Number of non-zero entries in the cluster centroid
        :return:
        """
        selector = MeadContentSelector()
        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                                 self.args.c_threshold)

        actual_non_zero = np.count_nonzero(centroid)
        should_be_non_zero = 29

        self.assertEqual(actual_non_zero, should_be_non_zero)

    def test_get_centroid_score(self):
        """
        Centroid score of a short sentence against the cluster centroid
        :return:
        """
        selector = MeadContentSelector()
        sent_1 = Sentence("Puppies love playing fetch.", 0)
        # NOTE(review): args is a class attribute, so this assignment stays in
        # effect for every test that runs afterwards; the expected values of
        # the other tests may depend on the execution order - confirm
        self.args.c_threshold = 'mean'

        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                                 self.args.c_threshold)

        expected_centroid_score = 6.3
        c_score = selector.get_centroid_score(sent_1, centroid)

        # compared to one decimal place
        self.assertAlmostEqual(expected_centroid_score, c_score, 1)

    def test_apply_redundancy_penalty(self):
        """
        Test the function to apply the redundancy penalty
        :return:
        """
        selector = MeadContentSelector()

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        selected = selector.select_content(self.doc_list, self.args, self.idf)
        selector.apply_redundancy_penalty(selected[0],
                                          selector.selected_content)
        scores = [s.mead_score for s in selector.selected_content]
        expected_scores = [
            1.9003829413846463, 1.6243717975775935, 0.6522065176000799,
            2.3571461578060453, 1.532600545620478, 1.7661796758000055
        ]

        # the scores are computed floats, so compare them with a tolerance
        # rather than bit for bit
        self.assertEqual(len(scores), len(expected_scores))
        for actual, expected in zip(scores, expected_scores):
            self.assertAlmostEqual(actual, expected)

    def test_select_content(self):
        """
        Top ranked sentence and its MEAD score (rounded to 5 decimals)
        :return:
        """
        selector = MeadContentSelector()
        Vectors().create_freq_vectors(self.topics)
        selected = selector.select_content(self.topics['PUP1A'], self.args,
                                           self.idf)
        top_sentence = selected[0]
        expected_top_sentence = 'In a park somewhere, a bunch of ' \
                                'puppies played fetch with their owners today.'

        top_mead_score = float("{:.5f}".format(top_sentence.mead_score))
        expected_top_mead_score = 2.40038

        self.assertEqual(top_sentence.raw_sentence, expected_top_sentence)
        self.assertEqual(top_mead_score, expected_top_mead_score)
class MeldaSentenceCompressionTests(unittest.TestCase):
    """
    Tests for sentence compression in MeldaContentRealizer
    """
    Preprocessor.load_models()

    s0 = Sentence(
        "In a park somewhere, a bunch of puppies played fetch with their owners today.",
        1)
    s1 = Sentence("I took my small puppy to the dog park today.", 1)
    s2 = Sentence(
        "He loves playing so he liked to run around with the other dogs playing fetch.",
        1)
    s3 = Sentence("Puppies love playing fetch.", 1)

    input_summary = [s0, s1, s2, s3]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.n = 1

    # one realizer instance shared by all tests
    realizer = MeldaContentRealizer()

    def _compress(self, raw_text):
        """
        Compress one raw sentence and return the joined compressed text
        :param raw_text: text of the sentence to compress
        :return: compressed sentences joined with newlines
        """
        compressed = self.realizer.compress_sentences([Sentence(raw_text, 1)])
        return "\n".join([sentence.compressed for sentence in compressed])

    def test_remove_adverbs(self):
        """
        Adverbs are dropped
        :return:
        """
        summary = self._compress("Puppies love running quickly and playing loudly.")
        self.assertEqual(summary, "Puppies love running and playing.")

    def test_remove_initial_conj(self):
        """
        A sentence-initial conjunction is dropped
        :return:
        """
        summary = self._compress("But, puppies are great.")
        self.assertEqual(summary, "Puppies are great.")

    def test_remove_parens(self):
        """
        Parenthesized material is dropped
        :return:
        """
        summary = self._compress("The puppy (aka Mr. Mayor) was the cutest.")
        self.assertEqual(summary, "The puppy was the cutest.")

    def test_remove_appositives(self):
        """
        Appositive phrases are dropped
        :return:
        """
        summary = self._compress(
            "Dennis, the cutest puppy in the park, ran towards the ball.")
        self.assertEqual("Dennis ran towards the ball.", summary)

    def test_remove_junk(self):
        """
        A leading dateline is dropped
        :return:
        """
        summary = self._compress("Seattle, WA --- Puppies are great.")
        self.assertEqual(summary, "Puppies are great.")

    def test_remove_attributions(self):
        """
        A short attribution ("X said that") is dropped
        :return:
        """
        summary = self._compress("Julia said that puppies are cute.")
        self.assertEqual(summary, "Puppies are cute.")

    def test_remove_attribution_phrases(self):
        """
        A long attribution phrase is dropped
        :return:
        """
        summary = self._compress(
            "Seattle State Bureau of Animal Rating said "
            "in a press release that puppies are cute.")
        self.assertEqual(summary, "Puppies are cute.")

    def test_remove_temporal_mod(self):
        """
        A leading temporal modifier is dropped
        :return:
        """
        summary = self._compress(
            "By 8 a.m. on Saturday the park was full of puppies.")
        self.assertEqual(summary, "The park was full of puppies.")

    def test_remove_mod_rel(self):
        """
        Attribution and temporal modifier are dropped together
        :return:
        """
        summary = self._compress(
            "Joe said that by 8 a.m. on Saturday the park was full of puppies.")
        self.assertEqual(summary, "The park was full of puppies.")

    def test_bad(self):
        """
        A long sentence that combines several compressible constructions
        :return:
        """
        summary = self._compress(
            "Heilongjiang Provincial Bureau of Environmental Protection said in a press release that by 6 a.m. on Saturday, concentration of nitrobenzene monitored at Sujiatun upstream Sifangtai, one major water intake spot of Harbin, capital of northeast China's Heilongjiang Province, fell to 0.0793 mg per liter, but above the state safety standard of 0.017 mg per liter, but the density of benzene stood at 0.0011 mg per liter, which is within   the state safety benchmark.")

        expected = "Concentration of nitrobenzene monitored at Sujiatun upstream Sifangtai fell, " \
                   "but above the state safety standard, but the density of benzene stood, " \
                   "which is within the state safety benchmark."
        self.assertEqual(summary, expected)

    def test_remove_npadvmod(self):
        """
        A bare temporal noun inside an attribution is dropped
        :return:
        """
        summary = self._compress(
            "Joe said Saturday that the park was full of puppies.")
        self.assertEqual(summary, "The park was full of puppies.")
Esempio n. 10
0
class MeldaContentSelectorTests(unittest.TestCase):
    """
    Tests for MeldaContentSelector
    """

    # variables used in multiple tests; built once when the class body executes
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_3 = Document("TST_ENG_20190301.0001")
    doc_list = [doc_1, doc_3]
    topics = {'PUPWAR': doc_list}

    # vocabulary installed into WordMap by each test
    w_set = {'park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'their', 'owner', 'today', 'they', 'all', 'run',
             'around', 'their', 'tail', 'wag', 'tongue', 'hang', 'out', 'have', 'load', 'fun', 'sun', 'love', 'our',
             'country', 'go', 'war', 'soldier', 'go', 'fight', 'travel', 'wherever', 'fight', 'enemy', 'try', 'kill',
             'before', 'get', 'kill', 'themselves', '-PRON-', 'playing'}

    # idf values handed to the selector in place of a computed idf array
    idf = [4.032940937780854, 2.420157081061118, 1.3730247377110034,
           2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
           3.25478968739721, 2.7107216430469343, 3.7319109421168726,
           4.032940937780854, 3.3339709334448346, 4.032940937780854,
           1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
           2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
           1.5660733174267443, 2.024340766018936, 1.2476111027700865,
           4.032940937780854, 0.9959130580250786, 3.7319109421168726,
           2.5415792439465807, 1.7107216430469343, 4.032940937780854,
           3.4308809464528913, 4.032940937780854, 3.4308809464528913,
           3.5558196830611912, 3.5558196830611912, 4.032940937780854,
           1.734087861371147, 3.0786984283415286, 0.9055121599292547,
           3.5558196830611912, 3.5558196830611912, 1.9876179589941962]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.lda_topics = 2

    def test_document_topics(self):
        """
        The topic distribution of a token list has one entry per LDA topic
        and sums to (roughly) 1
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)
        testtok = ['puppy', 'soldier', 'war', 'fetch']
        testsen = Vectors().create_term_sen_freq(testtok)
        document_topics = lda_model.get_document_topics(testsen, minimum_probability=0)
        # each entry is indexed as (topic, probability); keep the probability
        topic_dist = [prob[1] for prob in document_topics]

        self.assertEqual(len(topic_dist), self.args.lda_topics)
        # assertAlmostEquals is a deprecated alias that Python 3.12 removed
        self.assertAlmostEqual(sum(topic_dist), 1, 2)

    def test_term_topics(self):
        """
        'puppy' and 'war' lean towards different LDA topics
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

        puppy_topics = lda_model.get_term_topics(WordMap.id_of('puppy'), minimum_probability=0)
        war_topics = lda_model.get_term_topics(WordMap.id_of('war'), minimum_probability=0)
        puppy_dist = [prob[1] for prob in puppy_topics]
        enemy_dist = [prob[1] for prob in war_topics]

        # which topic index ends up as the "puppy" topic is not fixed,
        # so either assignment is accepted
        puppy_war = puppy_dist[0] > enemy_dist[0] and puppy_dist[1] < enemy_dist[1]
        war_puppy = puppy_dist[0] < enemy_dist[0] and puppy_dist[1] > enemy_dist[1]

        self.assertTrue(puppy_war or war_puppy)

    def test_get_lda_scores(self):
        """
        A sentence gets one LDA score per topic and the scores sum to (roughly) 1
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

        sentence = self.doc_list[0].sens[0]
        selector.calculate_lda_scores([sentence], lda_model)
        lda_scores = sentence.lda_scores

        self.assertEqual(len(lda_scores), self.args.lda_topics)
        self.assertAlmostEqual(sum(lda_scores), 1, 2)

    def test_get_melda_scores(self):
        """
        A sentence gets one MELDA score per LDA topic
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

        sentence = self.doc_list[0].sens[0]
        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        selector.calculate_lda_scores(sentences, lda_model)
        selector.calculate_melda_scores(sentences)
        melda_scores = sentence.melda_scores

        self.assertEqual(len(melda_scores), self.args.lda_topics)

    def test_get_top_n(self):
        """
        Selecting the top 1 sentence per topic yields lda_topics sentences
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        sentences = selector.calculate_lda_scores(sentences, lda_model)
        sentences = selector.calculate_melda_scores(sentences)
        selector.select_top_n(sentences, self.args.lda_topics, 1)

        self.assertEqual(len(selector.selected_content), self.args.lda_topics)
Esempio n. 11
0
class MeadSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for MeadSummaryGenerator
    """

    # variables used in multiple tests; built once when the class body executes
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_2 = Document("TST_ENG_20190101.0002")
    doc_list = [doc_1, doc_2]
    topics = {'PUP1A': [doc_1, doc_2]}

    # vocabulary installed into WordMap by the tests below
    w_set = {
        'he', 'owner', 'i', 'play', 'big', 'chase', 'fetch', 'park', 'dog',
        'fun', 'toy', 'tongue', 'take', 'ran', 'in', 'sun', 'love',
        'somewhere', 'many', 'together', 'around', 'puppy', 'today', 'load',
        'fight', 'small', "n't", '-PRON-', 'wag', 'hang', 'loads', 'bunch',
        'get', 'playing', 'they', 'like', 'tail', 'run', 'there'
    }

    # idf values handed to the generator in place of a computed idf array
    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    WordMap.reset()

    @staticmethod
    def _load_all_topics():
        """
        Build fresh Document objects for both test topics
        :return: dict mapping topic id to its list of Documents
        """
        return {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }

    def test_order_information(self):
        """
        Test ordering Sentences by MEAD score
        :return:
        """
        doc_id_1 = 'TST_ENG_20190101.0001'
        sentence_1 = 'Puppies love playing fetch.'
        sentence_2 = 'They all ran around with their tails wagging ' \
                     'and their tongues hanging out having loads of fun in the sun.'
        sentence_3 = "He loves playing so he liked to run around with the other dogs playing fetch."
        expected_info = [
            Sentence(sentence_1, 1, doc_id_1),
            Sentence(sentence_3, 3, doc_id_1),
            Sentence(sentence_2, 2, doc_id_1)
        ]

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                         self.args)
        generator.select_content(self.idf)
        generator.order_information()

        # only the first three ordered sentences are checked
        first_sentences = generator.content_selector.selected_content[:3]

        self.assertListEqual(expected_info, first_sentences)

    def test_realize_content(self):
        """
        Test applying redundancy penalty during realize_content
        :return:
        """
        expected_content = "I took my small puppy to the dog park today.\n" \
                           "In a park somewhere, a bunch of puppies played fetch with their owners today.\n" \
                           "There were many bigger puppies but he didn't get in a fight with any of them, " \
                           "they just played together with their toys and chased each other.\n" \
                           "They all ran around with their tails wagging and their tongues hanging out having " \
                           "loads of fun in the sun.\n" \
                           "He loves playing so he liked to run around with the other dogs playing fetch.\n" \
                           "Puppies love playing fetch."

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                         self.args)
        generator.select_content(self.idf)
        generator.order_information()
        realized_content = generator.realize_content()
        self.assertEqual(expected_content, realized_content)

    def test_get_idf_array(self):
        """
        idf values (formatted to 5 decimals) for a fixed word -> id mapping
        :return:
        """
        words = [
            "i", "eat", "cake", "is", "delicious", "puppies", "are", "cute",
            "cats", "furry", "bank", "company", "sugar", "dollar", "however",
            "say"
        ]
        # Must override WordMap dictionary for test
        WordMap.word_to_id = {
            'delicious': 0,
            'eat': 1,
            'furry': 2,
            'puppies': 3,
            'i': 4,
            'cats': 5,
            'are': 6,
            'is': 7,
            'cute': 8,
            'cake': 9,
            'bank': 10,
            'company': 11,
            'sugar': 12,
            'dollar': 13,
            'however': 14,
            'say': 15
        }

        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        scores = ["{:.5f}".format(idf[WordMap.id_of(word)]) for word in words]

        expected_scores = [
            '2.69897', '0.80688', '1.49485', '2.69897', '2.69897', '2.69897',
            '2.69897', '1.92082', '2.69897', '2.69897', '1.04576', '0.65365',
            '1.44370', '0.98297', '0.24718', '0.10018'
        ]

        # the scores are already formatted strings, so a plain list comparison
        # is enough (the third positional argument of assertListEqual is the
        # failure message, not a precision)
        self.assertListEqual(scores, expected_scores)

    def test_mead_summary_length(self):
        """
        Test length of summary is at most 100 words
        :return:
        """
        topics = self._load_all_topics()
        WordMap.create_mapping()
        vec = Vectors()
        vec.create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()
        max_length = 100

        for topic_id, documents in topics.items():
            generator = MeadSummaryGenerator(documents, MeadContentSelector(),
                                             self.args)
            generator.select_content(idf)
            generator.order_information()
            realized_content = generator.realize_content()
            # split() with no argument breaks on any whitespace, newlines
            # included, and drops empty strings, so this is the word count
            content_length = len(realized_content.split())
            self.assertLessEqual(content_length, max_length)

    def test_generate_summary(self):
        """
        generate_summary returns something for every topic
        :return:
        """
        topics = self._load_all_topics()
        WordMap.create_mapping()
        vec = Vectors()
        vec.create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        for topic_id, documents in topics.items():
            summarizer = MeadSummaryGenerator(documents, MeadContentSelector(),
                                              self.args)
            summary = summarizer.generate_summary(idf)
            self.assertIsNot(summary, None)
Esempio n. 12
0
class LeadSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for LeadSummaryGenerator

    Each test builds Document fixtures from the TST* test articles, runs
    select_content() and order_information() on a LeadSummaryGenerator and
    checks the selected or realized content.
    """

    # Executed once, when the class body is evaluated.
    Preprocessor.load_models()

    def test_order_information(self):
        """
        After selection and ordering, the selector's selected_content equals
        the two expected Sentence objects, in the expected order.
        """
        sentence_1 = 'Puppies are cute because many of them are small.'
        doc_id_1 = 'TST20190201.0001'
        sentence_2 = 'In a park somewhere, a bunch of puppies played fetch with their owners today.'
        doc_id_2 = 'TST_ENG_20190101.0001'
        expected_info = [
            Sentence(sentence_2, 1, doc_id_2),
            Sentence(sentence_1, 1, doc_id_1)
        ]

        # Same ids as the expected sentences above, so reuse the variables
        # instead of repeating the literals.
        documents = [Document(doc_id_2), Document(doc_id_1)]
        generator = LeadSummaryGenerator(documents, LeadSentenceSelector(), [])
        generator.select_content()
        generator.order_information()

        self.assertListEqual(expected_info,
                             generator.content_selector.selected_content)

    def test_realize_content(self):
        """
        realize_content() returns the four expected sentences joined by
        newlines.
        """
        documents = [
            Document('TST_ENG_20190101.0001'),
            Document('TST_ENG_20190101.0002'),
            Document('TST20190201.0001'),
            Document('TST20190201.0002')
        ]
        expected_content = "In a park somewhere, a bunch of puppies played fetch with their owners today.\n" \
                           "I took my small puppy to the dog park today.\n" \
                           "Puppies are cute because many of them are small.\n" \
                           "Puppies love to play with toys."

        generator = LeadSummaryGenerator(documents, LeadSentenceSelector(), [])
        generator.select_content()
        generator.order_information()
        realized_content = generator.realize_content()
        self.assertEqual(expected_content, realized_content)

    def test_lead_summary_length(self):
        """
        The realized summary for 32 input documents contains at most
        max_length whitespace-separated words.
        """
        doc_ids = (
            'TST_ENG_20190101.0001',
            'TST_ENG_20190101.0002',
            'TST20190201.0001',
            'TST20190201.0002',
        )
        # Eight passes over the same four articles: 32 separate Document
        # objects, in the same order as a hand-written list would give.
        documents = [Document(doc_id) for _ in range(8) for doc_id in doc_ids]
        max_length = 100

        generator = LeadSummaryGenerator(documents, LeadSentenceSelector(), [])
        generator.select_content()
        generator.order_information()
        realized_content = generator.realize_content()

        # Sentences are newline-joined (see expected_content in
        # test_realize_content), so splitting on " " alone would glue the
        # last word of one sentence to the first word of the next and
        # undercount.  split() with no argument breaks on any whitespace.
        content_length = len(realized_content.split())
        self.assertLessEqual(content_length, max_length)
Esempio n. 13
0
class MeldaSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for MeldaSummaryGenerator

    The class-level attributes below are fixtures shared by the test
    methods; they are built once, when the class body is executed.
    """

    # variables used in multiple tests
    # Models are loaded before the Document fixtures below are constructed.
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_3 = Document("TST_ENG_20190301.0001")
    doc_list = [doc_1, doc_3]
    # Single topic holding both fixture documents.
    topics = {'PUPWAR': doc_list}

    # Word set that the tests assign to WordMap.word_set before calling
    # WordMap.create_mapping().  A few words ('their', 'go', 'fight', 'kill')
    # are written twice in the literal; the set keeps one copy of each,
    # leaving 39 distinct words.
    w_set = {
        'park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'their',
        'owner', 'today', 'they', 'all', 'run', 'around', 'their', 'tail',
        'wag', 'tongue', 'hang', 'out', 'have', 'load', 'fun', 'sun', 'love',
        'our', 'country', 'go', 'war', 'soldier', 'go', 'fight', 'travel',
        'wherever', 'fight', 'enemy', 'try', 'kill', 'before', 'get', 'kill',
        'themselves', '-PRON-', 'playing'
    }

    # Precomputed idf values handed to select_content() / generate_summary()
    # by the tests.  39 entries, the same count as the distinct words in
    # w_set -- presumably one value per word id; TODO confirm.
    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    # Parsed command-line arguments passed to every generator under test;
    # lda_topics is overridden to 2 after parsing.
    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.lda_topics = 2

    def test_melda_info_ordering(self):
        """
        The object returned by select_content() has the same length before
        and after order_information() is called.
        """
        # Shared state the generator relies on: word mapping and vectors.
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)

        generator = MeldaSummaryGenerator(
            self.doc_list, MeldaContentSelector(), self.args)
        selected = generator.select_content(self.idf)

        size_before = len(selected)
        generator.order_information()
        size_after = len(selected)

        self.assertEqual(size_before, size_after)

    def test_melda_generate_summary(self):
        """
        generate_summary() returns something other than None for every topic
        in self.topics.

        Each topic runs inside its own subTest, so a failure names the
        offending topic and the remaining topics are still exercised.
        """
        # Shared state the generator relies on: word mapping and vectors.
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)

        for topic_id, documents in self.topics.items():
            with self.subTest(topic=topic_id):
                summarizer = MeldaSummaryGenerator(documents,
                                                   MeldaContentSelector(),
                                                   self.args)
                summary = summarizer.generate_summary(self.idf)
                self.assertIsNotNone(summary)

    def test_ifvalid_sent(self):
        """
        ifvalid_sent_reg() returns the expected 1/0 value for a handful of
        sample raw strings.
        """
        # Only one summarizer is needed: build it from the first topic's
        # documents and leave the loop immediately.
        for topic_id, documents in self.topics.items():
            summarizer = MeldaSummaryGenerator(documents,
                                               MeldaContentSelector(),
                                               self.args)
            break
        # NOTE(review): the rule behind the 1/0 results is defined by
        # ifvalid_sent_reg itself and is not visible here; the expectations
        # below simply pin its current behaviour.
        raw_sent1 = "--"
        self.assertEqual(summarizer.ifvalid_sent_reg(raw_sent1), 1)

        raw_sent2 = "---"
        self.assertEqual(summarizer.ifvalid_sent_reg(raw_sent2), 0)

        raw_sent3 = "-342--"
        self.assertEqual(summarizer.ifvalid_sent_reg(raw_sent3), 1)

        raw_sent4 = "-342dafd23480134"
        self.assertEqual(summarizer.ifvalid_sent_reg(raw_sent4), 0)

        raw_sent5 = "\n\nsafadj\n\n"
        self.assertEqual(summarizer.ifvalid_sent_reg(raw_sent5), 0)

        raw_sent6 = "-342dafd23480"
        self.assertEqual(summarizer.ifvalid_sent_reg(raw_sent6), 1)