Example #1
0
    def test_get_all_content_words_in_doc(self):
        """Content words are collected from every sentence, duplicates kept."""
        summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS)
        sentences = [
            Sentence("One two three.", Tokenizer("english")),
            Sentence("One two three.", Tokenizer("english")),
        ]
        document = build_document(sentences)

        content_words = summarizer._get_all_content_words_in_doc(document.sentences)
        # Count how often each returned word occurs.
        observed = {}
        for word in content_words:
            observed[word] = observed.get(word, 0) + 1
        self.assertEqual(observed, {"one": 2, "two": 2, "three": 2})
def test_single_sentence(summarizer):
    """Summarizing a one-sentence document must return exactly one sentence."""
    sentence = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    doc = build_document([sentence])

    summary = summarizer(doc, 10)

    assert len(summary) == 1
Example #3
0
    def test_single_sentence(self):
        """A single-sentence document summarizes to that one sentence."""
        sentence = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
        doc = build_document([sentence])
        summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS)

        summary = summarizer(doc, 10)
        self.assertEqual(len(summary), 1)
Example #4
0
def calculate_rouge_1_score(summary):
    """Score *summary* against three ideal summaries with ROUGE-1.

    Returns a list of four floats: one ROUGE-1 score per ideal summary,
    followed by their arithmetic mean.
    """
    ideal_summaries = [
        'Jason saw a nice weather forecast and went to the beach with his kids for 2 hours.',
        'Jason took the kids swimming at the beach on a sunny day.',
        'Jason decided to take the kids to the beach since it was a sunny day.',
    ]
    tokenizer = Tokenizer('english')
    summary_sentence = Sentence(summary, tokenizer)
    # One ROUGE-1 score per ideal summary.  The original looped over the
    # ideal summaries but ignored the loop variable in favor of a manual
    # index counter; iterate directly instead.
    scores = [
        rouge_1([summary_sentence], [Sentence(ideal, tokenizer)])
        for ideal in ideal_summaries
    ]
    # Average over however many ideal summaries there are (was hard-coded 3).
    scores.append(sum(scores) / len(scores))
    return scores
Example #5
0
def cutoffSentences(sentences, threshold=7):
    """Keep only sentences with more than *threshold* whitespace-separated words.

    Each surviving sentence is wrapped in a ``Sentence`` built with an English
    tokenizer.  Returns a tuple of ``Sentence`` objects.
    """
    tokenizer = Tokenizer("english")
    result = []
    for sentence in sentences:
        # ``unicode`` exists only in Python 2 (NameError on Python 3);
        # ``str`` is the Python 3 equivalent.
        text = str(sentence)
        if len(text.split()) > threshold:
            result.append(Sentence(text, tokenizer))

    return tuple(result)
Example #6
0
def test_stemmer():
    """Only a summarizer built with a stemmer should stem content words."""
    word = Sentence('testing', Tokenizer('english'))

    with_stemmer = _build_summarizer(EMPTY_STOP_WORDS, Stemmer('english'))
    without_stemmer = _build_summarizer(EMPTY_STOP_WORDS)

    assert with_stemmer._get_content_words_in_sentence(word) == ['test']
    assert without_stemmer._get_content_words_in_sentence(word) == ['testing']
Example #7
0
def test_compute_tf():
    """Term frequency is each word's count over the document's total word count."""
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("kicking soccer balls.", Tokenizer("english"))
    s1 = Sentence("eating chicken dumplings.", Tokenizer("english"))

    # Two distinct sentences: six words, each occurring once.
    freq = summarizer._compute_tf(build_document([s0, s1]).sentences)
    for word in ("kicking", "soccer", "balls", "eating", "chicken", "dumplings"):
        assert freq[word] == 1 / 6

    # s0 repeated: nine words total, s0's words occur twice each.
    freq = summarizer._compute_tf(build_document([s0, s0, s1]).sentences)
    for word in ("kicking", "soccer", "balls"):
        assert freq[word] == 2 / 9
    for word in ("eating", "chicken", "dumplings"):
        assert freq[word] == 1 / 9
Example #8
0
    def test_compute_tf(self):
        """Term frequency equals word count divided by the document word total."""
        summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS)
        s0 = Sentence("kicking soccer balls.", Tokenizer("english"))
        s1 = Sentence("eating chicken dumplings.", Tokenizer("english"))

        # Six words, each occurring exactly once.
        freq = summarizer._compute_tf(build_document([s0, s1]).sentences)
        expected = dict.fromkeys(
            ("kicking", "soccer", "balls", "eating", "chicken", "dumplings"), 1 / 6)
        for word, value in expected.items():
            self.assertEqual(freq[word], value)

        # Nine words: s0's words appear twice each, s1's once each.
        freq = summarizer._compute_tf(build_document([s0, s0, s1]).sentences)
        expected = {"kicking": 2 / 9, "soccer": 2 / 9, "balls": 2 / 9,
                    "eating": 1 / 9, "chicken": 1 / 9, "dumplings": 1 / 9}
        for word, value in expected.items():
            self.assertEqual(freq[word], value)
Example #9
0
def test_tf_idf_metric_should_be_real_number():
    """https://github.com/miso-belica/sumy/issues/41"""
    summarizer = KLSummarizer()
    sentence = Sentence("There are five words, jop.", Tokenizer("english"))

    frequencies = summarizer.compute_tf([sentence])

    # Five distinct words, each contributing exactly one fifth.
    expected = dict.fromkeys(("there", "are", "five", "words", "jop"), 0.2)
    assert frequencies == expected
def get_lexrank(tweets):
    """Run LexRank over *tweets* and return one ranked entry per input tweet."""
    language = "english"
    sentences = [Sentence(tweet, TwokenizeWrapper()) for tweet in tweets]
    document = ObjectDocumentModel([Paragraph(sentences)])

    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    lex_ranked = summarizer(document, len(sentences))
    if len(sentences) != len(lex_ranked):
        print('lr error')
    # NOTE(review): indexing lex_ranked by Sentence objects only works if the
    # summarizer returns a mapping keyed by sentence — confirm with Summarizer.
    return [lex_ranked[sentence] for sentence in sentences]
Example #11
0
def test_compute_ratings():
    """Ratings follow SumBasic's frequency-discounted selection order."""
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)

    first_batch = [
        Sentence("Dog cat fish.", Tokenizer("english")),
        Sentence("Dog cat camel.", Tokenizer("english")),
        Sentence("Fish frog horse.", Tokenizer("english")),
    ]
    ratings = summarizer._compute_ratings(build_document(first_batch).sentences)
    assert ratings[first_batch[0]] == 0
    assert ratings[first_batch[1]] == -2
    assert ratings[first_batch[2]] == -1

    # Due to the frequency discounting, after finding sentence s0,
    # s2 should come before s1 since only two of its words get discounted
    # rather than all 3 of s1's.
    second_batch = [
        Sentence("one two three", Tokenizer("english")),
        Sentence("one two four", Tokenizer("english")),
        Sentence("three five six", Tokenizer("english")),
    ]
    ratings = summarizer._compute_ratings(build_document(second_batch).sentences)
    assert ratings[second_batch[0]] == 0
    assert ratings[second_batch[1]] == -2
    assert ratings[second_batch[2]] == -1
Example #12
0
def main():
    """Summarize every article with several sumy summarizers and save ROUGE scores.

    Reads ``./config.yml`` for the dataset location, language and output
    folder; writes per-method ROUGE means and standard errors as CSV files and
    all generated summaries as JSON into the configured output folder.
    """
    print("\n\t\t SUMMARIZATION REVIEW\t\t\n")
    print('[INFO] Loading configuration')
    with open("./config.yml", 'r') as file:
        config_var = safe_load(file)["main"]

    data = load_clean_data(path_to_file=str(config_var['dataset_folder']) + "/" +
                           str(config_var['data_to_use']))

    print("[INFO] Training sentence tokenizer for summary on all articles.")
    punkt_tokenizer = PunktSentenceTokenizer(
        train_text="\n".join(data["Paragraphs_as_string"]))

    # Use the average reference-summary length (in sentences) as the target
    # length for every generated summary.
    len_sum = np.mean(data["Summary"].apply(lambda x: len(punkt_tokenizer.tokenize(x))))
    print("[INFO] Average number of sentences in article summaries", len_sum)
    print("[COMMENT] Considering this value as reference to generate the automatic summaries.")
    len_sum = int(len_sum)

    # Fixed message: was "...Using english" + "stenner" (typo, missing space).
    print("[INFO] Using " + str(config_var['language']) + " stemmer")
    stemmer = Stemmer(config_var['language'])
    print("[INFO] Preparing summarizers")
    summarizer_dict = {"LSA": LsaSummarizer(stemmer),
                       "Luhn": LuhnSummarizer(stemmer),
                       "LexRank": LexRankSummarizer(stemmer),
                       "SumBasics": SumBasicSummarizer(stemmer),
                       "Random": RandomSummarizer(stemmer),
                       "Reduction": ReductionSummarizer(stemmer)}

    print("[INFO] Preparing stopwords.")
    for summarizer in summarizer_dict.values():
        summarizer.stop_words = get_stop_words('english')

    print("[INFO] Summaries preparation")
    dict_res = {}
    dict_summs = {}
    for name, summarizer in summarizer_dict.items():
        print("[INFO] Method:", name)
        results_rouge_1 = []
        results_rouge_2 = []
        results_rouge_l_1 = []
        results_rouge_l_2 = []
        sums = {}
        for i in progressbar.progressbar(range(len(data))):
            article = data["Paragraphs_as_string"][i]
            summary = data["Summary"][i]

            parser = PlaintextParser.from_string(
                article, tokenizer=Tokenizer('english'))

            # Run the summarizer ONCE per article.  The original called it
            # twice (once for Sentence objects, once for strings), doubling
            # the work and producing inconsistent summaries for
            # non-deterministic methods such as RandomSummarizer.
            summaries = list(summarizer(parser.document, len_sum))
            # Keyed by index because some documents may be skipped.
            sums[i] = " ".join(str(sentence) for sentence in summaries)

            # sumy's evaluation functions require Sentence objects.
            reference_sentences = [
                Sentence(sent, tokenizer=Tokenizer("english"))
                for sent in punkt_tokenizer.tokenize(summary)
            ]

            # Each metric may fail on degenerate input (e.g. sentences too
            # short in the summary); record NaN for that article and keep
            # going.  Narrowed the original bare ``except:`` clauses.
            for metric, results in (
                    (rouge_1, results_rouge_1),
                    (rouge_2, results_rouge_2),
                    (rouge_l_sentence_level, results_rouge_l_1),
                    (rouge_l_summary_level, results_rouge_l_2)):
                try:
                    results.append(metric(evaluated_sentences=summaries,
                                          reference_sentences=reference_sentences))
                except Exception:
                    results.append(np.nan)

        # Save results and progress to the next summarizer.
        dict_res[name] = {
            "Rouge_1": results_rouge_1,
            "Rouge_2": results_rouge_2,
            "Rouge_L_sentence_level": results_rouge_l_1,
            "Rouge_L_summary_level": results_rouge_l_2
        }
        # Save summaries to dictionary.
        dict_summs[name] = sums

    print("[INFO] Summaries and evaluations completed.")
    print("[INFO] Saving data to output.")
    # Mean and standard error of every metric, one column per summarizer.
    res_mean = pd.DataFrame(columns=dict_res.keys())
    res_se = pd.DataFrame(columns=dict_res.keys())
    for col in res_mean:
        res_mean[col] = pd.Series(
            {key: np.nanmean(value)
             for key, value in dict_res[col].items()})
        res_se[col] = pd.Series(
            {key: np.nanstd(value) / np.sqrt(len(value))
             for key, value in dict_res[col].items()})

    print("[INFO] Saving evaluation averages.")
    with open(config_var['output_folder'] + "/avgs.csv", 'w') as file:
        res_mean.to_csv(file)
    print("[INFO] Saving evaluations standard errors.")
    with open(config_var['output_folder'] + "/ses.csv", 'w') as file:
        res_se.to_csv(file)
    print("[INFO] Saving to json all produced summaries.")
    with open(config_var['output_folder'] + "/summaries.json", 'w') as file:
        json.dump(dict_summs, file)
    print("[INFO] Program completed successfully.")