Example #1
def Random(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file,
                                       Tokenizer(language),
                                       encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the chosen language

    summarizer = RandomSummarizer(stemmer)  # random-selection summarizer
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
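A minimal sketch of how this helper could be called, assuming the standard sumy imports the snippet relies on; the file names below are hypothetical:

# Imports the snippet above relies on (standard sumy API)
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.random import RandomSummarizer
from sumy.utils import get_stop_words

# Hypothetical paths: write a 3-sentence random summary of news.txt
Random('news.txt', 'news_summary.txt', 3)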
Example #2
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
    book_id: (str) the book identifier
    chapter: is the chapter number to summarize
    num_sentences: how many sentences to extract

    Returns:
    sentences: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
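A hedged usage sketch; get_data_filename is a project-specific helper not shown here, and the book id below is invented for illustration:

# Hypothetical call: two-sentence random summary of chapter 3
quote = find_relevant_quote('example_book', 3, num_sentences=2,
                            technique='random')
for sentence in quote:
    print(sentence)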
Example #3
    def summarize_with_info(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
            summarizer.bonus_words = parser.significant_words
            summarizer.stigma_words = parser.stigma_words
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)

        return summarizer(parser.document, length)
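For a single algorithm, the dispatch above reduces to the core sumy flow; a runnable sketch of the textrank branch (inline text invented for illustration):

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
parser = PlaintextParser.from_string(
    "First sentence. Second sentence. Third sentence.", Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, 2):
    print(sentence)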
Example #4
def test_less_than_10_words_should_be_returned():
    """https://github.com/miso-belica/sumy/issues/159"""
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.

        # Heading two
        I like sentences but this one is really long.
        They are so wordy
        And have many many letters
        And are green in my editor
        But someone doesn't like them :(
    """)
    summarizer = RandomSummarizer()

    def count(max_words, sentence_infos):
        # Callback used in place of a fixed sentence count: accept sentences
        # until the cumulative word count would exceed max_words.
        results = []
        words_count = 0
        for info in sentence_infos:
            words_count += len(info.sentence.words)
            if words_count > max_words:
                return results
            else:
                results.append(info)

        return results

    sentences = summarizer(document, partial(count, 10))
    assert 0 < sum(len(s.words) for s in sentences) <= 10
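Note: this test comes from sumy's own suite (issue #159) and relies on the count argument accepting a callable instead of an integer; partial comes from the standard library. A short sketch reusing the count callback and document above with a different summarizer (the word budget is arbitrary):

from functools import partial  # also needed by the test above
from sumy.summarizers.luhn import LuhnSummarizer

# Cap a Luhn summary at 25 words instead of a fixed sentence count
capped = LuhnSummarizer()(document, partial(count, 25))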
Example #5
    def summarize(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        summary = " ".join(
            [obj._text for obj in summarizer(parser.document, length)])

        return summary
Example #6
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
Example #7
    def test_less_sentences_than_requested(self):
        document = build_document_from_string("""
            This is only one sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
                         "This is only one sentence.")
Example #8
    def test_sentences_in_right_order(self):
        document = build_document_from_string("""
            # Heading one
            First sentence.
            Second sentence.
            Third sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 4)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "First sentence.")
        self.assertEqual(to_unicode(sentences[1]), "Second sentence.")
        self.assertEqual(to_unicode(sentences[2]), "Third sentence.")
Example #9
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)
    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
Example #10
    def test_more_sentences_than_requested(self):
        document = build_document_from_string("""
            # Heading one
            First sentence.
            Second sentence.
            Third sentence.

            # Heading two
            I like sentences
            They are so wordy
            And have many many letters
            And are green in my editor
            But someone doesn't like them :(
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 4)
        self.assertEqual(len(sentences), 4)
Example #11
    def get_summarizers(self, names):
        """Retrieves sumy summarizers algorithms

            Parameters:
            names (list): list of summarizer algorithm names

            Returns:
            dict:summarizers

        """
        summarizers = {}
        for name in names:
            if name == "random":
                from sumy.summarizers.random import RandomSummarizer
                summarizers["random"] = RandomSummarizer(null_stemmer)
            elif name == "luhn":
                from sumy.summarizers.luhn import LuhnSummarizer
                summarizers["luhn"] = LuhnSummarizer(stemmer=null_stemmer)
            elif name == "lsa":
                from sumy.summarizers.lsa import LsaSummarizer
                summarizers["lsa"] = LsaSummarizer(stemmer=null_stemmer)
            elif name == "lexrank":
                from sumy.summarizers.lex_rank import LexRankSummarizer
                summarizers["lexrank"] = LexRankSummarizer(null_stemmer)
            elif name == "textrank":
                from sumy.summarizers.text_rank import TextRankSummarizer
                summarizers["textrank"] = TextRankSummarizer(null_stemmer)
            elif name == "sumbasic":
                from sumy.summarizers.sum_basic import SumBasicSummarizer
                summarizers["sumbasic"] = SumBasicSummarizer(null_stemmer)
            elif name == "kl-sum":
                from sumy.summarizers.kl import KLSummarizer
                summarizers["kl-sum"] = KLSummarizer(null_stemmer)
            elif name == "reduction":
                from sumy.summarizers.reduction import ReductionSummarizer
                summarizers["reduction"] = ReductionSummarizer(null_stemmer)

        for summarizer in summarizers.values():
            summarizer.stop_words = frozenset(
                self.stop_words._get_stop_words(custom_stop_words=[]))

        return summarizers
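For context, null_stemmer is sumy's no-op stemmer; a hedged usage sketch (the host object below is hypothetical):

# null_stemmer is importable from sumy itself
from sumy.nlp.stemmers import null_stemmer

# Hypothetical call on an instance of the class above
summarizers = helper.get_summarizers(["luhn", "kl-sum"])
luhn_summarizer = summarizers["luhn"]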
Example #12
    logger.info('Loading standardized docs')
    with open(base_dir + '/std_docs.dump', mode='rb') as f:
        std_docs = pickle.load(f)
    with open(base_dir + '/std_sums.dump', mode='rb') as f:
        std_sums = pickle.load(f)
else:
    logger.error('You must first generate standardized docs and refs')
    exit(1)

rouge = Rouge()

summarizers = {'Luhn': LuhnSummarizer(),
               'LexRank': LexRankSummarizer(),
               # 'Lsa': LsaSummarizer(),
               'TextRank': TextRankSummarizer(),
               'Random': RandomSummarizer(),
               # 'KLSum': KLSummarizer(),
               'SumBasic': SumBasicSummarizer()
               }

text_sums = []
for i, summary in enumerate(std_sums):
    text_sums.append(' '.join([' '.join(line) for line in summary]))

# Length of each document, used to guess the size of the summary
x = []
for doc in std_docs:
    words = 0
    for line in doc:
        words += len(line)
    x.append(words)
Example #13
    def test_empty_document(self):
        document = build_document()
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 0)
Example #14
def test_empty_document():
    document = build_document()
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 0
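Note: build_document and build_document_from_string used throughout these tests are helpers from sumy's test utilities rather than the installed package; the remaining names are importable. A sketch of the likely imports:

from functools import partial
from sumy.summarizers.random import RandomSummarizer
from sumy._compat import to_unicode  # text helper used in the assertions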
Example #15
def main():
    print("\n\t\t SUMMARIZATION REVIEW\t\t\n")
    print('[INFO] Loading configuration')
    with open("./config.yml", 'r') as file:
        config_var = safe_load(file)["main"]

    data = load_clean_data(path_to_file=str(config_var['dataset_folder']) + "/" +
                           str(config_var['data_to_use']))

    print("[INFO] Training sentence tokenizer for summary on all articles.")
    punkt_tokenizer = PunktSentenceTokenizer(
        train_text="\n".join(sent for sent in data["Paragraphs_as_string"]))
    len_sum = np.mean(data["Summary"].apply(lambda x: len(punkt_tokenizer.tokenize(x))))
    print("[INFO] Average number of sentences in article summaries:", len_sum)
    print("[COMMENT] Using this value as the reference length for the automatic summaries.")
    len_sum = int(len_sum)

    print("[INFO] Using " + str(config_var['language']) + " stemmer")
    stemmer = Stemmer(config_var['language'])
    print("[INFO] Preparing summarizers")
    summarizer_dict = {"LSA":LsaSummarizer(stemmer),
                        "Luhn": LuhnSummarizer(stemmer),
                        "LexRank":LexRankSummarizer(stemmer),
                        "SumBasics":SumBasicSummarizer(stemmer),
                        "Random": RandomSummarizer(stemmer),
                        "Reduction": ReductionSummarizer(stemmer)}

    print("[INFO] Preparing stopwords.")
    for summarizer in summarizer_dict.values():
        summarizer.stop_words = get_stop_words('english')

    print("[INFO] Summaries preparation")
    dict_res = {}
    dict_summs = {}
    for name, summarizer in summarizer_dict.items():
        print("[INFO] Method:", name)
        results_rouge_1 = []
        results_rouge_2 = []
        results_rouge_l_1 = []
        results_rouge_l_2 = []
        sums = {}
        for i in progressbar.progressbar(range(len(data))):
            (article, summary) = (data["Paragraphs_as_string"][i],
                                  data["Summary"][i])

            parser = PlaintextParser.from_string(
                article, tokenizer=Tokenizer('english'))

            summaries = [
                sentence for sentence in summarizer(parser.document, len_sum)
            ]
            # Stringify the same sentences; calling the summarizer a second
            # time would draw a different sample from RandomSummarizer
            summaries_str = [str(sentence) for sentence in summaries]
            # Append current summary results
            # Since there are problems with some documents
            # being skipped, I need to save the index as well
            sums[i] = (" ".join(summaries_str))

            #     To use sumy's evaluation functions, I need to have the text in
            #     Sentence objects
            reference_sentences = [
                Sentence(sent, tokenizer=Tokenizer("english"))
                for sent in punkt_tokenizer.tokenize(summary)
            ]
            try:
                results_rouge_1.append(
                    rouge_1(evaluated_sentences=summaries,
                            reference_sentences=reference_sentences))
            except Exception:
                results_rouge_1.append(np.nan)
            try:
                results_rouge_2.append(
                    rouge_2(evaluated_sentences=summaries,
                            reference_sentences=reference_sentences))
            except Exception:
                results_rouge_2.append(np.nan)

            try:
                results_rouge_l_1.append(
                    rouge_l_sentence_level(evaluated_sentences=summaries,
                                           reference_sentences=reference_sentences))
            except Exception:
                # Rouge-L can fail on very short summary sentences; no
                # workaround was found, so the value is set to NA.
                results_rouge_l_1.append(np.nan)

            try:
                results_rouge_l_2.append(
                    rouge_l_summary_level(evaluated_sentences=summaries,
                                          reference_sentences=reference_sentences))
            except Exception:
                results_rouge_l_2.append(np.nan)
        # Save results and progress to the next summarizer
        dict_res[name] = {
            "Rouge_1": results_rouge_1,
            "Rouge_2": results_rouge_2,
            "Rouge_L_sentence_level": results_rouge_l_1,
            "Rouge_L_summary_level": results_rouge_l_2
        }
        # Save summaries to dictionary
        dict_summs[name] = sums
    print("[INFO] Summaries and evaluations completed.")
    print("[INFO] Saving data to output.")
    # Dataframe for the mean of the results
    res_mean = pd.DataFrame(columns=dict_res.keys())
    # Dataframe for the standard error of the results
    res_se = pd.DataFrame(columns=dict_res.keys())
    for col in res_mean:
        res_mean[col] = pd.Series(
            {key: np.nanmean(value)
             for key, value in dict_res[col].items()})
        res_se[col] = pd.Series(
            {key: np.nanstd(value)/np.sqrt(len(value))
             for key, value in dict_res[col].items()})

    print("[INFO] Saving evaluation averages.")
    with open(config_var['output_folder']+"/avgs.csv", 'w') as file:
        res_mean.to_csv(file)
    print("[INFO] Saving evaluations standard errors.")
    with open(config_var['output_folder']+"/ses.csv", 'w') as file:
        res_se.to_csv(file)
    print("[INFO] Saving to json all produced summaries.")
    with open(config_var['output_folder']+"/summaries.json", 'w') as file:
        json.dump(dict_summs, file)
    print("[INFO] Program completed successfully.")
Example #16
                     stemming=True,
                     stopwords=False,
                     word_level=True,
                     length_limit=True,
                     length=600,
                     use_cf=False,
                     cf=95,
                     scoring_formula='best',
                     resampling=True,
                     samples=1,
                     favor=True,
                     p=0.5)

    score6 = r6.calc_score()

    summarizer7 = RandomSummarizer()
    summary7 = summarizer7(parser.document, 10)

    for sentence in summary7:
        generated7 = generated7 + " " + sentence._text

    candidate7 = generated7.split(" ")
    b7 = []

    b7.append(sentence_bleu(ref_bleu, candidate7,
                            weights=(1, 0, 0, 0)))  #1 gram
    b7.append(sentence_bleu(ref_bleu, candidate7,
                            weights=(0, 1, 0, 0)))  #2 gram
    b7.append(sentence_bleu(ref_bleu, candidate7,
                            weights=(0, 0, 1, 0)))  #3 gram
    b7.append(sentence_bleu(ref_bleu, candidate7,
                            weights=(0, 0, 0, 1)))  #4 gram
Example #17
import os


#create folder
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


LANGUAGE = "bangla"
SENTENCES_COUNT = 2

if __name__ == "__main__":

    createFolder('Dataset/NCTB/RandomSummary/')
    for i in range(1, 140):
        serial_no = str(i)
        path = "Dataset/NCTB/Source/" + serial_no + ".txt"
        parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)  # presumably RandomSummarizer, aliased in the omitted imports
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = ""
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary = summary + " " + str(sentence)
        # Context manager and explicit encoding for the Bangla output
        with open('Dataset/NCTB/RandomSummary/' + serial_no + '.txt', 'w',
                  encoding='utf-8') as fi:
            fi.write(summary)
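Note: the script above omits its imports; a plausible header, where aliasing RandomSummarizer to Summarizer is an assumption based on the RandomSummary output folder:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.random import RandomSummarizer as Summarizer
from sumy.utils import get_stop_words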