Example #1
def test_single_sentence():
    document = build_document(("I am one sentence",))
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ("I", "am",)

    returned = summarizer(document, 10)
    assert len(returned) == 1
Example #2
def reduction(parser, sentence_count):
    # `language` is assumed to be defined at module level in the source project.
    summarizer = ReductionSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summary = summarizer(parser.document, sentence_count)
    # Concatenate the selected sentences into a single string.
    return ''.join(str(sentence) for sentence in summary)
Example #3
def reduction_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    # The original named this `summarizer_luhn`, but it builds a ReductionSummarizer.
    summarizer = ReductionSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = [str(sentence) for sentence in summarizer(parser.document, sentences_count)]
    return "\n".join(sentences)
Example #4
def test_two_sentences():
    document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ("I", "am", "and", "that",)

    returned = summarizer(document, 10)
    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
Example #5
def test_three_sentences_but_second_winner():
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)
    assert len(returned) == 1
    assert to_unicode(returned[0]) == "And I am 2. sentence - winning sentence"
Example #6
def test_sentences_rating():
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    ratings = summarizer.rate_sentences(document)
    assert len(ratings) == 3
    assert ratings[document.sentences[1]] > ratings[document.sentences[0]]
    assert ratings[document.sentences[0]] > ratings[document.sentences[2]]
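The assertions above follow from Reduction's word-overlap idea: each sentence is rated by its similarity to every other sentence, so the middle sentence, which overlaps both neighbours, scores highest. A toy sketch of that rating, using plain shared-word counts rather than sumy's exact metric:

# Toy overlap rating; similarity = |shared words| is an assumption,
# not sumy's exact formula.
def toy_rating(sentences):
    word_sets = [set(s.split()) for s in sentences]
    return [sum(len(a & b) for b in word_sets if b is not a)
            for a in word_sets]

print(toy_rating(["a c e g", "a b c d e f g", "b d f"]))  # [4, 7, 3]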
Example #7
def test_stop_words_correctly_removed():
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

    document = build_document(
        ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
        ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
        ("Some relevant sentence", "Some moRe releVant sentEnce",),
    )
    sentences = document.sentences

    expected = []
    returned = summarizer._to_words_set(sentences[0])
    assert expected == returned
    returned = summarizer._to_words_set(sentences[1])
    assert expected == returned
    returned = summarizer._to_words_set(sentences[2])
    assert expected == returned
    returned = summarizer._to_words_set(sentences[3])
    assert expected == returned

    expected = ["some", "relevant", "sentence"]
    returned = summarizer._to_words_set(sentences[4])
    assert expected == returned
    expected = ["some", "more", "relevant", "sentence"]
    returned = summarizer._to_words_set(sentences[5])
    assert expected == returned
Example #8
def ReductionSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = ReductionSummarizer()
    summary = summarizer(parser.document, sentences)
    # for sentence in summary:
    #     print(sentence)
    return summary
Example #9
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
    book_id: (str) the book identifier
    chapter: is the chapter number to summarize
    num_sentences: how many sentences to extract

    Returns:
    sentences: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
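A hypothetical invocation of this helper; the book id and chapter number below are illustrative only and assume get_data_filename can resolve them to an existing plain-text chapter file:

# Hypothetical call: the book id and chapter number are illustrative values.
for sentence in find_relevant_quote("pride-and-prejudice", 3, num_sentences=2, technique="reduction"):
    print(sentence)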
Example #10
def ReductionSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = ReductionSummarizer()
    summary = summarizer(parser.document, sentences)
    return [str(sentence) for sentence in summary]
Example #11
def dada_summarize(content: str, title: str = "") -> dict:
    # Relies on module-level globals (tt, tokenizer, stemmer, model,
    # SENTENCES_COUNT) defined elsewhere in the source project.
    response = {}
    content = process_content(content)
    title = process_title(title)
    
    # textrank [need newline to split sentence]
    response["textrank"] = summarize(content)
    
    # textteaser [need newline to split sentence]
    cnt = int(len(content.split('\n'))*0.3)
    response['textteaser'] = "\n".join(tt.summarize(title, content, count=cnt))
    
    ### sumy
    parser = PlaintextParser.from_string(content, tokenizer)
    
    # LSA
    summarizer = LsaSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['lsa'] = "\n".join(sentences)
    
    # textrank2
    summarizer = TextRankSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['textrank2'] = "\n".join(sentences)
    
    # lexrank
    summarizer = LexRankSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['lexrank'] = "\n".join(sentences)
    
    # reduction
    summarizer = ReductionSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['reduction'] = "\n".join(sentences)
    
    # kl-sum
    summarizer = KLSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['kl-sum'] = "\n".join(sentences)
    
    # bert
    response['bert'] = model(content, ratio=0.4)
    
    return response
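A hypothetical call, assuming the module-level helpers and models the excerpt relies on (process_content, process_title, tt, tokenizer, stemmer, model, SENTENCES_COUNT) are initialised:

# Hypothetical usage; `article_text` stands in for real input text.
summaries = dada_summarize(article_text, title="Example title")
print(summaries["reduction"])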
Example #12
    def get_summarizers(self, names):
        """Retrieves sumy summarizers algorithms

            Parameters:
            names (list): list of summarizer algorithm names

            Returns:
            dict:summarizers

        """
        summarizers = {}
        for name in names:
            if name == "random":
                from sumy.summarizers.random import RandomSummarizer
                summarizers["random"] = RandomSummarizer(null_stemmer)
            elif name == "luhn":
                from sumy.summarizers.luhn import LuhnSummarizer
                summarizers["luhn"] = LuhnSummarizer(stemmer=null_stemmer)
            elif name == "lsa":
                from sumy.summarizers.lsa import LsaSummarizer
                summarizers["lsa"] = LsaSummarizer(stemmer=null_stemmer)
            elif name == "lexrank":
                from sumy.summarizers.lex_rank import LexRankSummarizer
                summarizers["lexrank"] = LexRankSummarizer(null_stemmer)
            elif name == "textrank":
                from sumy.summarizers.text_rank import TextRankSummarizer
                summarizers["textrank"] = TextRankSummarizer(null_stemmer)
            elif name == "sumbasic":
                from sumy.summarizers.sum_basic import SumBasicSummarizer
                summarizers["sumbasic"] = SumBasicSummarizer(null_stemmer)
            elif name == "kl-sum":
                from sumy.summarizers.kl import KLSummarizer
                summarizers["kl-sum"] = KLSummarizer(null_stemmer)
            elif name == "reduction":
                from sumy.summarizers.reduction import ReductionSummarizer
                summarizers["reduction"] = ReductionSummarizer(null_stemmer)

        for _, summarizer in summarizers.items():
            summarizer.stop_words = frozenset(
                self.stop_words._get_stop_words(custom_stop_words=[]))

        return summarizers
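A sketch of how the returned mapping might be consumed from another method of the same class; the document text and sentence count below are assumptions, not part of the original code:

# Sketch only (inside the same class): run every retrieved summarizer.
summarizers = self.get_summarizers(["luhn", "reduction"])
parser = PlaintextParser.from_string("Some text to summarize. Another sentence.", Tokenizer("english"))
for name, summarizer in summarizers.items():
    summary = summarizer(parser.document, 3)
    print(name, " ".join(str(s) for s in summary))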
Example #13
def sumy_summarizer(text, ratio, summarizer_type):
    num_sent = int(len(text.split(".")) * ratio)
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    # Case-insensitive match; the original compared 'reduction' against
    # itself twice instead of checking a second spelling.
    summarizer_type = summarizer_type.lower()
    if summarizer_type == 'lexrank':
        summarizer_instance = LexRankSummarizer()
    elif summarizer_type == 'reduction':
        summarizer_instance = ReductionSummarizer()
    elif summarizer_type == 'lsa':
        summarizer_instance = LsaSummarizer()
    elif summarizer_type == 'luhn':
        summarizer_instance = LuhnSummarizer()
    elif summarizer_type == 'kl':
        summarizer_instance = KLSummarizer()
    else:
        raise ValueError("Unknown summarizer type: %s" % summarizer_type)
    summary_values = summarizer_instance(parser.document, num_sent)
    final_summary = [str(sent) for sent in summary_values]
    return convert_to_string(final_summary)
Example #14
else:
    # Link input
    link = st.text_input("Input website/link here:", "https://www.osapabroad.com/academics/the-oxford-tutorial/")
    st.subheader("Summary")
    LANGUAGE = "english"
    SENTENCES_COUNT = st.sidebar.slider("Sentence Count", 1, 20, 10, 1)
    kw_ratio = st.sidebar.slider("Keyword Ratio: ", 0., 1., 0.2, 0.01)
#     SENTENCES_COUNT = 20
    try:
        parser = HtmlParser.from_url(link, Tokenizer(LANGUAGE))
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

        stemmer = Stemmer(LANGUAGE)

        # summarizer = LsaSummarizer(stemmer)
        summarizer = ReductionSummarizer(stemmer)

        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences_list = [str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT)]

        summary_text = "\n".join(sentences_list)
        st.write(summary_text)

        keywords_list = keywords(summary_text, ratio=kw_ratio).split("\n")

        st.markdown("**Keywords:** " + ", ".join("`" + i + "`" for i in keywords_list))
    except Exception:
        st.write("Link cannot be parsed.")
Example #15
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.utils import get_stop_words

# algorithms
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer

algorithm_dic = {"lex": LexRankSummarizer(), "tex": TextRankSummarizer(), "lsa": LsaSummarizer(),
                 "kl": KLSummarizer(), "luhn": LuhnSummarizer(), "redu": ReductionSummarizer(),
                 "sum": SumBasicSummarizer()}

def summarize_sentences(sentences, sentences_count=3, algorithm="lex", language="japanese"):
    # 1) Build a language-specific corpus and sentence list.
    if language == "japanese":
        corpus_maker = JapaneseCorpus()
    else:
        corpus_maker = EnglishCorpus()
    preprocessed_sentences = corpus_maker.preprocessing(sentences)
    preprocessed_sentence_list = corpus_maker.make_sentence_list(preprocessed_sentences)
    corpus = corpus_maker.make_corpus()
    parser = PlaintextParser.from_string(" ".join(corpus), Tokenizer(language))

    # 2) Pick the summarization algorithm. The original example is truncated
    # here; a likely continuation, following the pattern of the other examples:
    try:
        summarizer = algorithm_dic[algorithm]
    except KeyError:
        raise ValueError("unknown algorithm: %s" % algorithm)
    summary = summarizer(document=parser.document, sentences_count=sentences_count)
    return [str(sentence) for sentence in summary]
Example #16
def main():
    print("\n\t\t SUMMARIZATION REVIEW\t\t\n")
    print('[INFO] Loading configuration')
    with open("./config.yml", 'r') as file:
        config_var = safe_load(file)["main"]

    data = load_clean_data(path_to_file=str(config_var['dataset_folder']) + "/" +
                           str(config_var['data_to_use']))

    print("[INFO] Training sentence tokenizer for summary on all articles.")
    punkt_tokenizer = PunktSentenceTokenizer(
        train_text="\n".join(data["Paragraphs_as_string"]))

    len_sum = np.mean(data["Summary"].apply(lambda x: len(punkt_tokenizer.tokenize(x))))
    print("[INFO] Average number of sentences in article summaries:", len_sum)
    print("[COMMENT] Considering this value as reference to generate the automatic summaries.")
    len_sum = int(len_sum)

    print("[INFO] Using " + str(config_var['language']) + " stemmer")
    stemmer = Stemmer(config_var['language'])
    print("[INFO] Preparing summarizers")
    summarizer_dict = {"LSA": LsaSummarizer(stemmer),
                       "Luhn": LuhnSummarizer(stemmer),
                       "LexRank": LexRankSummarizer(stemmer),
                       "SumBasics": SumBasicSummarizer(stemmer),
                       "Random": RandomSummarizer(stemmer),
                       "Reduction": ReductionSummarizer(stemmer)}

    print("[INFO] Preparing stopwords.")
    for summarizer in summarizer_dict.values():
        summarizer.stop_words = get_stop_words('english')

    print("[INFO] Summaries preparation")
    dict_res = {}
    dict_summs = {}
    for name, summarizer in summarizer_dict.items():
        print("[INFO] Method:", name)
        results_rouge_1 = []
        results_rouge_2 = []
        results_rouge_l_1 = []
        results_rouge_l_2 = []
        sums = {}
        for i in progressbar.progressbar(range(len(data))):
            (article, summary) = (data["Paragraphs_as_string"][i],
                                  data["Summary"][i])

            parser = PlaintextParser.from_string(
                article, tokenizer=Tokenizer('english'))

            # Run the summarizer once and reuse the result; the original
            # ran it twice to build both lists.
            summaries = [
                sentence for sentence in summarizer(parser.document, len_sum)
            ]
            summaries_str = [str(sentence) for sentence in summaries]
            # Append current summary results
            # Since there are problems with some documents
            # being skipped, I need to save the index as well
            sums[i] = (" ".join(summaries_str))

            #     To use sumy's evaluation functions, I need to have the text in
            #     Sentence objects
            reference_sentences = [
                Sentence(sent, tokenizer=Tokenizer("english"))
                for sent in punkt_tokenizer.tokenize(summary)
            ]
            # ROUGE can fail on very short sentences; record NaN for such
            # documents so the run continues.
            try:
                results_rouge_1.append(
                    rouge_1(evaluated_sentences=summaries,
                            reference_sentences=reference_sentences))
            except Exception:
                results_rouge_1.append(np.nan)
            try:
                results_rouge_2.append(
                    rouge_2(evaluated_sentences=summaries,
                            reference_sentences=reference_sentences))
            except Exception:
                results_rouge_2.append(np.nan)
            try:
                results_rouge_l_1.append(
                    rouge_l_sentence_level(evaluated_sentences=summaries,
                                           reference_sentences=reference_sentences))
            except Exception:
                results_rouge_l_1.append(np.nan)
            try:
                results_rouge_l_2.append(
                    rouge_l_summary_level(evaluated_sentences=summaries,
                                          reference_sentences=reference_sentences))
            except Exception:
                results_rouge_l_2.append(np.nan)
        # Save results and progress to the next summarizer.
        dict_res[name] = {
            "Rouge_1": results_rouge_1,
            "Rouge_2": results_rouge_2,
            "Rouge_L_sentence_level": results_rouge_l_1,
            "Rouge_L_summary_level": results_rouge_l_2
        }
        # Save summaries to dictionary
        dict_summs[name] = sums
    print("[INFO] Summaries and evaluations completed.")
    print("[INFO] Saving data to output.")
    # Dataframe for the mean of the results
    res_mean = pd.DataFrame(columns=dict_res.keys())
    # Dataframe for the standard errors of the results
    res_se = pd.DataFrame(columns=dict_res.keys())
    for col in res_mean:
        res_mean[col] = pd.Series(
            {key: np.nanmean(value)
             for key, value in dict_res[col].items()})
        res_se[col] = pd.Series(
            {key: np.nanstd(value)/np.sqrt(len(value))
             for key, value in dict_res[col].items()})

    print("[INFO] Saving evaluation averages.")
    with open(config_var['output_folder']+"/avgs.csv", 'w') as file:
        res_mean.to_csv(file)
    print("[INFO] Saving evaluations standard errors.")
    with open(config_var['output_folder']+"/ses.csv", 'w') as file:
        res_se.to_csv(file)
    print("[INFO] Saving to json all produced summaries.")
    with open(config_var['output_folder']+"/summaries.json", 'w') as file:
        json.dump(dict_summs, file)
    print("[INFO] Program completed successfully.")