def summarize(text):
    """Return an extractive summary of *text*, or '' when the text is invalid.

    An all-upper-case input is lowered before tokenization (the tokenizer
    otherwise yields an empty summary for ALL-CAPS text) and the summary is
    upper-cased again on the way out.
    """
    if not isvalid(text):
        return ''

    # Remember the ALL-CAPS state so we can restore it after summarizing.
    all_capital = text.upper() == text
    if all_capital:
        text = text.lower()

    if PY2:
        # Python 2: `text` is a byte string; strip non-ASCII bytes directly.
        parser = PlaintextParser.from_string(
            text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
    else:
        # Python 3: round-trip through ASCII to drop non-ASCII characters.
        parser = PlaintextParser.from_string(
            text.encode().decode('ascii', errors='ignore'),
            Tokenizer(LANGUAGE))

    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # `n_sentences` comes from the enclosing scope.
    sentences = [
        str(s)
        for s in summarizer(parser.document, sentences_count=n_sentences)
    ]

    output_sentences = ' '.join(sentences)
    # FIX: dropped the pointless `all_capital = False` reset (dead store on a
    # local about to go out of scope); restore case via a single expression.
    return output_sentences.upper() if all_capital else output_sentences
Example #2
0
def get_doc_summary(html, url):
    """Extract a 3-sentence TextRank summary from raw HTML.

    Useful fallback when the page provides no meta-description tag.
    """
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    language = "english"
    sentence_count = 3

    doc_parser = HtmlParser.from_string(html, url, Tokenizer(language))
    ranker = Summarizer(Stemmer(language))
    ranker.stop_words = get_stop_words(language)

    # Concatenate the selected sentences into one string.
    return "".join(str(s) for s in ranker(doc_parser.document, sentence_count))
    def test_single_sentence(self):
        """A one-sentence document summarizes to exactly that sentence."""
        doc = build_document(("I am one sentence",))
        ranker = TextRankSummarizer()
        ranker.stop_words = ("I", "am",)

        summary = ranker(doc, 10)
        self.assertEqual(len(summary), 1)
Example #4
0
    def __init__(self,
                 modelfn=None,
                 classnames=None,
                 language="english",
                 explainer=None,
                 summarizer=None,
                 fm=962,
                 topfeaturescount=100,
                 sentencescount=6,
                 logger=None):
        """Store configuration and build default collaborators when absent."""
        self.fm = fm
        self.modelfn = modelfn
        self.classnames = classnames
        self.topfeaturescount = topfeaturescount
        self.language = language
        self.sentencescount = sentencescount

        # Default to a LIME text explainer over the configured class names.
        self.explainer = (explainer if explainer is not None
                          else lime_text.LimeTextExplainer(
                              class_names=self.classnames))

        # Only the default summarizer gets stop words configured here; a
        # caller-supplied summarizer is used as-is.
        if summarizer is None:
            summarizer = TextRankSummarizer(Stemmer(self.language))
            summarizer.stop_words = get_stop_words(self.language)
        self.summarizer = summarizer

        # Fall back to the root logger when the caller supplies none.
        self.log = logger if logger is not None else logging.getLogger()
    def articleSummerization(self, article, length):
        """Return a TextRank summary of *article*, at most *length* sentences."""
        doc = PlaintextParser.from_string(article, Tokenizer("english")).document
        ranker = TextRankSummarizer(Stemmer("english"))
        ranker.stop_words = get_stop_words("english")
        selected = (str(s) for s in ranker(doc, length))
        return ' '.join(selected)
Example #6
0
    def summarize(self, corpus, length, algorithm):
        """Summarize *corpus* to *length* sentences with the named algorithm.

        Args:
            corpus: plain text to summarize.
            length: maximum number of sentences in the summary.
            algorithm: one of "textrank", "lexrank", "luhn", "edmundson",
                "kl", "lsa", "sumbasic", "random".

        Raises:
            NotImplementedError: if *algorithm* is not a known summarizer.
        """
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        # Map algorithm name -> summarizer class; all share the same stemmer.
        summarizer_classes = {
            "textrank": TextRankSummarizer,
            "lexrank": LexRankSummarizer,
            "luhn": LuhnSummarizer,
            "edmundson": EdmundsonSummarizer,
            "kl": KLSummarizer,
            "lsa": LsaSummarizer,
            "sumbasic": SumBasicSummarizer,
            "random": RandomSummarizer,
        }
        try:
            summarizer = summarizer_classes[algorithm](Stemmer(self.LANGUAGE))
        except KeyError:
            # BUG FIX: the original did `raise NotImplemented(...)`, which
            # raises TypeError (NotImplemented is a constant, not an
            # exception type). NotImplementedError is the correct exception.
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        summary = " ".join(
            [obj._text for obj in summarizer(parser.document, length)])

        return summary
Example #7
0
    def post(self):
        """
        Extract summary (key sentences) from text
        """
        body = request.json
        text = body['text']
        num_sentences = body['num_sentences']
        # Fall back to the default when the client sends a non-integer.
        if not isinstance(num_sentences, int):
            num_sentences = DEFAULT_NUM_SENTENCES
        log.debug('num_sentences={}'.format(num_sentences))

        # TODO: check for minimum number of sentences in text?

        summary_sentences = []
        if text:
            parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
            ranker = TextRankSummarizer(Stemmer(LANGUAGE))
            ranker.stop_words = get_stop_words(LANGUAGE)

            summary_sentences = [
                sentence._text
                for sentence in ranker(parser.document, num_sentences)
            ]

        log.debug('response body:\n{}'.format(summary_sentences))
        return summary_sentences, 200, {'Access-Control-Allow-Origin': '*'}
Example #8
0
def sum_from_string(string, language="english", sentences_cout=100):
    """Summarize *string* and return the selected sumy Sentence objects.

    Args:
        string: plain text to summarize.
        language: language for tokenizer, stemmer and stop words.
        sentences_cout: maximum number of sentences to return (name kept
            as-is, typo included, for backward compatibility).
    """
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    summarizer = Summarizer(Stemmer(language))
    # BUG FIX: the original assigned `summarizer.stem_words`, a nonexistent
    # attribute, so the stop-word list was silently never applied. The
    # correct attribute is `stop_words`.
    summarizer.stop_words = get_stop_words(language)
    return summarizer(parser.document, sentences_cout)
Example #9
0
    def test_single_sentence(self):
        """Even with a large budget, one input sentence yields one output."""
        document = build_document(("I am one sentence",))
        ranker = TextRankSummarizer()
        ranker.stop_words = ("I", "am",)

        result = ranker(document, 10)
        self.assertEqual(len(result), 1)
Example #10
0
def textrank(parser, sentence_count):
    """Return the TextRank summary of *parser*'s document as one string."""
    ranker = TextRankSummarizer(Stemmer(language))
    ranker.stop_words = get_stop_words(language)
    # Sentences are concatenated with no separator, as in the original.
    return ''.join(str(s) for s in ranker(parser.document, sentence_count))
Example #11
0
def textrank_summarizer(text, stemmer, language, sentences_count):
    """Summarize *text* with TextRank; one sentence per output line."""
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    # Renamed the misleading local `summarizer_luhn` -- this is TextRank.
    ranker = TextRankSummarizer(stemmer)
    ranker.stop_words = get_stop_words(language)
    return "\n".join(str(s) for s in ranker(parser.document, sentences_count))
Example #12
0
def __init__():
    """Summarize module-level `text` with six algorithms; return all picks.

    BUG FIX: the original `return allvariations` was indented inside the
    final KL for-loop, so the function returned after the first KL sentence
    (or returned None when KL yielded nothing). The return now runs after
    all loops complete, as the accumulation into `allvariations` implies.
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 1

    stemmer = Stemmer(LANGUAGE)

    # Build every summarizer with the shared stemmer and stop-word list.
    # (Order preserved: LSA, Luhn, LexRank, TextRank, SumBasic, KL.)
    summarizers = [
        Lsa(stemmer),
        Luhn(stemmer),
        LexRank(stemmer),
        TxtRank(stemmer),
        SumBasic(stemmer),
        KL(stemmer),
    ]
    for s in summarizers:
        s.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words

    # NOTE(review): `text` is not defined in this function -- it is read
    # from an enclosing/module scope; confirm it exists before calling.
    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    allvariations = []
    for summarizer in summarizers:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations
Example #13
0
def test_two_sentences():
    """Both sentences of a two-sentence document survive summarization."""
    doc = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
    ranker = TextRankSummarizer()
    ranker.stop_words = ("I", "am", "and", "that",)

    summary = ranker(doc, 10)
    assert len(summary) == 2
    assert to_unicode(summary[0]) == "I am that 1. sentence"
    assert to_unicode(summary[1]) == "And I am 2. winning prize"
Example #14
0
    def test_two_sentences(self):
        """Summarization keeps both sentences in document order."""
        doc = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
        ranker = TextRankSummarizer()
        ranker.stop_words = ("I", "am", "and", "that",)

        summary = ranker(doc, 10)
        self.assertEqual(len(summary), 2)
        self.assertEqual(to_unicode(summary[0]), "I am that 1. sentence")
        self.assertEqual(to_unicode(summary[1]), "And I am 2. winning prize")
    def test_two_sentences(self):
        """Two input sentences, generous budget: both come back in order."""
        document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ("I", "am", "and", "that",)

        result = summarizer(document, 10)
        expected = ("I am that 1. sentence", "And I am 2. winning prize")
        self.assertEqual(len(result), 2)
        for got, want in zip(result, expected):
            self.assertEqual(to_unicode(got), want)
Example #16
0
def sumy_tr_summarizer(docx):
    """Return a two-sentence TextRank summary of *docx* as a string."""
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    # BUG FIX: the original built a stemmer-less TextRankSummarizer and then
    # immediately overwrote it -- dead code, now removed.
    tr_summarizer = TextRankSummarizer(Stemmer("english"))
    tr_summarizer.stop_words = get_stop_words("english")
    # Summarize the document with 2 sentences
    summary = tr_summarizer(parser.document, 2)
    return ' '.join(str(sentence) for sentence in summary)
Example #17
0
def summarize(corpus, length, algorithm):
    """Summarize *corpus*; on any error return the error text instead.

    Unknown algorithm names return a help message rather than raising.
    """
    summary = "No compatible summarizer was selected, please use one of these : textrank, lexrank, luhn, edmonson*, kl, lsa, sumbasic, random (* doesn\'t work yet)"
    algorithm = algorithm.lower()
    # Name -> summarizer class dispatch table.
    summarizer_classes = {
        "textrank": TextRankSummarizer,
        "lexrank": LexRankSummarizer,
        "luhn": LuhnSummarizer,
        "edmundson": EdmundsonSummarizer,
        "kl": KLSummarizer,
        "lsa": LsaSummarizer,
        "sumbasic": SumBasicSummarizer,
        "random": RandomSummarizer,
    }
    try:
        parser = PlaintextParser.from_string(corpus, Tokenizer(LANGUAGE))
        cls = summarizer_classes.get(algorithm)
        if cls is not None:
            summarizer = cls(Stemmer(LANGUAGE))
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join(obj._text for obj in summarizer(parser.document, length))

        return summary

    except Exception as e:
        # Deliberate best-effort contract: report the error as the result.
        return str(e)
Example #18
0
    def summarize(self, corpus, length, algorithm):
        """Summarize *corpus* to *length* sentences via the named algorithm.

        Raises:
            NotImplementedError: if *algorithm* is not a known summarizer.
        """
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            # BUG FIX: `raise NotImplemented(...)` raised a TypeError because
            # NotImplemented is a constant, not an exception class.
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        summary = " ".join([obj._text for obj in summarizer(parser.document, length)])

        return summary
    def test_stop_words_correctly_removed(self):
        """Stop-word filtering is case-insensitive on both sides."""
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

        document = build_document(
            ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
            ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
            ("Some relevant sentence", "Some moRe releVant sentEnce",),
        )
        sentences = document.sentences

        # The first four sentences consist solely of stop words in various
        # casings, so each must reduce to an empty word set.
        for sentence in sentences[:4]:
            self.assertEqual([], summarizer._to_words_set(sentence))

        self.assertEqual(["some", "relevant", "sentence"],
                         summarizer._to_words_set(sentences[4]))
        self.assertEqual(["some", "more", "relevant", "sentence"],
                         summarizer._to_words_set(sentences[5]))
Example #20
0
def textSummary(data, SENTENCES_COUNT):
    """Summarize *data*; each sentence is prefixed with a single space.

    The leading space on the result is preserved from the original
    formatting.
    """
    LANGUAGE = "english"
    parser = PlaintextParser.from_string(data, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return ''.join(' {}'.format(str(s))
                   for s in summarizer(parser.document, SENTENCES_COUNT))
Example #21
0
def summarize(url, sent_count=10):
    """Automatic text summarizer
    https://pypi.python.org/pypi/sumy
    """
    lang = "english"
    document = HtmlParser.from_url(url, Tokenizer(lang)).document
    ranker = Summarizer(Stemmer(lang))
    ranker.stop_words = get_stop_words(lang)
    # Return the chosen sentences as a list of plain strings.
    return [str(sent) for sent in ranker(document, sent_count)]
    def test_three_sentences_but_second_winner(self):
        """With a one-sentence budget the highest-ranked sentence wins."""
        document = build_document([
            "I am that 1. sentence",
            "And I am 2. sentence - winning sentence",
            "And I am 3. sentence - winner is my 2nd name",
        ])
        ranker = TextRankSummarizer()
        ranker.stop_words = ["I", "am", "and", "that"]

        summary = ranker(document, 1)
        self.assertEqual(len(summary), 1)
        self.assertEqual(to_unicode(summary[0]),
                         "And I am 2. sentence - winning sentence")
Example #23
0
def test_sentences_rating():
    """Every sentence is rated and the ratings sum to 1 (a distribution)."""
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    ranker = TextRankSummarizer()
    ranker.stop_words = ["I", "am", "and", "that"]

    ratings = ranker.rate_sentences(document)
    assert len(ratings) == 3
    assert pytest.approx(sum(ratings.values())) == 1
Example #24
0
def summarize(text):
    """Return an extractive summary of *text* as a single string.

    NOTE(review): `text.decode(...)` implies *text* is a byte string
    (Python 2 `str` / Python 3 `bytes`) -- confirm against callers; a
    Python 3 `str` argument would raise AttributeError here.
    """
    # Strip non-ASCII content before tokenizing.
    parser = PlaintextParser.from_string(text.decode(
        'ascii', errors='ignore'), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # `n_sentences` is read from an enclosing/module scope.
    sentences = [str(s) for s in summarizer(
        parser.document, sentences_count=n_sentences)]
    return ' '.join(sentences)
Example #25
0
def textrankReferenceSummary(path):
    """Read the file at *path* and return its TextRank summary sentences."""
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    ranker = TextRankSummarizer(Stemmer(LANGUAGE))
    ranker.stop_words = get_stop_words(LANGUAGE)

    # Collect the raw sentence text of each selected sentence.
    return [sentence._text
            for sentence in ranker(parser.document, SENTENCES_COUNT)]
Example #26
0
def test_sentences_rating():
    """Ratings cover all three sentences and form a probability distribution."""
    doc = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    scores = summarizer.rate_sentences(doc)
    assert len(scores) == 3
    assert pytest.approx(sum(scores.values())) == 1
Example #27
0
def sumy(text, LANGUAGE='english', COUNT=2):
    """Return a COUNT-sentence TextRank summary of *text*."""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    ranker = TextRankSummarizer(Stemmer(LANGUAGE))
    ranker.stop_words = get_stop_words(LANGUAGE)
    # Join the selected sentences with single spaces; strip any edge
    # whitespace as the original did.
    parts = [str(sentence) for sentence in ranker(parser.document, COUNT)]
    return " ".join(parts).strip()
Example #28
0
    def test_three_sentences_but_second_winner(self):
        """Only the top-ranked of three sentences fits a budget of one."""
        doc = build_document([
            "I am that 1. sentence",
            "And I am 2. sentence - winning sentence",
            "And I am 3. sentence - winner is my 2nd name",
        ])
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ["I", "am", "and", "that"]

        result = summarizer(doc, 1)
        self.assertEqual(len(result), 1)
        self.assertEqual(to_unicode(result[0]),
                         "And I am 2. sentence - winning sentence")
Example #29
0
    def test_sentences_rating(self):
        """The middle sentence (most shared words) outranks the other two."""
        document = build_document([
            "a c e g",
            "a b c d e f g",
            "b d f",
        ])
        ranker = TextRankSummarizer()
        ranker.stop_words = ["I", "am", "and", "that"]

        ratings = ranker.rate_sentences(document)
        first, second, third = (ratings[s] for s in document.sentences)
        self.assertEqual(len(ratings), 3)
        self.assertTrue(second > first)
        self.assertTrue(first > third)
    def test_sentences_rating(self):
        """Rating order: sentence 1 > sentence 0 > sentence 2."""
        doc = build_document([
            "a c e g",
            "a b c d e f g",
            "b d f",
        ])
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ["I", "am", "and", "that"]

        scores = summarizer.rate_sentences(doc)
        s0, s1, s2 = doc.sentences
        self.assertEqual(len(scores), 3)
        self.assertTrue(scores[s1] > scores[s0])
        self.assertTrue(scores[s0] > scores[s2])
Example #31
0
def textrankReferenceSummary(path):
    """Build the TextRank reference summary for the document at *path*."""
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentencesList = []
    for picked in summarizer(parser.document, SENTENCES_COUNT):
        sentencesList.append(picked._text)
    return sentencesList
Example #32
0
def run_summarizer(parser, sentences, language='english'):
    """
    :params parser: Parser for selected document type
    :params sentences: Maximum sentences for summarizer.

    :returns summary: Summarized page.
    """
    engine = Summarizer(Stemmer(language))
    engine.stop_words = get_stop_words(language)
    summary = []
    for sentence in engine(parser.document, sentences):
        summary.append(str(sentence))
    return summary
Example #33
0
def Summarize_Content_Custom(Audio_Text, sentences_count, Summarize_Method):
    """Print the top *sentences_count* summary sentences of *Audio_Text*.

    Summarize_Method selects the backend: "Gensim" (ratio-based), or one of
    the sumy summarizers "LexRankSummarizer", "LuhnSummarizer",
    "LsaSummarizer", "TextRankSummarizer". Unknown methods print nothing,
    matching the original behavior.
    """
    # Target roughly half the number of input sentences.
    actual_sentences_count = float(len(sent_tokenize(Audio_Text))) * 0.5
    parser = PlaintextParser.from_string(Audio_Text, Tokenizer("english"))
    stemmer = Stemmer("english")

    if (Summarize_Method == "Gensim"):
        # ratio: define length of the summary as a proportion of the text
        candidates = sent_tokenize(summarize(Audio_Text, ratio=0.5))
    else:
        # All sumy backends share the stemmer / stop-word setup. The
        # original code duplicated each branch verbatim and instantiated
        # Lsa/TextRank twice (the stemmer-less copies were dead code);
        # both problems are fixed by this dispatch table.
        summarizer_classes = {
            "LexRankSummarizer": LexRankSummarizer,    # repeated-sentence ranking
            "LuhnSummarizer": LuhnSummarizer,          # key-word frequency
            "LsaSummarizer": LsaSummarizer,            # latent semantic analysis
            "TextRankSummarizer": TextRankSummarizer,  # graph-based ranking
        }
        cls = summarizer_classes.get(Summarize_Method)
        if cls is None:
            return
        summarizer = cls(stemmer)
        summarizer.stop_words = get_stop_words("english")
        candidates = summarizer(parser.document, actual_sentences_count)

    # Keep the original Counter/most_common selection and printing.
    for value in Counter(candidates).most_common(sentences_count):
        print(value[0])
Example #34
0
def TextRank(rsc_file, dst_file, count):
    """Summarize the Chinese text in *rsc_file*; write *count* sentences
    (one per line) to *dst_file* and echo each one to stdout."""
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file,
                                       Tokenizer(language),
                                       encoding='utf-8')
    summarizer = TextRankSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    with open(dst_file, 'w', encoding='utf-8') as out:
        for sentence in summarizer(parser.document, count):
            out.write(str(sentence))
            out.write('\n')
            print(sentence)
Example #35
0
    def test_stop_words_correctly_removed(self):
        """Only non-stop-words survive `_to_words_set`, regardless of case."""
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

        document = build_document(
            ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
            ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
            ("Some relevant sentence", "Some moRe releVant sentEnce",),
        )

        # Expected word sets, one per sentence: the first four are pure
        # stop words; the last two keep their content words.
        expectations = [
            [], [], [], [],
            ["some", "relevant", "sentence"],
            ["some", "more", "relevant", "sentence"],
        ]
        for sentence, expected in zip(document.sentences, expectations):
            self.assertEqual(expected, summarizer._to_words_set(sentence))
Example #36
0
def summarizer(parser, sentences, language='english'):
    """
    :params parser: Parser for selected document type
    :params sentences: Maximum sentences for summarizer.

    :returns summary: Summarized page.
    """
    # Renamed the local to avoid shadowing this function's own name.
    engine = Summarizer(Stemmer(language))
    engine.stop_words = get_stop_words(language)
    return ' '.join(str(s) for s in engine(parser.document, sentences))
Example #37
0
def test_rating_with_zero_or_single_words_in_sentences(sentences,
                                                       expected_ratings):
    """
    Edge case: sentences with zero or one word must not break the rating
    logic -- every sentence still gets the expected rating.
    """
    document = build_document(sentences)
    ranker = TextRankSummarizer()

    ratings = ranker.rate_sentences(document)

    expected = {
        doc_sentence: pytest.approx(expected_rating)
        for doc_sentence, expected_rating in zip(document.sentences,
                                                 expected_ratings)
    }
    assert ratings == expected
def TextRankSummary(document, sentences):
    """Summarize plain-text *document* into *sentences* TextRank sentences."""
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    ranker = TextRankSummarizer()
    return ranker(parser.document, sentences)
Example #39
0
def test_sentences_rating():
    """Exact TextRank scores for a small three-sentence document."""
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    ranker = TextRankSummarizer()

    ratings = ranker.rate_sentences(document)

    expected_scores = (
        0.29714368215098025,
        0.42683373199392705,
        0.2760223553913001,
    )
    assert len(ratings) == 3
    for doc_sentence, score in zip(document.sentences, expected_scores):
        assert ratings[doc_sentence] == pytest.approx(score)
    assert pytest.approx(sum(ratings.values())) == 1
Example #40
0
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
    book_id: (str) the book identifier
    chapter: is the chapter number to summarize
    num_sentences: how many sentences to extract

    Returns:
    sentences: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))

    # Technique name -> summarizer class; anything unrecognized falls back
    # to Luhn, matching the original else-branch.
    summarizer_classes = {
        'lsa': LsaSummarizer,
        'lexrank': LexRankSummarizer,
        'textrank': TextRankSummarizer,
        'kl': KLSummarizer,
        'random': RandomSummarizer,
        'reduction': ReductionSummarizer,
        'sumbasic': SumBasicSummarizer,
    }
    summarizer = summarizer_classes.get(technique, LuhnSummarizer)()
    return summarizer(parser.document, num_sentences)
Example #41
0
def _summ_score(storyName, highlightName):
    # Score a generated summary against its reference highlight file.
    # NOTE(review): this is Python 2 code (bare `print` statements below);
    # it will not parse under Python 3.
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Generated summary sentences vs. reference sentences parsed from the
    # highlight file.
    geneSen = summarizer(parser.document, SENTENCES_COUNT)
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences

    try:
        # `evaluate` presumably computes a ROUGE-style score -- confirm its
        # contract; it is defined outside this view.
        return evaluate(geneSen, refSen)
    except Exception as e:
        # Log which story failed, then re-raise for the caller.
        print storyName
        print e
        raise e
Example #42
0
def summarize():
    """Web endpoint: summarize the page at ?url= into ?num= sentences.

    Returns JSON with the page title and the summary sentences (ASCII).
    """
    # Validate the url query parameter.
    url = request.args.get('url')
    if url is None or url == "":
        return abort(400)

    # Validate the num query parameter, clamping into the allowed range;
    # anything non-numeric falls back to the minimum.
    try:
        num = int(request.args.get('num'))
        num = max(num, MIN_SENTENCES_COUNT)
        num = min(num, MAX_SENTENCES_COUNT)
    except (ValueError, TypeError) as e:
        num = MIN_SENTENCES_COUNT

    # Fetch and parse the page; retry with an explicit scheme on failure.
    try:
        parser = Parser.from_url(url, Tokenizer(LANGUAGE))
    except (requests.exceptions.MissingSchema, requests.exceptions.HTTPError) as e:
        try:
            parser = Parser.from_url("http://" + url, Tokenizer(LANGUAGE))
        except:
            return "URL is not valid.", 403

    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # unidecode converts unicode characters into their closest ASCII form.
    final = [unidecode(str(sentence))
             for sentence in summarizer(parser.document, num)]

    return json.dumps({"title": parser.get_title(), "content": final})
Example #43
0
    def summarize_with_info(self, corpus, length, algorithm):
        """Summarize *corpus* and return the raw sumy sentence objects.

        Unlike `summarize`, this returns the Sentence objects themselves
        rather than a joined string.

        Raises:
            NotImplementedError: if *algorithm* is not a known summarizer.
        """
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            # Edmundson additionally needs bonus/stigma word lists.
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
            summarizer.bonus_words = parser.significant_words
            summarizer.stigma_words = parser.stigma_words
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            # BUG FIX: `raise NotImplemented(...)` raised a TypeError since
            # NotImplemented is a constant, not an exception class.
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)

        return summarizer(parser.document, length)
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer 
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import sys


LANGUAGE = "english"
SENTENCES_COUNT = int(sys.argv[2])
text_file = sys.argv[1]


if __name__ == "__main__":
    # CLI entry point: summarize the file named in argv[1] down to argv[2]
    # sentences using TextRank, printing one sentence per line.
    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)