Example 1
def test_single_sentence():
    document = build_document(("Já jsem jedna věta",))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem",)

    returned = summarizer(document, 10)
    assert len(returned) == 1
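Note: the `build_document` and `to_unicode` helpers used throughout these tests come from sumy's own test suite. A minimal stand-in, assuming only sumy's public document model, could look like:

from sumy.models.dom import ObjectDocumentModel, Paragraph, Sentence
from sumy.nlp.tokenizers import Tokenizer

def build_document(sentences, language="czech"):
    # Wrap plain strings into sumy's document model: one paragraph
    # holding one Sentence object per input string.
    tokenizer = Tokenizer(language)
    return ObjectDocumentModel(
        (Paragraph(Sentence(text, tokenizer) for text in sentences),)
    )

def to_unicode(sentence):
    # sumy Sentence objects stringify to their original text.
    return str(sentence)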
Example 2
def summarize(final_transcript, askuser=False):
    print('Summarizing transcript...')
    parser = PlaintextParser.from_file(final_transcript, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if askuser:
        summtype = input('Summarizer type? [1: Luhn, 2: Lex-Rank, 3: Text-Rank] ')
    else:
        summtype = SUMMMETHOD

    if summtype == '1':
        summarizer = LuhnSummarizer(stemmer)
        typename = 'luhn'
    elif summtype == '2':
        summarizer = LexRankSummarizer(stemmer)
        typename = 'lex'
    else:
        # default to Text-Rank so `summarizer` is always bound
        summarizer = TextRankSummarizer(stemmer)
        typename = 'tex'

    summarizer.stop_words = get_stop_words(LANGUAGE)
    summaryfile = str(final_transcript)[:-4] + '_summ_' + typename + '.txt'
    with open(summaryfile, 'a') as f:
        for number, sentence in enumerate(summarizer(parser.document, SENTENCES_COUNT), start=1):
            sentence_out = str(number) + ':\n' + str(sentence) + '\n--------------\n'
            f.write(sentence_out)
            print(sentence_out)
    return summaryfile
Example 3
def models_LUHN_LEX_LSA_2(dataframe):
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    stemmer = Stemmer(LANGUAGE)

    # The summarizers are stateless, so build them once instead of per row.
    summarizerLUHN = LUHN(stemmer)
    summarizerLUHN.stop_words = stop

    summarizerLEX = LEX(stemmer)
    summarizerLEX.stop_words = stop

    summarizerLSA = LSA(stemmer)
    summarizerLSA.stop_words = stop

    for i in range(len(dataframe)):
        article = dataframe.loc[i, "post_content"]

        parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

        # summarize the document with one sentence per algorithm
        LUHNsentence = summarizerLUHN(parser.document, 1)
        LEXsentence = summarizerLEX(parser.document, 1)
        LSAsentence = summarizerLSA(parser.document, 1)

        # each result is a one-element tuple of Sentence objects
        for sentence1 in LUHNsentence:
            LUHNsummary = sentence1
        for sentence2 in LEXsentence:
            LEXsummary = sentence2
        for sentence3 in LSAsentence:
            LSAsummary = sentence3

        dataframe.loc[i, "LUHN"] = LUHNsummary
        dataframe.loc[i, "LEX"] = LEXsummary
        dataframe.loc[i, "LSA"] = LSAsummary
Example 4
def test_three_sentences():
    document = build_document((
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("s",)

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]
Example 5
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example 7
def get_data_list(URL, file_type=""):
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        # parser = None
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer

        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    # return whatever was collected, even if parsing failed part-way
    return sentences
Example 8
def test_various_words_with_significant_percentage():
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "6 e e e e e",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "5 z z z z",
        "6 e e e e e",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "3 c c c",
        "5 z z z z",
        "6 e e e e e",
    ]
Example 11
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s", )

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
Example 12
    def test_single_sentence(self):
        document = build_document(("Já jsem jedna věta",))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("já", "jsem",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 1)
Example 13
def luhn(parser, sentence_count):
    summarizer_1 = LuhnSummarizer(Stemmer(language))
    summarizer_1.stop_words = get_stop_words(language)
    summary_1 = summarizer_1(parser.document, sentence_count)
    return ''.join(str(sentence) for sentence in summary_1)
Example 14
def luhn_summarizer(text, stemmer, LANGUAGE, SENTENCES_COUNT):
    parser = PlaintextParser.from_string(text, sumytoken(LANGUAGE))
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(LANGUAGE)
    sentences = [str(sentence) for sentence in summarizer_luhn(parser.document, SENTENCES_COUNT)]
    return " ".join(sentences)
Example 15
def luhn_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(language)
    sentences = [str(sentence) for sentence in summarizer_luhn(parser.document, sentences_count)]
    return "\n".join(sentences)
Example 16
    def test_two_sentences(self):
        document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("já", "jsem", "a", "ta",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
        self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
Example 17
def __init__(text):
    # text: the source string to summarize
    LANGUAGE = "english"
    SENTENCES_COUNT = 1

    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)

    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)

    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)

    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, None, Tokenizer(LANGUAGE))

    # collect the top sentence from every summarizer
    allvariations = []
    for summarizer in (lsaSummarizer, luhnSummarizer, lexrankSummarizer,
                       textrankSummarizer, sumbasicSummarizer, klSummarizer):
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations
Example 18
    def luhn(self, text_parser):
        assert isinstance(text_parser, plaintext.PlaintextParser)

        summarizer = Luhn()
        summarizer.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)
        return summarizer(text_parser.document, settings.SUMMARIZER_TOP_X_SENTENCES)
Example 19
def test_two_sentences():
    document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 10)
    assert list(map(to_unicode, returned)) == [
        "Já jsem 1. věta",
        "A já ta 2. vítězná výhra",
    ]
Example 21
def sumy_luhn_summarizer(docx):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    luhn_summarizer = LuhnSummarizer(Stemmer("english"))
    luhn_summarizer.stop_words = get_stop_words("english")
    # Summarize the document with 2 sentences
    summary = luhn_summarizer(parser.document, 2)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
Example 22
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Example 23
def luhn_summarizer(data):
    parser = PlaintextParser.from_string(data, sumytoken(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    print("\n", "*" * 30, "LUHN SUMMARIZER", "*" * 30)
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(LANGUAGE)
    result = ''
    for sentence in summarizer_luhn(parser.document, SENTENCES_COUNT):
        result += str(sentence)

    return result
Example 24
def searchGoogle(querystring):
    # TODO: handle exceptions by re-querying Google with the next link; same for pdf and ppt
    num_page = 1
    linkno = 0
    while True:
        # keep searching for an answer to querystring until one is found
        try:
            print(
                '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
            )
            print("QUERY --> " + str(querystring))
            # print(querystring)

            searchresult = google.search(querystring, num_page)

            searchlink = searchresult[linkno].link  # start from the first Google result and advance on failures
            print("Search Link --> " + str(searchlink))

            if searchlink[-4:] == '.pdf' or searchlink[-4:] == '.ppt':
                # skip to the next link if the current one is a ppt or pdf
                print("Can't include ppts or pdfs, trying next link on Google")
                linkno += 1
                if linkno > 9:
                    # links on this page exhausted; go to the next Google results page
                    num_page += 1
                    linkno = 0
            else:
                LANGUAGE = "english"
                SENTENCES_COUNT = 10

                parser = HtmlParser.from_url(searchlink, Tokenizer(LANGUAGE))

                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Summarisation using Luhn Summarizer
                stopwords1 = set(stopwords.words('english'))

                datastring = ''

                # using the LuhnSummarizer
                summarizer = LuhnSummarizer()
                summarizer.stop_words = stopwords1
                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    # print(sentence)
                    datastring += str(sentence)

                return datastring
        except Exception:
            linkno += 1
            if linkno > 9:
                # links on this page exhausted; go to the next Google results page
                num_page += 1
                linkno = 0
Example 25
    def summarize_text(text):
        language = "english"

        parser = PlaintextParser.from_string(text, Tokenizer(language))

        summarizer = LuhnSummarizer(Stemmer(language))
        summarizer.stop_words = sumy.utils.get_stop_words(language)
        summary_text = ""
        for sentence in summarizer(parser.document, 5):
            summary_text += str(sentence) + " "

        return summary_text
Example 26
def luhnReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LuhnSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # sentence._text is a private attribute; str(sentence) is the public equivalent
        sentencesList.append(sentence._text)

    return sentencesList
Example 27
def _get_summary(document):
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = STOP_WORDS

    summary = " "
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary += " ".join(sentence.words)

    return summary
Example 29
def Summarize_Content_Custom(Audio_Text, sentences_count, Summarize_Method):
    # use half of the original sentence count as the summary length
    actual_sentences_count = int(len(sent_tokenize(Audio_Text)) * 0.5)
    parser = PlaintextParser.from_string(Audio_Text, Tokenizer("english"))
    stemmer = Stemmer("english")
    if Summarize_Method == "Gensim":
        # ratio: define length of the summary as a proportion of the text
        temp = summarize(Audio_Text, ratio=0.5)
        sen = Counter(sent_tokenize(temp))
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LexRankSummarizer":
        # LexRank: sentence ranking based on similarity between sentences
        summarizer_Lex = LexRankSummarizer(stemmer)
        summarizer_Lex.stop_words = get_stop_words("english")
        summary = summarizer_Lex(parser.document, actual_sentences_count)
        for value in Counter(summary).most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LuhnSummarizer":
        # Luhn: sentence scoring based on frequency of the most important words
        summarizer_luhn = LuhnSummarizer(stemmer)
        summarizer_luhn.stop_words = get_stop_words("english")
        summary_1 = summarizer_luhn(parser.document, actual_sentences_count)
        for value in Counter(summary_1).most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LsaSummarizer":
        # LSA: latent semantic analysis of the term-sentence matrix
        summarizer_lsa2 = LsaSummarizer(stemmer)
        summarizer_lsa2.stop_words = get_stop_words("english")
        summary = summarizer_lsa2(parser.document, actual_sentences_count)
        for value in Counter(summary).most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "TextRankSummarizer":
        # TextRank: graph-based sentence ranking
        summarizer_text = TextRankSummarizer(stemmer)
        summarizer_text.stop_words = get_stop_words("english")
        summary = summarizer_text(parser.document, actual_sentences_count)
        for value in Counter(summary).most_common(sentences_count):
            print(value[0])
Example 30
    def test_two_sentences_but_one_winner(self):
        document = build_document(
            ("Já jsem 1. vítězná ta věta", "A já ta 2. vítězná věta"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = (
            "já",
            "jsem",
            "a",
            "ta",
        )

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "A já ta 2. vítězná věta")
Example 31
def summarize_url(url,summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Example 32
def Luhn(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file,
                                       Tokenizer(language),
                                       encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the chosen language

    summarizer = LuhnSummarizer(stemmer)  # Luhn algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
Example 33
def webBrowse():
    SENTENCES_COUNT = numOfSent.get()
    parser = HtmlParser.from_url(url.get(), Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizerurl(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    output_path = "C://Users//rakesh chandra//Desktop//ATS//outputU.txt"
    # write through a context manager so the file is closed before the OS opens it
    with open(output_path, 'w') as outputFile:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
            outputFile.write("->  ")
            outputFile.write(str(sentence))
            outputFile.write("\n \n")
    os.startfile(output_path)
Example 34
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
    book_id: (str) the book identifier
    chapter: (int) the chapter number to summarize
    num_sentences: (int) how many sentences to extract
    technique: (str) the summarization algorithm to use; defaults to Luhn

    Returns:
    summary: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
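Note: the returned summary is a tuple of sumy Sentence objects rather than plain strings, so a caller would typically stringify it; for instance (the book id below is hypothetical):

quote = find_relevant_quote("pride_and_prejudice", chapter=3, num_sentences=2, technique="textrank")
print(" ".join(str(sentence) for sentence in quote))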
Example 35
def luhn(docx, x):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    summarizer_1 = LuhnSummarizer()
    summary = summarizer_1(parser.document, x)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
Example 36
    def summarize_with_info(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
            summarizer.bonus_words = parser.significant_words
            summarizer.stigma_words = parser.stigma_words
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)

        return summarizer(parser.document, length)
Example 37
def createSummary(text, language="english", num_sentences=3, method="lexrank"):
    #LANGUAGE = "english"
    #SENTENCES_COUNT = 5
    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    

    # Language tokenizer
    tokenizer = Tokenizer(language)
    parser = PlaintextParser.from_string(text, tokenizer)
    # word stemming
    stemmer = Stemmer(language)

    if (method == "lexrank"):
        summarizer = LexRankSummarizer(stemmer)
    elif (method == "lsa"):
        summarizer = LSASummarizer(stemmer)
    elif (method == "luhn"):
        summarizer = LuhnSummarizer(stemmer)
    elif (method == "kl"):
        summarizer = KLSummarizer(stemmer)
    else:
        raise Exception(f'Unknown summarization method: {method}')

    summarizer.stop_words = get_stop_words(language)

    result = []
    for sentence in summarizer(parser.document, num_sentences):
        result.append(str(sentence))
    
    return result
Example 38
def Luhn_summarizer(docx):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    luhn_summarizer = LuhnSummarizer()
    summary_4 = luhn_summarizer(parser.document, 3)
    summary_list_4 = [str(sentence) for sentence in summary_4]
    result_4 = ' '.join(summary_list_4)
    return result_4
Example 39
def LuhnSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, sentences)
    # for sentence in summary:
    #     print(sentence)
    return summary
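Note: unlike Luhn_summarizer above, LuhnSummary returns sumy Sentence objects rather than a joined string, so callers convert the result themselves, e.g. (document_text being any plain-text string):

summary = LuhnSummary(document_text, 2)
result = " ".join(str(sentence) for sentence in summary)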
Example 40
def test_real_example():
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech")
    )
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
Example 41
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s",)

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
Example 42
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer 
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import sys


LANGUAGE = "english"
SENTENCES_COUNT = int(sys.argv[2])
text_file = sys.argv[1]


if __name__ == "__main__":
    
    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = LuhnSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

Example 43
class TestSentenceRating(unittest.TestCase):
    def setUp(self):
        self.summarizer = LuhnSummarizer()
        self.sentence = build_sentence(
            "Nějaký muž šel kolem naší zahrady a žil pěkný život samotáře")

    def test_significant_words(self):
        self.summarizer.significant_percentage = 1/5
        words = self.summarizer._get_significant_words((
            "wa", "wb", "wc", "wd", "we", "wf", "wg", "wh", "wi", "wj",
            "wa", "wb",
        ))

        self.assertEqual(tuple(sorted(words)), ("wa", "wb"))

    def test_stop_words_not_in_significant_words(self):
        self.summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]
        words = self.summarizer._get_significant_words([
            "stop", "Stop", "StOp", "STOP",
            "halt", "Halt", "HaLt", "HALT",
            "shut", "Shut", "ShUt", "SHUT",
            "hmmm", "Hmmm", "HmMm", "HMMM",
            "some", "relevant", "word",
            "some", "more", "relevant", "word",
        ])

        self.assertEqual(tuple(sorted(words)), ("relevant", "some", "word"))

    def test_zero_rating(self):
        significant_stems = ()
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_single_word(self):
        significant_stems = ("muž",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_single_word_before_end(self):
        significant_stems = ("život",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_single_word_at_end(self):
        significant_stems = ("samotáře",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_two_chunks_too_far(self):
        significant_stems = ("šel", "žil",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0)

    def test_two_chunks_at_begin(self):
        significant_stems = ("muž", "šel",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 2)

    def test_two_chunks_before_end(self):
        significant_stems = ("pěkný", "život",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 2)

    def test_two_chunks_at_end(self):
        significant_stems = ("pěkný", "samotáře",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 4/3)

    def test_three_chunks_at_begin(self):
        significant_stems = ("nějaký", "muž", "šel",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 3)

    def test_three_chunks_at_end(self):
        significant_stems = ("pěkný", "život", "samotáře",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 3)

    def test_three_chunks_with_gaps(self):
        significant_stems = ("muž", "šel", "zahrady",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 9/5)

    def test_chunks_with_user_gap(self):
        self.summarizer.max_gap_size = 6
        significant_stems = ("muž", "šel", "pěkný",)
        self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 9/8)

    def test_three_chunks_with_1_gap(self):
        sentence = build_sentence("w s w s w")
        significant_stems = ("w",)

        self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 9/5)

    def test_three_chunks_with_2_gap(self):
        sentence = build_sentence("w s s w s s w")
        significant_stems = ("w",)

        self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 9/7)

    def test_three_chunks_with_3_gap(self):
        sentence = build_sentence("w s s s w s s s w")
        significant_stems = ("w",)

        self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 1)
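Note: the expected ratings above follow Luhn's classic chunk score: the square of the number of significant words in a chunk, divided by the chunk's total length (the chunk boundaries themselves depend on the summarizer's max_gap_size). A minimal sketch of just that scoring step:

def chunk_rating(chunk_words, significant_stems):
    # Luhn significance factor: (significant words)^2 / chunk length.
    significant = sum(1 for word in chunk_words if word in significant_stems)
    return significant ** 2 / len(chunk_words)

# Matches test_three_chunks_with_1_gap: 3 significant words in a
# 5-word chunk -> 9/5.
assert chunk_rating("w s w s w".split(), {"w"}) == 9 / 5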