Example #1
def getSummarizedList(sqs):
    output = ""

    # Directory checking
    if not os.path.exists(Dir):
        os.makedirs(Dir)

    try:
        summary = open(Dir + "input.txt", "w", encoding='utf-8-sig')
        file = open(Dir + "headline_summary.txt", "w", encoding='utf-8-sig')
    except OSError:
        print("Failed to open the summary output files")

    date = ""
    # filtering data
    for i in sqs:
        title = i.title.rstrip()
        pub_date = dateReformat(i.pub_date)

        # Creating new date dataset
        if pub_date != date:
            if date != "":
                local_summary.close()
                sys.stdout = file
                #summarizer = LexRankSummarizer(Stemmer(LANGUAGE))  # LexRankSummarizer does not work if the number of sentences exceeds ~25
                summarizer = LsaSummarizer(Stemmer(LANGUAGE))
                summarizer.stop_words = get_stop_words(LANGUAGE)
                headline = PlaintextParser.from_file(Dir + date + ".txt",
                                                     Tokenizer(LANGUAGE))

                for sentence in summarizer(headline.document, SENTENCES_COUNT):
                    print(sentence)

            output = output + pub_date + "\n"
            date = pub_date
            local_summary = open(Dir + date + ".txt",
                                 "w",
                                 encoding='utf-8-sig')

        local_summary.write(title + ".\n")
        output = output + title + ".\n"

        #For last post summarization#
        if title == sqs.latest('pub_date').title.rstrip():
            local_summary.close()
            sys.stdout = file
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
            summarizer.stop_words = get_stop_words(LANGUAGE)
            headline = PlaintextParser.from_file(Dir + date + ".txt",
                                                 Tokenizer(LANGUAGE))
            for sentence in summarizer(headline.document, SENTENCES_COUNT):
                print(sentence)
        #############################

    summary.write(output)
    file.close()
    summary.close()
    testing = readSummarizerResultToList("headline_summary.txt")

    return testing
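Example #1 depends on module-level state that the snippet does not show: the Dir output directory, LANGUAGE, SENTENCES_COUNT, the sumy imports, and project helpers such as dateReformat and readSummarizerResultToList. A minimal sketch of that assumed preamble, with hypothetical values:

import os
import sys

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

Dir = "summaries/"    # hypothetical output directory
LANGUAGE = "english"  # hypothetical tokenizer language
SENTENCES_COUNT = 3   # hypothetical summary length
# dateReformat and readSummarizerResultToList are project helpers not shown here.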
Example #2
    def summarizeEdmundsonTitle(self, text, SENTENCES_COUNT, LANGUAGE):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))

        stemmer = Stemmer(LANGUAGE)

        nullWords = get_stop_words(LANGUAGE)
        summarizer = EdmundsonTitleMethod(stemmer, nullWords)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        summaryList = summarizer(parser.document, SENTENCES_COUNT)
        summary = ' '.join([str(sentence) for sentence in summaryList])  # join with spaces so sentences do not run together

        return summary
Example #3
def Summarize_Content_Custom(Audio_Text, sentences_count, Summarize_Method):
    # sumy expects an integer sentence count, so take (an integer) half of the sentence total
    actual_sentences_count = int(len(sent_tokenize(Audio_Text)) * 0.5)
    parser = PlaintextParser.from_string(Audio_Text, Tokenizer("english"))
    stemmer = Stemmer("english")
    if (Summarize_Method == "Gensim"):
        #ratio: define length of the summary as a proportion of the text
        temp = summarize(Audio_Text, ratio=0.5)
        sen = sent_tokenize(temp)
        sen = Counter(sen)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif (Summarize_Method == "LexRankSummarizer"):
        # Using LexRank(Sentence based ranking based on repeating sentences)
        summarizer_Lex = LexRankSummarizer(stemmer)
        summarizer_Lex.stop_words = get_stop_words("english")
        #Summarize the document with 2 sentences
        summary = summarizer_Lex(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif (Summarize_Method == "LuhnSummarizer"):
        # Using LUHN(Sentence based on frequency of most important words)
        summarizer_luhn = LuhnSummarizer(stemmer)
        summarizer_luhn.stop_words = get_stop_words("english")
        summary_1 = summarizer_luhn(parser.document, actual_sentences_count)
        sen = Counter(summary_1)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif (Summarize_Method == "LsaSummarizer"):
        # Using LSA(Sentence based on frequency of most important words)
        summarizer_lsa2 = LsaSummarizer()
        summarizer_lsa2 = LsaSummarizer(stemmer)
        summarizer_lsa2.stop_words = get_stop_words("english")
        summary = summarizer_lsa2(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif (Summarize_Method == "TextRankSummarizer"):
        # Using LSA(Sentence based on frequency of most important words)
        summarizer_text = TextRankSummarizer()
        summarizer_text = TextRankSummarizer(stemmer)
        summarizer_text.stop_words = get_stop_words("english")
        summary = summarizer_text(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
Example #4
def summarize(srt_file, summarizer, n_sentences, language, bonusWords,
              stigmaWords):
    # Convert the srt file to a plain text document and pass it to the Sumy (text summarization) library functions.
    parser = PlaintextParser.from_string(srt_to_doc(srt_file),
                                         Tokenizer(language))

    if (summarizer == 'ED'):
        summarizer = EdmundsonSummarizer()

        with open(bonusWords, "r+") as f:
            bonus_wordsList = [x.strip() for x in f.readlines()]
        with open(stigmaWords, "r+") as f:
            stigma_wordsList = [x.strip() for x in f.readlines()]

        summarizer.bonus_words = bonus_wordsList
        summarizer.stigma_words = stigma_wordsList
        summarizer.null_words = get_stop_words(language)
    else:
        stemmer = Stemmer(language)
        summarizer = SUMMARIZERS[summarizer](stemmer)
        summarizer.stop_words = get_stop_words(language)

    ret = []
    summarizedSubtitles = []
    # The document is now summarized; we can access the selected sentences along with their indices.

    for sentence in summarizer(parser.document, n_sentences):
        # Index of the sentence
        # print("sentence ",sentence)
        index = int(re.findall("\(([0-9]+)\)", str(sentence))[0])
        # Using the index we determine the subtitle to be selected
        item = srt_file[index]
        # print("item ",item)
        summarizedSubtitles.append(item)

        # add the selected subtitle to the result array
        ret.append(srt_item_to_range(item))

    return ret, summarizedSubtitles
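SUMMARIZERS is not defined in this snippet; it presumably maps short method codes to sumy summarizer classes, each of which is instantiated with a Stemmer in the else branch. A hypothetical reconstruction (the keys are guesses; 'ED' is handled separately above):

from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

SUMMARIZERS = {
    'LSA': LsaSummarizer,
    'LR': LexRankSummarizer,
    'LUHN': LuhnSummarizer,
    'TR': TextRankSummarizer,
}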
Example #5
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Example #6
def summarize(text):
    if isvalid(text):
        all_capital = False
        # an all-caps text can yield empty output, so lowercase it here and restore the upper case afterwards
        if text.upper() == text:
            text = text.lower()
            all_capital = True

        if PY2:
            parser = PlaintextParser.from_string(
                text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
        else:
            parser = PlaintextParser.from_string(
                text.encode().decode('ascii', errors='ignore'),
                Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = [
            str(s)
            for s in summarizer(parser.document, sentences_count=n_sentences)
        ]

        if all_capital:
            output_sentences = ' '.join(sentences).upper()
            all_capital = False
        else:
            output_sentences = ' '.join(sentences)

        return output_sentences
    else:
        return ''
Example #7
def node_page():
    nid = request.args.get('id')
    KDB = client.kg_scrapy
    items = KDB.kg_content.find_one({'_id': nid})
    if items is None:
        return "没有内容"  # "No content"
    else:

        LANGUAGE = "chinese"
        SENTENCES_COUNT = 10
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        if len(items['content']) > 500:
            SENTENCES_COUNT = 5
        else:
            SENTENCES_COUNT = 3
        parser = PlaintextParser.from_string(items['content'],
                                             Tokenizer(LANGUAGE))
        summary = []

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary.append(str(sentence))
        titles = []
        titles_p = DB.pre_titles.find({"parent": items['_id']})
        for item in titles_p:
            irank, grade, softmax = get_rank(item['title'])
            # print(irank,grade,softmax)
            # print((items[i]))
            item['rank'] = irank
            item['softmax'] = softmax
            item['grade'] = grade
            titles.append(item)

        return render_template("node.html", **locals())
Example #9
def summarizer(request):
    inp = request.POST['geturl']

    LANGUAGE = "english"
    SENTENCES_COUNT = 10

    url = str(inp)

    f = open("denemedosyasiU3.txt", "w")

    f.write(url)

    f.close()

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        summary += str(sentence) + " "

    # render once, after collecting all sentences
    return render(request, 'home.html', {'data1': summary})
Example #10
def summarizer(input_obj, SENTENCES_COUNT=2, op='url'):
    LANGUAGE = "english"
    # SENTENCES_COUNT = 1
    # url =  "https://sea.pcmag.com/smartphones/17424/apple-iphone-x"

    # text = ' '.join(text.split())
    # print(input_obj)
    # print(type(input_obj))
    parser = None
    if op == 'text':
        text = input_obj['text']
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif op == 'url':
        url = input_obj['link']
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    else:
        # fail fast; otherwise parser is None and parser.document raises below
        raise ValueError('op must be "text" or "url"')
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print(sentence)
        sentences.append(str(sentence))
    return sentences


# print(get_summarize("https://sea.pcmag.com/smartphones/17424/apple-iphone-x"))
Example #11
def summarize(corpus, length, algorithm):
    summarizer = None
    summary = "No compatible summarizer was selected, please use one of these : textrank, lexrank, luhn, edmonson*, kl, lsa, sumbasic, random (* doesn\'t work yet)"
    algorithm = algorithm.lower()
    try:
        parser = PlaintextParser.from_string(corpus,Tokenizer(LANGUAGE))
        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(LANGUAGE))

        if summarizer:
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join([obj._text for obj in summarizer(parser.document, length)])

        return summary

    except Exception as e:
        return str(e)
Example #12
def pdfToText(BookPAth, bookid):
    pdfContent = ""
    summaryText = ""
    MEDIA = ROOT_MEDIA + str(BookPAth)
    pdfFileObj = open(MEDIA, 'rb')

    PdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    totalPage = PdfReader.numPages
    for page_number in range(totalPage):
        page = PdfReader.getPage(page_number)
        pdfContent = pdfContent + page.extractText()
    pdfFileObj.close()
    with open(RESULT_ROOT + "BookText" + bookid + ".txt", "w") as f:
        f.write(pdfContent)

    LANGUAGE = "czech"
    SENTENCES_COUNT = 50
    parser = PlaintextParser.from_file(
        RESULT_ROOT + "BookText" + bookid + ".txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summaryText = summaryText + str(sentence)

    with open(RESULT_ROOT + "SummaryText" + bookid + ".txt", "w") as f:
        f.write(summaryText)

    return pdfContent
Example #13
File: iatv.py Project: mtpain/iatv
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): text itself or file in memory of text
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''

    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif hasattr(text, 'read'):  # file-like object; the py2 built-in file type no longer exists
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
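A hypothetical call site for the helper above, assuming LANGUAGE and the sumy imports are already defined at module level:

with open('transcript.txt') as f:  # hypothetical input file
    text = f.read()
print(summarize(text, n_sentences=5, sep='\n'))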
Example #14
def summarize_article(article, vibe_description_file_path):
    try:
        article_url = article['alternate'][0]['href']
        article_title = article['title']

        article_newspaper = Article(article_url)
        article_newspaper.download()
        article_newspaper.parse()
        article_newspaper.nlp()

        text_content = article_newspaper.text
        update_json_file(vibe_description_file_path, 'textContent',
                         text_content)

        LANGUAGE = 'english'
        parser = HtmlParser.from_url(article_url, Tokenizer('english'))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        article_summary = []
        for sentence in summarizer(parser.document, 3):
            article_summary.append(sentence._text)

    except Exception:
        print('Error summarizing article')
        return False

    update_json_file(vibe_description_file_path, 'summary', article_summary)
    update_json_file(vibe_description_file_path, 'keywords',
                     article_newspaper.keywords)

    return True
Example #15
def main(url, num_sentences=10, language='english'):
	parser = HtmlParser.from_url(url, Tokenizer(language))
	stemmer = Stemmer(language)
	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words(language)
	for sentence in summarizer(parser.document, num_sentences):
		print(sentence)
Example #16
    def summarize_with_info(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
            summarizer.bonus_words = parser.significant_words
            summarizer.stigma_words = parser.stigma_words
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)

        return summarizer(parser.document, length)
Example #17
    def summarize(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus,Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        summary = " ".join([obj._text for obj in summarizer(parser.document, length)])

        return summary
Example #18
def SumySummarize(text):

    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3
    import nltk
    nltk.download('punkt')

    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    s = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        s += str(sentence)
    return s
Example #19
 def get_summary(self, summary_length: int = 10) -> Iterator[str]:
     parser = HtmlParser.from_url(self.link, Tokenizer(LANGUAGE))
     stemmer = Stemmer(LANGUAGE)
     summarizer = Summarizer(stemmer)
     summarizer.stop_words = get_stop_words(LANGUAGE)
     for sentence in summarizer(parser.document, summary_length):
         yield str(sentence)  # plain strings, matching the Iterator[str] annotation
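A hypothetical caller, where article stands for whatever object carries the self.link URL:

for sentence in article.get_summary(summary_length=3):
    print(sentence)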
Example #20
def summarizeFile(inputFile):
	summarizer = LsaSummarizer(stem_word)
	summarizer.stop_words = get_stop_words("english")
	url = findURLS(inputFile)
	if url is not None:
		if url[-1] == '.':
			url = url[0:-1]
		#print (url)
		#urlContent = 'Summary from URL ['+url+']: \n'
		urlContent = ''
		try:
			parser = HtmlParser.from_url(url, Tokenizer("english"))		
			for sentence in summarizer(parser.document, 3):
				urlContent = urlContent + str(sentence) + '\n'
		except Exception:
			#print (sys.exc_info()[0])
			urlContent = ''
	content = inputFile.read()
	parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
	#summarizer = LsaSummarizer(stem_word)
	#summarizer.stop_words = get_stop_words(LANGUAGE)
	#summary = 'Event Summary: \n'
	summary = ''
	try:
		for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
			summary = summary + str(sentence) + '\n'
	except AssertionError:
		return None
	if url is not None:
		return summary + urlContent
	return summary
Example #21
    def compute(self, text):
        text = text.replace("\t", " ").replace("\f", " ").replace("\n",
                                                                  " ").strip()

        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        summ = " ".join([
            sentence._text
            for sentence in summarizer(parser.document, self.limit_sentences)
        ])

        if (len(summ) > 0):
            keywords = self.keyword_extractor.extract_keywords(summ)
        else:
            keywords = self.keyword_extractor.extract_keywords(text)

        keywords = [kw[0] for kw in keywords]

        metadata = {
            "keywords": keywords,
            "summary_orig": summ,
            "summary": self.fix_summary(summ)
        }

        return metadata
Example #22
def get_sumy(text):
    """
    获取摘要
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 5
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summary_sentences = []
    items = []  # returned for API compatibility; never populated here
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary_sentences.append(str(sentence))

    return summary_sentences, items
Example #23
    def edmunson(self, text):

        # Select the language
        language = "german"
        # Get the percentage from the slider widget
        divisor = 100 / self.scale.get()

        # Tokenize the text and attach a stemmer to the summarizer
        parser = PlaintextParser.from_string(text, Tokenizer(language))
        stemmer = Stemmer(language)
        summarizer = Summarizer(stemmer)

        # Define the specific word lists.
        # The bonus, stigma and null words are not meant to be used, but the summarizer rejects empty input, so dummy words are supplied.
        summarizer.stop_words = get_stop_words(language)
        summarizer.bonus_words = ["nsdgdf"]
        summarizer.stigma_words = ["mtrtf"]
        summarizer.null_words = ["zngg"]

        summary = ""
        count = 0

        # Count the number of sentences
        for sentence in summarizer(parser.document, 10000000000):
            count += 1

        # Derive the sentence count from the percentage
        sentence_number = round(count / divisor)

        # Join the sentences into a single text
        for sentence in summarizer(parser.document, sentence_number):
            summary += " " + str(sentence)

        return summary
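For instance, with the slider at 25% and a 40-sentence document, divisor is 100 / 25 = 4, so sentence_number rounds to 40 / 4 = 10 sentences.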
Example #24
    def summarize_with_info(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
            summarizer.bonus_words = parser.significant_words
            summarizer.stigma_words = parser.stigma_words
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)

        return summarizer(parser.document, length)
Example #25
    def filter_stop(words):
        # fetch the stop-word list once, outside the loop; sumy's bundled lists are keyed by lowercase language names
        stop_words = get_stop_words('czech')
        words_stop = list()
        for w in words:
            if w not in stop_words:
                words_stop.append(w)

        return words_stop
Example #26
    def post(self):
        """
        Extract summary (key sentences) from text
        """
        # data = api.payload
        data = request.json
        text = data['text']
        num_sentences = data['num_sentences']
        num_sentences = num_sentences if isinstance(
            num_sentences, int) else DEFAULT_NUM_SENTENCES
        log.debug('num_sentences={}'.format(num_sentences))

        # log.debug('text: {}'.format(text))

        # TODO: check for minimum number of sentences in text?

        summary_sentences = []
        if text:
            parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))

            stemmer = Stemmer(LANGUAGE)
            summarizer = TextRankSummarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)

            summary = summarizer(parser.document, num_sentences)
            # summary_text = ' '.join([sentence._text for sentence in summary])
            summary_sentences = [sentence._text for sentence in summary]

        log.debug('response body:\n{}'.format(summary_sentences))
        return summary_sentences, 200, {'Access-Control-Allow-Origin': '*'}
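For reference, a request against this handler might look like the following sketch; the route and port are hypothetical, only the text and num_sentences fields come from the code above:

import requests

resp = requests.post(
    'http://localhost:5000/summary',  # hypothetical route
    json={'text': 'Some long article text ...', 'num_sentences': 3},
)
print(resp.json())  # a list of key sentences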
Example #27
    def __init__(self,
                 modelfn=None,
                 classnames=None,
                 language="english",
                 explainer=None,
                 summarizer=None,
                 fm=962,
                 topfeaturescount=100,
                 sentencescount=6,
                 logger=None):
        self.fm = fm
        self.modelfn = modelfn
        self.classnames = classnames
        self.topfeaturescount = topfeaturescount
        self.language = language
        self.sentencescount = sentencescount

        if explainer is not None:
            self.explainer = explainer
        else:
            self.explainer = lime_text.LimeTextExplainer(
                class_names=self.classnames)

        if summarizer is not None:
            self.summarizer = summarizer
        else:
            self.summarizer = TextRankSummarizer(Stemmer(self.language))
            self.summarizer.stop_words = get_stop_words(self.language)

        if logger is not None:
            self.log = logger
        else:
            self.log = logging.getLogger()
Example #28
def get_sumy(
            sentences_count: int = 10,
            body: str = "",
            url: Optional[str] = None
    ) -> str:
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    if url is None:
        from sumy.parsers.plaintext import PlaintextParser as Parser
        item = (body,)  # one-tuple, so *item unpacks as a single argument below
    else:
        from sumy.parsers.html import HtmlParser as Parser
        item = (body, url)
        DBG(f'Sumy HTML, url: {url}')

    tokenizer = Tokenizer(LANGUAGE)
    parser = Parser.from_string(*item, tokenizer)
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = summarizer(parser.document, sentences_count)
    summary = [str(sentence) for sentence in summary]
    summary = ' '.join(summary)
    return summary
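The *item unpacking works because the two parsers line up on from_string: PlaintextParser.from_string(text, tokenizer) versus HtmlParser.from_string(html, url, tokenizer). A minimal sketch of the HTML path with hypothetical inputs:

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer

html = '<html><body><p>Some article text.</p></body></html>'
parser = HtmlParser.from_string(html, 'http://example.com/article', Tokenizer('english'))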
Example #29
def models_LUHN_LEX_LSA_2(dataframe):
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    size = len(dataframe)
    stemmer = Stemmer(LANGUAGE)

    for i in range(0, size):
        article = dataframe.loc[i, "post_content"]

        parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

        summarizerLUHN = LUHN(stemmer)
        summarizerLUHN.stop_words = stop

        summarizerLEX = LEX(stemmer)
        summarizerLEX.stop_words = stop

        summarizerLSA = LSA(stemmer)
        summarizerLSA.stop_words = stop

        LUHNsentence = summarizerLUHN(parser.document, 1) #summarize document with one sentence
        LEXsentence = summarizerLEX(parser.document, 1) #summarize document with one sentence
        LSAsentence = summarizerLSA(parser.document, 1) #summarize document with one sentence

        for sentence1 in LUHNsentence:
            LUHNsummary = sentence1
        for sentence2 in LEXsentence:
            LEXsummary = sentence2
        for sentence3 in LSAsentence:
            LSAsummary = sentence3

        dataframe.loc[i, "LUHN"] = LUHNsummary
        dataframe.loc[i, "LEX"] = LEXsummary
        dataframe.loc[i, "LSA"] = LSAsummary
Example #30
    def store_summary(self):
        
        for item in self.doc_id_url:
            if item < len(self.document_info):
                #soup = self.document_info[item]
                s = requests.Session()
                response = s.get(self.doc_id_url[item])
                if response.status_code != 404:
                    parser = HtmlParser.from_url(self.doc_id_url[item], Tokenizer("english"))
                    text = ""
                    """
                    for tag in soup.findAll('p'):
                        text = text + tag.text
                    """
                    stemmer = Stemmer("english")

                    summarizer = Summarizer(stemmer)
                    summarizer.stop_words = get_stop_words("english")
                    
                    for sentence in summarizer(parser.document, 5):
                        print(sentence)
                        if item in self.summary:
                            self.summary[item] = self.summary[item] + str(sentence)
                        else:
                            self.summary[item] = str(sentence)
Example #31
def get_sum():

    # news_text = "Encoder contains the input words that want to be transformed (translate, generate summary), and each word is a vector that go through forward and backward activation with bi-directional RNN. Then calculate the attention value for each words in encoder reflects its importance in a sentence. Decoder generates the output word one at a time, by taking dot product of the feature vector and their corresponding attention for each timestamp."
    new_list, news_text = get_news_link_content()

    LANGUAGE = "english"
    SENTENCES_COUNT = 4

    # print(news_text)

    # if __name__ == "__main__":
    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # import os
    # sss = os.path.join(os.path.dirname(__file__))
    # parser = PlaintextParser.from_file(sss+"/xixi.txt", Tokenizer(LANGUAGE))

    parser = PlaintextParser.from_string(news_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sum_newss = ""

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sum_newss += str(sentence)

    return sum_newss
Example #32
def print_news(url, content='title'):
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    g = Goose()

    article = g.extract(url=url)

    #If there is a meta description available, print that else go for
    #summarize
    if content == 'full' and article.meta_description:
        print(article.meta_description)
        return

    news_text = article.cleaned_text

    parser = PlaintextParser.from_string(news_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    if content == 'title' or content == 'full':
        #Print article title
        print('\t* ' + article.title.encode('ascii', 'ignore').decode())

    if content == 'full':
        #Print a n-line summary
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
    return
Example #33
def find_summary_stopwords():
    p = PlaintextParser.from_file("testtext.txt", Tokenizer("english"))
    s_lsa = LsaSummarizer(Stemmer("english"))
    s_lsa.stop_words = get_stop_words("english")
    for s in s_lsa(p.document, 2):
        print(s)
Example #34
def sum_from_string(string, language="english", sentences_count=100):
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    # note: the attribute is stop_words; a misspelled name would be silently ignored
    summarizer.stop_words = get_stop_words(language)
    sentences = summarizer(parser.document, sentences_count)
    return sentences
Example #35
def createSummary (text, language="english", num_sentences=3, method="lexrank"):
    #LANGUAGE = "english"
    #SENTENCES_COUNT = 5
    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    

    # Language tokenizer
    tokenizer = Tokenizer(language)
    parser = PlaintextParser.from_string(text, tokenizer)
    # word stemming
    stemmer = Stemmer(language)

    if (method == "lexrank"):
        summarizer = LexRankSummarizer(stemmer)
    elif (method == "lsa"):
        summarizer = LSASummarizer(stemmer)
    elif (method == "luhn"):
        summarizer = LuhnSummarizer(stemmer)
    elif (method == "kl"):
        summarizer = KLSummarizer(stemmer)
    else:
        raise Exception(f'Unknown summarization method: {method}')

    summarizer.stop_words = get_stop_words(language)

    result = []
    for sentence in summarizer(parser.document, num_sentences):
        result.append(str(sentence))
    
    return result
Example #36
def get_data_list(URL, file_type=""):
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        # parser = None
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer

        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    finally:
        return sentences
Example #37
def summarize_news(news):
    """
    

    Parameters
    ----------
    news : LIST(str)
        List of news data to be summarized.

    Returns
    -------
    sentence : str
        Most important sentence in the given articles.

    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    parsed = []
    for data in news:
        parser = PlaintextParser.from_string(data, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            parsed.append(str(sentence))
    return parsed
Example #38
def summarize(string, summary_length = 1, language = "english"):
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    return ". ".join([str(sentence) for sentence in summarizer(parser.document, summary_length)]) 
Example #39
    def luhn(self,text_parser):
        assert isinstance(text_parser,plaintext.PlaintextParser)

        summarizer = Luhn()
        # stop words must be supplied explicitly
        summarizer.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)
        return summarizer(text_parser.document, settings.SUMMARIZER_TOP_X_SENTENCES)
Example #40
def summarize(url):
    summary = []
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    for sentence in summarizer(parser.document, sent):
        summary.append(sentence._text)
    return ' '.join(summary)
Example #41
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence)
    return total
Example #42
def lsa(comment, parser, num):
	summarizer = LsaSummarizer(stemmer)
	summarizer.stop_words = get_stop_words(LANGUAGE)

	LSAstr = ''
	for sentence in summarizer(parser.document, num):
		LSAstr += str(sentence)

	return LSAstr
Example #43
def retreive_sumy(url):
    # "http://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)	
    return summarizer(parser.document, SENTENCES_COUNT)
Example #44
def summarize(filename, num_sentences):
    with open(filename, "r") as myfile:
        data = myfile.read()
    parser = PlaintextParser.from_string(data, Tokenizer('english')) 
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    summary = ""
    for sentence in summarizer(parser.document, num_sentences):
        summary += str(sentence).encode('ascii', 'ignore').decode().replace('"', '').replace("'", '').strip() + " "
    return summary
Example #45
def summary(text, summarizer_class):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        yield sentence
Example #46
def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        [str(sentence) for sentence in summarizer(parser.document, COUNT)]
    )
    summary = Summary(content=content, summary=text)
    summary.save()
Example #47
 def summary(self, int1, int2):
     # int1, int2 are the places between which to look for
     # the summary to be taken (slicing the corpus as a string)
     parser = PlaintextParser(self.corpus[int1:int2], Tokenizer("english"))
     summarizer = LsaSummarizer(stem_word)
     summarizer.stop_words = get_stop_words("english")
     self.summary_text = " ".join(
         map(lambda x:x._text,
             summarizer(parser.document, 20)))
     return self.summary_text
Example #48
 def summarizeText(self, body, numSentences = 10):
     """Summarizes body of text to numSentences
     """
     parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join([str(sentence) for sentence in summarizer(parser.document, numSentences)])
     return summary
Example #49
def summary(text):

    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    short = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        short = short + ">" + "* " + str(sentence).decode('ascii','ignore') + "\n\n"
        #print(sentence)
    return short
Example #50
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"),
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
Example #51
 def summarizeUrl(self, url, numSentences = 10):
     """Summarizes text at a given url to numSentences
     """
     parser = HtmlParser.from_url(url, Tokenizer(self.LANG))
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join([str(sentence) for sentence in summarizer(parser.document, numSentences)])
     return summary
     
Example #52
def summarize(parser, sentences_count):
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = ""
    for sentence in summarizer(parser.document, sentences_count):
        sentences += " " + str(sentence)

    return sentences
Example #53
    def test_issue_5_sigma_can_multiply_matrix_v(self):
        """Source: https://github.com/miso-belica/sumy/issues/5"""
        parser = PlaintextParser.from_string(
            load_resource("articles/sigma_can_multiply_matrix_v.txt"),
            Tokenizer("english")
        )
        summarizer = LsaSummarizer(english_stemmer)
        summarizer.stop_words = get_stop_words("english")

        sentences = summarizer(parser.document, 20)
        self.assertEqual(len(sentences), 20)
Example #54
def summarize(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ""

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + " "

    return result
Example #55
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
Example #56
    def lsa(self,text_parser):
        assert isinstance(text_parser,plaintext.PlaintextParser)

        # process the text
        summarizer = LSA()
        # stop words must be supplied explicitly
        summarizer.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)
        return summarizer(text_parser.document, settings.SUMMARIZER_TOP_X_SENTENCES)
Example #57
def get_summary(text, max_sentences=5):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	stemmer = Stemmer("english")

	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words("english")

	summary = []
	for sentence in summarizer(parser.document, max_sentences):
		summary.append(sentence._text.encode('ascii', 'ignore').decode())

	return summary
Example #58
def get_lexrank(tweets):
    sens = [Sentence(t, TwokenizeWrapper()) for t in tweets]
    tweet_document = ObjectDocumentModel([Paragraph(sens)])
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    SENTENCES_COUNT = len(sens)
    lex_ranked = summarizer(tweet_document, SENTENCES_COUNT)
    if len(sens) != len(lex_ranked):
        print('lr error')
    # map each original sentence to its LexRank position (0 = highest ranked)
    ranks = {s: i for i, s in enumerate(lex_ranked)}
    return [ranks[s] for s in sens]
Example #59
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
Example #60
def summarize(method, length, url):
    html_content = fetch_url(url)
    iso_lang = detect_language(html_content)
    language = SUMY_LANGUAGES[iso_lang]
    stemmer = Stemmer(language)
    parser = HtmlParser.from_string(html_content, url, Tokenizer(language))

    summarizer_class = AVAILABLE_METHODS[method]
    summarizer = build_summarizer(summarizer_class, get_stop_words(language), stemmer, parser)

    sentences = summarizer(parser.document, ItemsCount(length))
    summary = ' '.join([str(sentence) for sentence in sentences])
    return summary, iso_lang
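SUMY_LANGUAGES, AVAILABLE_METHODS, and build_summarizer are assumed names here, as are the fetch_url and detect_language helpers. A hypothetical reconstruction consistent with how they are called (ItemsCount comes from sumy.utils):

from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer

SUMY_LANGUAGES = {'en': 'english', 'cs': 'czech', 'de': 'german'}  # ISO 639-1 code -> sumy language name
AVAILABLE_METHODS = {'lsa': LsaSummarizer, 'lex-rank': LexRankSummarizer, 'luhn': LuhnSummarizer}

def build_summarizer(summarizer_class, stop_words, stemmer, parser):
    # The plain methods need only a stemmer and stop words; an Edmundson variant
    # would also pull bonus/stigma/null words from the parser, hence the argument.
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = stop_words
    return summarizer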