def test_idf_metrics(self):
    summarizer = LexRankSummarizer()

    sentences = [
        ("this", "sentence", "is", "simple", "sentence",),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too",),
        ("not", "every", "sentence", "makes", "me", "happy",),
        ("yes",),
        (),
        ("every", "day", "is", "happy", "day",),
    ]

    metrics = summarizer._compute_idf(sentences)

    expected = {
        "this": 6/2,
        "is": 6/3,
        "yes": 6/2,
        "simple": 6/2,
        "sentence": 6/3,
        "too": 6/1,
        "not": 6/1,
        "every": 6/2,
        "makes": 6/1,
        "me": 6/1,
        "happy": 6/2,
        "day": 6/1,
    }
    self.assertEqual(expected, metrics)
def models_LUHN_LEX_LSA_2(dataframe):
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    stemmer = Stemmer(LANGUAGE)

    # Build each summarizer once instead of re-creating all three per row.
    summarizerLUHN = LUHN(stemmer)
    summarizerLUHN.stop_words = stop
    summarizerLEX = LEX(stemmer)
    summarizerLEX.stop_words = stop
    summarizerLSA = LSA(stemmer)
    summarizerLSA.stop_words = stop

    for i in range(len(dataframe)):
        article = dataframe.loc[i, "post_content"]
        parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

        # Summarize the document with one sentence per algorithm.
        LUHNsentence = summarizerLUHN(parser.document, 1)
        LEXsentence = summarizerLEX(parser.document, 1)
        LSAsentence = summarizerLSA(parser.document, 1)

        # Each summarizer returns a tuple of sentences; keep the only one.
        for sentence1 in LUHNsentence:
            LUHNsummary = sentence1
        for sentence2 in LEXsentence:
            LEXsummary = sentence2
        for sentence3 in LSAsentence:
            LSAsummary = sentence3

        dataframe.loc[i, "LUHN"] = LUHNsummary
        dataframe.loc[i, "LEX"] = LEXsummary
        dataframe.loc[i, "LSA"] = LSAsummary
def summarize_sentences(nlp, text, sentences_count=3):
    text1 = text.replace('\n', '')
    corpus = []
    originals = []
    doc = nlp(text1)
    for s in doc.sents:
        originals.append(s)
        tokens = [t.lemma_ for t in s]
        corpus.append(' '.join(tokens))
    del doc

    # Tokenize the joined corpus again with sumy's Japanese (tinysegmenter) tokenizer.
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']  # a space is recognized as a word, so exclude it

    # sentences_count sets the number of sentences in the summary.
    summary = summarizer(document=parser.document, sentences_count=sentences_count)

    # Map each summary sentence back to the original (unlemmatized) sentence.
    return "".join([originals[corpus.index(str(sentence))].text for sentence in summary])
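# A minimal usage sketch for summarize_sentences above. The spaCy model name
# 'ja_core_news_sm' and the input file name are assumptions for illustration.
import spacy

nlp = spacy.load('ja_core_news_sm')
text = open('article.txt', encoding='utf-8').read()  # hypothetical input file
print(summarize_sentences(nlp, text, sentences_count=3))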
def tense_analyze(self, text, sentences_count):
    # The input has one sentence per line, so split on the sentence terminator.
    # sentences = [t for t in text.split('\n')]
    sentences = [t for t in text.split('。')]

    # Build the morphological analyzer: replace ()「」、。 with spaces, then
    # keep only the base forms of nouns, adjectives, adverbs, and verbs.
    analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), ExtractAttributeFilter('base_form')]
    )

    # Join the extracted words with spaces. The trailing '。' lets the
    # tokenizer used below split the corpus back into sentences.
    corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]

    # Tokenize the joined corpus again with the Japanese (tinysegmenter) tokenizer.
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

    # Extract the summary with LexRank.
    summarizer = LexRankSummarizer()
    # A space is recognized as a word, so register it as a stop word to exclude it.
    summarizer.stop_words = [' ']
    summary = summarizer(document=parser.document, sentences_count=sentences_count)

    return sentences, corpus, summary
def test_idf_metrics():
    summarizer = LexRankSummarizer()

    sentences = [
        ("this", "sentence", "is", "simple", "sentence",),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too",),
        ("not", "every", "sentence", "makes", "me", "happy",),
        ("yes",),
        (),
        ("every", "day", "is", "happy", "day",),
    ]

    metrics = summarizer._compute_idf(sentences)

    expected = {
        "this": math.log(6/3),
        "is": math.log(6/4),
        "yes": math.log(6/3),
        "simple": math.log(6/3),
        "sentence": math.log(6/4),
        "too": math.log(6/2),
        "not": math.log(6/2),
        "every": math.log(6/3),
        "makes": math.log(6/2),
        "me": math.log(6/2),
        "happy": math.log(6/3),
        "day": math.log(6/2),
    }
    assert expected == metrics
def summarize(text):
    sentences = [t for t in text.split('\n')]

    # Morphological analyzer: replace ()「」、。 with spaces, then keep only
    # the base forms of nouns, adjectives, adverbs, and verbs.
    analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), ExtractAttributeFilter('base_form')]
    )

    corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]

    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']
    summary = summarizer(document=parser.document, sentences_count=3)

    # Map each summary sentence back to the original line.
    x = ""
    for sentence in summary:
        x += sentences[corpus.index(str(sentence))]
    return x
def janome_document_summarize(document):
    # Morphological analysis (split into words): normalize, replace
    # ()「」、。 with spaces, keep base forms of nouns/adjectives/adverbs/verbs.
    analyzer = Analyzer(
        char_filters=[UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
        tokenizer=JanomeTokenizer(),
        token_filters=[POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), ExtractAttributeFilter('base_form')]
    )

    text = re.findall("[^。]+。?", document.replace('\n', ''))
    corpus = [' '.join(analyzer.analyze(sentence)) + u'。' for sentence in text]

    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ', '。', '\n']

    # Summarize roughly 30% of the sentences, with a floor of three.
    N = int(len(corpus) / 10 * 3)
    if N <= 0:
        N = 3
    summary = summarizer(document=parser.document, sentences_count=N)

    rst = ''
    print('\nSummary:')
    for sentence in summary:
        print(text[corpus.index(str(sentence))])
        rst += text[corpus.index(str(sentence))]
    return summary, rst
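# A minimal usage sketch for janome_document_summarize above; the input
# file name is an assumption for illustration.
with open('report_ja.txt', encoding='utf-8') as f:
    summary, joined = janome_document_summarize(f.read())
print(joined)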
def node_page():
    nid = request.args.get('id')
    KDB = client.kg_scrapy
    items = KDB.kg_content.find_one({'_id': nid})
    if items is None:
        return "No content"

    LANGUAGE = "chinese"
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Longer articles get a slightly longer summary.
    if len(items['content']) > 500:
        SENTENCES_COUNT = 5
    else:
        SENTENCES_COUNT = 3

    parser = PlaintextParser.from_string(items['content'], Tokenizer(LANGUAGE))
    summary = [str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT)]

    titles = []
    titles_p = DB.pre_titles.find({"parent": items['_id']})
    for item in titles_p:
        irank, grade, softmax = get_rank(item['title'])
        item['rank'] = irank
        item['softmax'] = softmax
        item['grade'] = grade
        titles.append(item)

    return render_template("node.html", **locals())
def test_cosine_similarity_sentences_with_no_common_word_should_be_zero():
    """
    We compute similarity of sentences without a single common word.
    These are considered dissimilar and so have similarity close to 0.0.
    See https://github.com/miso-belica/sumy/issues/58
    """
    sentence1 = ["this", "sentence", "is", "simple", "sentence"]
    tf1 = {"this": 1/2, "sentence": 1.0, "is": 1/2, "simple": 1/2}
    sentence2 = ["that", "paragraph", "has", "some", "words"]
    tf2 = {"that": 1.0, "paragraph": 1.0, "has": 1.0, "some": 1.0, "words": 1.0}
    idf = {
        "this": 2/1,
        "sentence": 2/1,
        "is": 2/1,
        "simple": 2/1,
        "that": 2/1,
        "paragraph": 2/1,
        "has": 2/1,
        "some": 2/1,
        "words": 2/1,
    }
    summarizer = LexRankSummarizer()

    cosine = summarizer.cosine_similarity(sentence1, sentence2, tf1, tf2, idf)

    assert abs(0.0 - cosine) < 0.00001
def summarize_sentences(sentences: str, language="english") -> list:
    """
    Prepares the summary of sentences.
    Calls preprocessing to generate a list of processed sentences.
    Uses LexRank summarization to prepare the summary.

    :param sentences: Sentences from the text file
    :param language: Language used, default=English
    :return: Summary of the source file
    """
    # Prepare the sentences.
    corpus_maker = EnglishCorpus()
    preprocessed_sentences = corpus_maker.preprocessing(sentences)
    preprocessed_sentence_list = corpus_maker.make_sentence_list(preprocessed_sentences)
    corpus = corpus_maker.make_corpus()
    parser = PlaintextParser.from_string(" ".join(corpus), Tokenizer(language))

    # LexRank ranks sentences by graph centrality.
    summarizer = LexRankSummarizer()
    # Exclude stop words, i.e. words that do not affect the meaning of the text.
    summarizer.stop_words = get_stop_words(language)

    # Limit the summary to one-fifth of the article (see README).
    summary = summarizer(document=parser.document, sentences_count=len(corpus) * 2 // 10)
    return list(summary)
def fn_start_document_summarize(text):
    # Morphological analysis (split into words).
    tokenizer = JanomeTokenizer('japanese')
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(r'[(\)「」、。]', ' ')]
    token_filters = [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), ExtractAttributeFilter('base_form')]

    analyzer = Analyzer(
        char_filters=char_filters,
        tokenizer=tokenizer,
        token_filters=token_filters
    )

    corpus = [' '.join(analyzer.analyze(sentence)) + u'。' for sentence in text]

    # Run the summarization.
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']  # a space is recognized as a word, so exclude it

    # The key points of a document are said to be 20-30% of it, so use that
    # as a reference when choosing sentences_count.
    N = 3
    summary = summarizer(document=parser.document,
                         sentences_count=N if len(corpus) < 100 else int(len(corpus) / 100))

    result = ''
    for sentence in summary:
        result += text[corpus.index(str(sentence))]
    return result
def test_modified_cosine_computation(self):
    summarizer = LexRankSummarizer()
    sentence1 = ["this", "sentence", "is", "simple", "sentence"]
    tf1 = {"this": 1/2, "sentence": 1.0, "is": 1/2, "simple": 1/2}
    sentence2 = ["this", "is", "simple", "sentence", "yes", "is", "too", "too"]
    tf2 = {"this": 1/2, "is": 1.0, "simple": 1/2, "sentence": 1/2, "yes": 1/2, "too": 1.0}
    idf = {
        "this": 2/2,
        "sentence": 2/2,
        "is": 2/2,
        "simple": 2/2,
        "yes": 2/1,
        "too": 2/1,
    }

    numerator = sum(tf1[t] * tf2[t] * idf[t]**2 for t in ["this", "sentence", "is", "simple"])
    denominator1 = math.sqrt(sum((tf1[t] * idf[t])**2 for t in sentence1))
    denominator2 = math.sqrt(sum((tf2[t] * idf[t])**2 for t in sentence2))
    expected = numerator / (denominator1 * denominator2)

    cosine = summarizer._compute_cosine(sentence1, sentence2, tf1, tf2, idf)
    self.assertEqual(expected, cosine)
def generate_summary(content):
    if content is None:
        return ""

    language = "english"
    stemmer = Stemmer(language)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    summary = ""

    # Encoding and then decoding clears up some issues with ascii codec parsing.
    sentence_list = [
        unicode(sentence)
        for sentence in summarizer(
            PlaintextParser.from_string(
                content.encode('utf-8').strip().decode('utf-8'),
                Tokenizer(language)
            ).document,
            settings.DEFAULT_SENTENCE_COUNT
        )
    ]

    for sentence in sentence_list:
        excluded = [
            exclude for exclude in settings.DEFAULT_EXCLUDE_SENTENCES
            if exclude.lower() in sentence.lower()
        ]
        word_list = sentence.split(' ')

        # Keep the sentence unless it matches a time pattern, pushes the
        # summary over the length limit, hits an exclusion, or is a single word.
        if settings.TIME_EXCLUSION_REGEX.search(sentence) is None \
                and len(summary) < settings.DEFAULT_SUMMARY_LENGTH \
                and len(excluded) == 0 \
                and len(word_list) > 1:
            summary += " " + sentence

    return summary.replace('>', '').strip()
def test_tf_metrics(self):
    summarizer = LexRankSummarizer()
    sentences = [
        ("this", "sentence", "is", "simple", "sentence"),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too"),
    ]

    metrics = summarizer._compute_tf(sentences)

    expected = [
        {"this": 1/2, "is": 1/2, "simple": 1/2, "sentence": 1.0},
        {"this": 1/3, "is": 2/3, "yes": 1/3, "simple": 1/3, "sentence": 1/3, "too": 1.0},
    ]
    self.assertEqual(expected, metrics)
def summarize(url=None, LANGUAGE='english', SENTENCES_COUNT=2):
    # sumy's stop-word lists are keyed by lowercase language names, so use
    # 'english' rather than 'English'.
    try:
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    except Exception:
        print('\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
        sys.stdout.flush()
        return '\n\n Invalid entry! Please ensure you enter a valid web link \n\n'

    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    result = ''
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result = result + ' ' + str(sentence)

    print('\n\n' + str(url) + '\n\n' + str(result))
    sys.stdout.flush()
    return result
def lexs(parser, sentence_count):
    summarizer = LexRankSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summary = summarizer(parser.document, sentence_count)

    temp = ''
    for sentence in summary:
        temp = temp + str(sentence)
    return temp
def test_article_example():
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)

    assert len(sentences) == 20
def summarize_all(text):
    # Originally declared as __init__() with no self and a return value,
    # which is invalid; this is a plain function over the input text.
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words
    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)
    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    # Collect one summary sentence from each algorithm (LSA, Luhn, LexRank,
    # TextRank, SumBasic, KL-Sum), in that order.
    allvariations = []
    for summarizer in (lsaSummarizer, luhnSummarizer, lexrankSummarizer,
                       textrankSummarizer, sumbasicSummarizer, klSummarizer):
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations
def summarize(url):
    summary = []
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    for sentence in summarizer(parser.document, sent):
        summary.append(sentence._text)
    return ' '.join(summary)
def lexrank_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_LexRank = LexRankSummarizer(stemmer)
    summarizer_LexRank.stop_words = get_stop_words(language)

    sentences = []
    for sentence in summarizer_LexRank(parser.document, sentences_count):
        sentences.append(str(sentence))
    return "\n".join(sentences)
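# A minimal usage sketch for lexrank_summarizer above; the sample text and
# sentence count are assumptions for illustration.
from sumy.nlp.stemmers import Stemmer

text = ("First sentence about LexRank. Second sentence about graphs. "
        "Third sentence about ranking.")
print(lexrank_summarizer(text, Stemmer("english"), "english", 1))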
def test_article_example(self):
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)

    self.assertEqual(len(sentences), 20)
def lexrank_summarizer(text, stemmer, LANGUAGE, SENTENCES_COUNT):
    parser = PlaintextParser.from_string(text, sumytoken(LANGUAGE))
    summarizer_LexRank = LexRankSummarizer(stemmer)
    summarizer_LexRank.stop_words = get_stop_words(LANGUAGE)

    sentences = []
    for sentence in summarizer_LexRank(parser.document, SENTENCES_COUNT):
        sentences.append(str(sentence))
    return " ".join(sentences)
def summarize(text, SENTENCES_COUNT=3, LANGUAGE="english"):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    output = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        output.append(sentence._text + ' ')
    return ''.join(output)
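# A minimal usage sketch for summarize above; the sample text is an
# assumption for illustration.
article = (
    "LexRank builds a graph of sentences linked by cosine similarity. "
    "It then ranks sentences with a PageRank-style power iteration. "
    "The highest-ranked sentences form the extractive summary."
)
print(summarize(article, SENTENCES_COUNT=1))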
def sumy_lex_rank_summarizer(docx):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summarizer = LexRankSummarizer(Stemmer("english"))
    lex_summarizer.stop_words = get_stop_words("english")

    # Summarize the document with 2 sentences.
    summary = lex_summarizer(parser.document, 2)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
def main(debug=False):
    file_name = "../data/report.txt"
    doc = load_data(file_name)
    sentences, corpus = preprocess(doc, debug)

    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']
    summary = summarizer(document=parser.document, sentences_count=3)

    # Print the original sentence that corresponds to each summary sentence.
    for sentence in summary:
        print(sentences[corpus.index(str(sentence))])
def summary(TEXT, LANGUAGE, SENTENCES_COUNT):
    parser = PlaintextParser.from_string(TEXT, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    resume = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        resume += str(sentence) + " "
    return resume.strip()
def lexrankReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentencesList.append(sentence._text)

    return sentencesList
def get_lexrank(tweets):
    sens = [Sentence(t, TwokenizeWrapper()) for t in tweets]
    tweet_document = ObjectDocumentModel([Paragraph(sens)])
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    SENTENCES_COUNT = len(sens)

    # Assumes this Summarizer returns a mapping from sentence to score,
    # so the result can be indexed by the original Sentence objects.
    lex_ranked = summarizer(tweet_document, SENTENCES_COUNT)
    if len(sens) != len(lex_ranked):
        print('lr error')
    return [lex_ranked[s] for s in sens]
def runsumy(method, num, ip_file_path, op_file_path):
    parser = PlaintextParser.from_file(ip_file_path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # The original encode('utf-8').decode('utf-8') round-trip is a no-op in
    # Python 3, so the sentence text is appended directly.
    s = ""
    for word in summarizer(parser.document, int(num)):
        s += word._text
    return s
def __init__(self, content):
    sentence_length = '50%'  # sumy accepts a percentage string for sentences_count
    parser = PlaintextParser.from_string(content, Tokenizer(self.LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summarized = summarizer(parser.document, sentence_length)

    for sentence in summarized:
        self.SUMMARY += "%s\n\n" % self._sentence(sentence)

    self.WORD_COUNT = self._word_counter(content)
    self.SUMMARY_COUNT = self._word_counter(self.SUMMARY)
def summarize_text(text, sentences_count=3, language=settings.DEFAULT_LANGUAGE, as_list=False):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    sentences = [
        unicode(sentence)
        for sentence in summarizer(parser.document, sentences_count)
    ]
    return sentences if as_list else '\n'.join(sentences)
def get_quotes(raw_text):
    parser = PlaintextParser.from_string(clean_text(raw_text), Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentences.append(sentence)
    # The original version built the list but never returned it.
    return sentences
def summarize_text(text):
    language = "english"
    # Create a parser from the string.
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LexRankSummarizer(Stemmer(language))
    summarizer.stop_words = sumy.utils.get_stop_words(language)

    summary_text = ""
    for sentence in summarizer(parser.document, 5):
        summary_text += str(sentence) + " "
    return summary_text
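# A minimal usage sketch for summarize_text above; the sample text is an
# assumption for illustration.
sample = (
    "Sumy parses the text into a document of sentences. "
    "LexRank scores each sentence by graph centrality. "
    "The top-ranked sentences are returned as the summary. "
    "Stop words are excluded from the similarity computation. "
    "The stemmer reduces words to a common root form. "
    "This sketch requests a five sentence summary."
)
print(summarize_text(sample))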
def summarize_file(file_name):
    # For HTML sources:
    # url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_file(file_name, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = summarizer(parser.document, SENTENCES_COUNT)
    list_sentences = [str(sentence) for sentence in sentences]
    return list_sentences
def test_document_is_all_in_upper_case():
    """
    When all words are in upper case, the plaintext parser treats the first
    line as a heading and the LexRank algorithm raises
    "ZeroDivisionError: float division by zero" because there is no sentence
    to summarize. See https://github.com/miso-belica/sumy/issues/25
    """
    parser = PlaintextParser.from_string(
        "JUST WRITING SOME TEXT. TO TEST CASE. WITH ZERO SENTENCES RETURNED. FROM TOKENIZER.",
        Tokenizer("english")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)

    assert len(sentences) == 0
def test_cosine_similarity_for_the_same_sentence_with_duplicate_words_should_be_one():
    """
    We compute similarity of the same sentences. These should be exactly
    the same and therefore have similarity close to 1.0.
    See https://github.com/miso-belica/sumy/issues/58
    """
    sentence1 = ["this", "sentence", "is", "simple", "sentence"]
    tf1 = {"this": 1/2, "sentence": 1.0, "is": 1/2, "simple": 1/2}
    sentence2 = ["this", "sentence", "is", "simple", "sentence"]
    tf2 = {"this": 1/2, "sentence": 1.0, "is": 1/2, "simple": 1/2}
    idf = {
        "this": 2/2,
        "sentence": 2/2,
        "is": 2/2,
        "simple": 2/2,
    }
    summarizer = LexRankSummarizer()

    cosine = summarizer.cosine_similarity(sentence1, sentence2, tf1, tf2, idf)

    assert abs(1.0 - cosine) < 0.00001
def test_power_method_should_return_different_scores_for_sentences():
    """See https://github.com/miso-belica/sumy/issues/26"""
    matrix = numpy.array([
        [0.1, 0.2, 0.3, 0.6, 0.9],
        [0.45, 0, 0.3, 0.6, 0],
        [0.5, 0.6, 0.3, 1, 0.9],
        [0.7, 0, 0, 0.6, 0],
        [0.5, 0.123, 0, 0.111, 0.9],
    ])

    scores = LexRankSummarizer.power_method(matrix, LexRankSummarizer.epsilon)

    assert len(frozenset(scores.tolist())) > 1
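# A minimal standalone sketch of the power iteration that LexRank-style
# rankings rely on: normalize the similarity matrix into a Markov transition
# matrix, then repeatedly multiply a score vector by its transpose until the
# vector stops changing. This is an illustration, not sumy's exact
# implementation.
import numpy

def power_method_sketch(similarity, epsilon=1e-4):
    # Normalize rows so the matrix is row-stochastic (each row sums to 1).
    matrix = similarity / similarity.sum(axis=1, keepdims=True)
    n = len(matrix)
    scores = numpy.full(n, 1.0 / n)  # start from a uniform distribution
    residual = float('inf')
    while residual > epsilon:
        next_scores = matrix.T.dot(scores)  # one step of x <- M^T x
        residual = numpy.linalg.norm(next_scores - scores)
        scores = next_scores
    return scores

# Converges to the stationary distribution, approximately [0.286, 0.714]:
print(power_method_sketch(numpy.array([[0.5, 0.5], [0.2, 0.8]])))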
def do_work(self, worker_id, work):
    """Greenlet to fetch and analyze URL content."""
    url = work
    print('[+] {0}: Starting crawl of {1}'.format(worker_id, url))

    # Using urllib2 via geventhttpclient. Selenium with PhantomJS or a real
    # browser would probably be better, but slower and more expensive. Could
    # have also used scrapy, but that's way too heavy for this use-case.
    body = urlopen(url).read()

    # Using Sumy (built on nltk) for page summaries since it supports a
    # number of ranking algorithms. It's not perfect though: it was written
    # for Czech, so it's missing some important English-specific things
    # (e.g. bonus/significant words for Edmundson summarizers).
    # https://pypi.python.org/pypi/sumy
    # TextBlob might be a better alternative, but it didn't seem to provide
    # overall summary information. https://textblob.readthedocs.org/en/latest/
    parser = HtmlParser.from_string(body, None, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Collect the words of every summary sentence (the original overwrote
    # the list on each iteration, keeping only the last sentence).
    words = []
    for sentence in summarizer(parser.document, 10):
        words.extend(str(sentence).split())

    # Send the results.
    self.work_done(worker_id, words)
summarizer.stop_words = get_stop_words(LANGUAGE)
SENTENCES_COUNT = num_tweets
# for sentence in summarizer(parser.document, SENTENCES_COUNT):
#     print(sentence)
return summarizer(parser.document, SENTENCES_COUNT)

# Scratch test code, kept disabled in a (now closed) string block:
"""
tweets = [l.strip('\n').split('\t')[1]
          for l in open('../data/Add_A_Woman_Improve_A_Movie').readlines()]
#tweets_string = 'HEADER\n\n' + '\n\n'.join(tweets) + '\n'
sens = [Sentence(t, TwokenizeWrapper()) for t in tweets]
tweet_document = ObjectDocumentModel([Paragraph(sens)])
LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
SENTENCES_COUNT = len(sens)
print summarizer(tweet_document, SENTENCES_COUNT)[sens[0]]
#print SENTENCES_COUNT
#print len(summarizer(tweet_document, SENTENCES_COUNT))
#print len(tweets)#_string
#print len(run_lexrank(tweets_string, len(tweets)))
"""