def sumbasic(parser, sentence_count):
    """Summarize *parser*'s document with SumBasic.

    Returns the selected sentences concatenated (no separator), matching
    the original output format.

    Fix: the original ignored *sentence_count* and always requested 5
    sentences; it now honors the parameter. Also replaces quadratic
    string concatenation with str.join.
    """
    # NOTE(review): `language` is read from enclosing/module scope — confirm
    # it is defined where this function lives.
    summarizer = SumBasicSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summary = summarizer(parser.document, sentence_count)
    return "".join(str(sentence) for sentence in summary)
def __init__():
    """Run six sumy summarizers over the module-level ``text`` and return
    every produced sentence.

    Each sentence is printed as it is collected; the algorithms run in the
    order LSA, Luhn, LexRank, TextRank, SumBasic, KL-Sum.
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    # One summarizer per algorithm; each gets its own stop-word list,
    # mirroring the original per-summarizer configuration.
    summarizers = []
    for algorithm in (Lsa, Luhn, LexRank, TxtRank, SumBasic, KL):
        configured = algorithm(stemmer)
        configured.stop_words = get_stop_words(LANGUAGE)
        summarizers.append(configured)

    # NOTE(review): `text` comes from enclosing/module scope — confirm.
    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    allvariations = []
    for summarizer in summarizers:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    return allvariations
def sumbasic_summarizer(text, stemmer, language, sentences_count):
    """Return a SumBasic summary of *text*, one sentence per line."""
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    chosen = summarizer(parser.document, sentences_count)
    return "\n".join(str(sentence) for sentence in chosen)
def sumbasicReferenceSummary(path):
    """Summarize the file at *path* with SumBasic; return the raw text
    (``_text``) of each selected sentence, in document order."""
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return [sentence._text
            for sentence in summarizer(parser.document, SENTENCES_COUNT)]
def sumbasicReferenceSummary(path):
    """SumBasic reference summary of the file at *path*: a list holding the
    raw text of each chosen sentence."""
    stemmer = Stemmer(LANGUAGE)
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    document = PlaintextParser.from_file(path, Tokenizer(LANGUAGE)).document
    picked = []
    for sentence in summarizer(document, SENTENCES_COUNT):
        picked.append(sentence._text)
    return picked
def SumBasic(rsc_file, dst_file, count):
    """Summarize the Chinese plaintext file *rsc_file* with SumBasic and
    write *count* sentences to *dst_file*, one per line.

    Each selected sentence is also echoed to stdout.
    """
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file, Tokenizer(language),
                                       encoding='utf-8')
    # Language-aware stemmer feeding the SumBasic summarizer.
    summarizer = SumBasicSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as out:
        for sentence in summarizer(parser.document, count):
            out.write(str(sentence))
            out.write('\n')
            print(sentence)
def summary(url):
    """Fetch *url*, summarize it, and return each chosen sentence's raw
    text (``_text``) as a list."""
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    collected = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(type(sentence))  # debug output kept from the original
        collected.append(sentence._text)
    return collected
def basic_sum(file, test_ratio=0.10, israndom=True):
    """Evaluate the summarizer with ROUGE on a sample of *file*'s lines.

    Each line must be a JSON object with 'content' (list of strings) and
    'title'. A *test_ratio* fraction of lines is sampled (randomly when
    *israndom* is true, otherwise the first N in order); documents that do
    not yield exactly ``_sent_count`` sentences are skipped. Returns a dict
    mapping each ROUGE metric to its mean over the scored samples.
    """
    lines = file.read().splitlines()
    total = len(lines)
    ntests = int(total * test_ratio)
    seq = np.random.permutation(total) if israndom else np.arange(total)

    # Summarizer shared across all samples.
    summarizer = Summarizer(Stemmer(_language))
    summarizer.stop_words = get_stop_words(_language)

    rouge = Rouge155()
    scores = defaultdict(list)
    for i in range(ntests):
        sample = json.loads(lines[seq[i]])
        ref_text = {'A': sample['title']}
        doc = ' '.join(sample['content'])
        parser = PlaintextParser.from_string(doc, Tokenizer(_language))
        sum_sents = summarizer(parser.document, _sent_count)
        if len(sum_sents) != _sent_count:
            continue  # too short to produce a full summary; skip it
        score = rouge.score_summary(str(sum_sents[0]), ref_text)
        for key, value in score.items():
            scores[key].append(value)
        print('{} / {} processed.'.format(i, ntests), end='\r')

    return {key: mean(values) for key, values in scores.items()}
def _build_summarizer(stop_words, stemmer=None):
    """Create a SumBasic summarizer (stemmer-backed when one is given)
    with *stop_words* installed."""
    if stemmer is None:
        summarizer = SumBasicSummarizer()
    else:
        summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = stop_words
    return summarizer
def _build_summarizer(self, stop_words):
    """Return a plain SumBasicSummarizer configured with *stop_words*."""
    built = SumBasicSummarizer()
    built.stop_words = stop_words
    return built
import os


def createFolder(directory):
    """Create *directory* (including parents) if it does not already exist.

    Failures are reported to stdout rather than raised, preserving the
    original best-effort behavior.
    """
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


LANGUAGE = "bangla"
SENTENCES_COUNT = 2

if __name__ == "__main__":
    # Summarize NCTB source documents 1..139 and write one summary file each.
    createFolder('Dataset/NCTB/SumBasicSummary/')
    for i in range(1, 140):
        serial_no = str(i)
        path = "Dataset/NCTB/Source/" + serial_no + ".txt"
        parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = ""
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary = summary + " " + str(sentence)
        # Fix: the original leaked the file handle (open() without close())
        # and used the odd '+w' mode with no encoding — Bangla text needs an
        # explicit utf-8 encoding on platforms with a non-UTF-8 default.
        with open('Dataset/NCTB/SumBasicSummary/' + serial_no + '.txt',
                  'w', encoding='utf-8') as fi:
            fi.write(summary)
def run_SumBasic(stemmer, document, n):
    """Summarize *document* with SumBasic and return the first *n* words of
    the first selected sentence, space-joined."""
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic: {}".format(n))
    selected = summarizer(document, SENTENCES_COUNT)
    words = str(selected[0]).split()
    return " ".join(words[:n])
def sum_basic(text, config=None):
    """Return a SumBasic summary of *text* as one space-joined string.

    config: optional dict; its 'summary_length' entry (default 1) sets how
    many sentences to select.

    Fix: replaced the mutable default argument ``config={'summary_length': 1}``
    (a shared dict across all calls — classic Python anti-pattern) with
    ``None`` plus an in-body default; behavior for existing callers is
    unchanged.
    """
    if config is None:
        config = {'summary_length': 1}
    summarizer = SumBasicSummarizer(stemmer.lemmatize)
    summarizer.stop_words = STOP_WORDS
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    # .get keeps callers that pass a config without the key working.
    summary = summarizer(parser.document, config.get('summary_length', 1))
    return ' '.join(str(s) for s in summary)
def __summarize(self, parser):
    """Run SumBasic over *parser*'s document and join the chosen sentences
    via the instance's ``__join_sentences`` helper."""
    stemmer = Stemmer(self.__language)
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.__language)
    picked = summarizer(parser.document, self.__sentences_count)
    return self.__join_sentences(picked)
def _build_summarizer(self, stop_words):
    """Construct and return the SumBasic summarizer this instance uses,
    with *stop_words* applied."""
    result = SumBasicSummarizer()
    result.stop_words = stop_words
    return result
def textteaser_test():
    """Summarize input_sample.txt with SumBasic, LSA, TextRank and LexRank,
    plus a TextTeaser->LexRank pipeline, appending all output to
    summary_list.txt (stdout is redirected there for the duration).

    Fixes vs. the original:
      * sys.stdout is restored in a ``finally`` block — the original left it
        pointing at a closed file, breaking every later print in the process;
      * intermediate files use context managers instead of leaked handles;
      * the local formerly named ``file`` no longer shadows the builtin.
    """
    original_stdout = sys.stdout
    summary = open("summary_list.txt", "a", encoding='utf-8-sig')
    sys.stdout = summary
    try:
        # Input article from a plain text file (URL input kept disabled, as
        # in the original).
        parser = PlaintextParser.from_file("input_sample.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        # SumBasic algorithm
        summarizer = SumBasicSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        print("SumBasic:")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
        print("\n")

        # LSA algorithm
        summarizer = LsaSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        print("Latent Semantic Analysis:")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
        print("\n")

        # TextRank algorithm
        summarizer = TextRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        print("TextRank:")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
        print("\n")

        # LexRank algorithm
        summarizer = LexRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        print("LexRank:")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
        print("\n")

        # Featured-LexRank: TextTeaser pre-selects sentences (title = first
        # line of the input), then LexRank ranks that subset.
        with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
            title = f.readline()
        with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
            text = f.read()
        tt = TextTeaser()
        sentences = tt.summarize(title, text)
        print("Featured-LexRank:")
        with open("tt.txt", "w", encoding='utf-8-sig') as tt_out:
            for sentence in sentences:
                tt_out.write("%s\n" % sentence)
        parser = PlaintextParser.from_file("tt.txt", Tokenizer(LANGUAGE))
        summarizer = LexRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
        print("\n")
    finally:
        sys.stdout = original_stdout
        summary.close()
def build_sum_basic(parser, language):
    """Return a SumBasic summarizer for *language*.

    NOTE(review): *parser* is accepted but never used — kept for signature
    compatibility with callers.
    """
    stemmer = Stemmer(language)
    sum_basic = SumBasicSummarizer(stemmer)
    sum_basic.stop_words = get_stop_words(language)
    return sum_basic
from __future__ import absolute_import from __future__ import division, print_function, unicode_literals from sumy.parsers.html import HtmlParser from sumy.parsers.plaintext import PlaintextParser from sumy.nlp.tokenizers import Tokenizer from sumy.summarizers.lsa import LsaSummarizer from sumy.summarizers.sum_basic import SumBasicSummarizer from sumy.nlp.stemmers import Stemmer from sumy.utils import get_stop_words import sys LANGUAGE = "english" SENTENCES_COUNT = int(sys.argv[2]) text_file = sys.argv[1] if __name__ == "__main__": parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE)) stemmer = Stemmer(LANGUAGE) summarizer = SumBasicSummarizer(stemmer) summarizer.stop_words = get_stop_words(LANGUAGE) for sentence in summarizer(parser.document, SENTENCES_COUNT): print(sentence)