def edmunson(self, text):
    """Summarize *text* with sumy's Edmundson summarizer.

    The summary length is a percentage of the source's sentence count,
    taken from the GUI slider (``self.scale``).

    :param text: plain text to summarize.
    :returns: the selected sentences joined into a single string
        (each sentence prefixed by a space, as before).
    """
    language = "german"
    # Slider value is a percentage; turn it into a divisor
    # (e.g. 25% -> 4).  NOTE(review): a slider value of 0 would raise
    # ZeroDivisionError — presumably the widget's minimum is > 0; confirm.
    divident = 100 / self.scale.get()

    # Tokenize the text and build a stemming Edmundson summarizer.
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)

    # Edmundson requires non-empty word lists; the bonus/stigma/null
    # heuristics are not wanted here, so nonsense placeholders are used
    # to effectively disable them.
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = ["nsdgdf"]
    summarizer.stigma_words = ["mtrtf"]
    summarizer.null_words = ["zngg"]

    # Count sentences directly from the parsed document instead of the
    # old approach of running the summarizer with a 10-billion sentence
    # budget just to count its output.
    count = len(parser.document.sentences)

    # Convert the percentage into a concrete number of sentences.
    sentence_number = round(count / divident)

    # Join the selected sentences; each is prefixed by a space to keep
    # the original output format byte-for-byte.
    return "".join(
        " " + str(sentence)
        for sentence in summarizer(parser.document, sentence_number)
    )
def summarize(text, sentence_count, bonus_words, language='english'):
    """Summarize *text* with sumy's Edmundson summarizer.

    :param text: plain text to summarize.
    :param sentence_count: number of sentences to return.
    :param bonus_words: words whose presence boosts a sentence's score.
    :param language: language name used for tokenizing, stemming and
        stop-word lookup (default ``'english'``).
    :returns: the sentences selected by the summarizer.
    """
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = bonus_words
    # Edmundson rejects empty word lists; this nonsense placeholder
    # effectively disables the stigma-word heuristic.
    summarizer.stigma_words = ['zdfgthdvndadv']
    # Bug fix: null words were hard-coded to English even when a
    # different *language* was requested.
    summarizer.null_words = stopwords.words(language)
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    return summarizer(parser.document, sentence_count)
def edmundson_summarizer(text, stemmer, language, sentences_count):
    """Summarize *text* with sumy's Edmundson summarizer.

    :param text: plain text to summarize.
    :param stemmer: sumy ``Stemmer`` instance for the target language.
    :param language: language name for tokenizing and stop-word lookup.
    :param sentences_count: number of sentences to select.
    :returns: the selected sentences joined with newlines.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = EdmundsonSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    # Fixed word lists steering the Edmundson scoring heuristics.
    summarizer.bonus_words = ("computing", "learning", "mobile")
    summarizer.stigma_words = ("another", "and", "some", "next")
    summarizer.null_words = ("another", "and", "some", "next")
    selected = summarizer(parser.document, sentences_count)
    return "\n".join(str(sentence) for sentence in selected)
def _read_word_list(path):
    # Read one word per line from *path*, stripping surrounding whitespace.
    with open(path, "r") as f:
        return [line.strip() for line in f]


def summarize(srt_file, summarizer, n_sentences, language,
              bonusWords, stigmaWords):
    """Summarize an srt subtitle file.

    Converts the subtitles to a plain-text document, runs the chosen
    sumy summarizer over it, and maps the selected sentences back to
    their subtitle items.

    :param srt_file: indexable sequence of subtitle items.
    :param summarizer: ``'ED'`` for Edmundson, otherwise a key into the
        module-level ``SUMMARIZERS`` mapping.
    :param n_sentences: number of sentences to select.
    :param language: language name for tokenizing/stemming/stop words.
    :param bonusWords: path to a file with one bonus word per line
        (Edmundson only).
    :param stigmaWords: path to a file with one stigma word per line
        (Edmundson only).
    :returns: ``(ranges, subtitles)`` — the time ranges and the subtitle
        items corresponding to the selected sentences.
    """
    parser = PlaintextParser.from_string(
        srt_to_doc(srt_file), Tokenizer(language))

    if summarizer == 'ED':
        summarizer = EdmundsonSummarizer()
        # Word-list files are only read, so plain "r" mode suffices
        # (previously "r+", which needlessly required write permission),
        # and the with-block closes them — no explicit close() needed.
        summarizer.bonus_words = _read_word_list(bonusWords)
        summarizer.stigma_words = _read_word_list(stigmaWords)
        summarizer.null_words = get_stop_words(language)
    else:
        stemmer = Stemmer(language)
        summarizer = SUMMARIZERS[summarizer](stemmer)
        summarizer.stop_words = get_stop_words(language)

    ret = []
    summarizedSubtitles = []
    for sentence in summarizer(parser.document, n_sentences):
        # Each sentence carries its subtitle index embedded as "(N)";
        # recover it to find the originating subtitle item.
        index = int(re.findall(r"\(([0-9]+)\)", str(sentence))[0])
        item = srt_file[index]
        summarizedSubtitles.append(item)
        ret.append(srt_item_to_range(item))

    return ret, summarizedSubtitles
def Edmundson(rsc_file, dst_file, count):
    """Summarize a Chinese text file with the Edmundson summarizer.

    Reads *rsc_file*, selects *count* sentences, writes them one per
    line to *dst_file* (UTF-8) and echoes each to stdout.
    """
    language = "chinese"
    parser = PlaintextParser.from_file(
        rsc_file, Tokenizer(language), encoding='utf-8')
    # Stemmer for the target language feeds the Edmundson summarizer.
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as out:
        for sentence in summarizer(parser.document, count):
            out.write(str(sentence))
            out.write('\n')
            print(sentence)
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Azure Function: summarize the HTML document in the request body.

    Extracts text with BeautifulSoup, summarizes it with sumy, and
    returns the summary. The summary length scales logarithmically
    with the number of sentences in the input.
    """
    logging.info('Python HTTP trigger function processed a request.')
    # NOTE(review): str(bytes) yields "b'...'" with literal escape
    # sequences; the regex below strips them afterwards. Decoding the
    # body would be cleaner but changes observable behavior — left as-is.
    text = str(req.get_body())
    soup = BeautifulSoup(text, features="lxml")
    souped = soup.get_text()

    # Bug fix: math.log2(0) raised ValueError when the text contained
    # no periods, and log2(1) == 0 requested an empty summary. Floor
    # the sentence count at 1 and pass an int to the summarizer.
    periods = souped.count('.')
    sentences_count = max(1, int(math.log2(periods))) if periods else 1

    parser = PlaintextParser.from_string(souped, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # join avoids quadratic += concatenation; output bytes unchanged.
    ret = "".join(
        str(sentence)
        for sentence in summarizer(parser.document, sentences_count)
    )
    # Strip residual byte-escape artifacts (e.g. "\xe2") left by str(bytes).
    return func.HttpResponse(re.sub(r'\\\w{3}', '', ret))
# sys.setdefaultencoding does not exist on Python 3 (and is removed by
# site.py on Python 2 unless sys is reloaded), so the bare call raised
# AttributeError. Guard it so the script runs on either interpreter.
if hasattr(sys, 'setdefaultencoding'):
    sys.setdefaultencoding('utf8')

# Load per-document summary sizes from a "name,size" CSV.
# NOTE(review): `sizes` and SIZE_FILE are assumed to be defined earlier
# in the file (outside this chunk) — confirm.
with open(SIZE_FILE, 'r') as size_file:
    for line in size_file:
        # Robustness: skip blank lines instead of crashing on parts[1].
        if not line.strip():
            continue
        parts = line.split(",")
        sizes[parts[0]] = int(parts[1])

nltk.data.path.append('/home/kariminf/Data/NLTK/')

# Summarize each listed document and write the baseline output.
for doc_name in sizes:  # renamed from `eval`, which shadowed the builtin
    txt_path = "src/body/text/en/" + doc_name
    parser = PlaintextParser.from_file(txt_path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = extract(summarizer, sizes[doc_name])
    # Output name drops the 9-char source suffix and adds ".txt".
    out_path = "baselines/EdmundsonSummarizer/en/" + doc_name[:-9] + ".txt"
    with open(out_path, "w") as fout:
        fout.write(summary)