def text_rank(text, language):
    """Rank sentences of *text* with TextRank using lemma-set overlap.

    Args:
        text: Raw input text.
        language: ``'ukrainian'`` selects the Ukrainian pipeline; any other
            value falls back to Russian.

    Returns:
        A list of ``(sentence_index, rank, sentence)`` triples sorted by
        rank, highest first. A text with fewer than two sentences is
        short-circuited with a dummy triple.

    Raises:
        IndexError: if the text tokenizes to zero sentences (``sentences[0]``
            on an empty list) — behavior inherited from the original.
    """
    # Select the language-specific tools once instead of duplicating the
    # whole pipeline in each branch.
    if language == 'ukrainian':
        morph = MorphAnalyzer(lang='uk')
        sentences = sent_tokenizer_ua(text)
        stop_words = stop_words_ua
    else:
        morph = MorphAnalyzer()
        sentences = sent_tokenizer_ru(text)
        stop_words = stop_words_ru

    if len(sentences) < 2:
        # NOTE(review): (1, 0, s) puts the constant 1 where the main return
        # puts the sentence index and 0 where it puts the rank — confirm
        # callers actually expect this shape before changing it.
        return [(1, 0, sentences[0])]

    # One set of normalized (lemmatized) non-stop words per sentence.
    words = [
        set(
            morph.parse(word)[0].normalized
            for word in word_tokenizer.tokenize(sentence.lower())
            if word not in stop_words
        )
        for sentence in sentences
    ]

    # Score every sentence pair; drop edges with zero similarity.
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = [edge for edge in scores if edge[2]]

    pr = rank_graph(scores)
    # Sort the (index, rank, sentence) triples by rank, descending.
    return sorted(
        ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
        key=lambda x: pr[x[0]],
        reverse=True,
    )
def text_rank(text, language):
    """Rank sentences of *text* with TextRank over TF-IDF row similarity.

    NOTE(review): this redefines ``text_rank`` and shadows the earlier
    lemma-based implementation in this file — confirm which is intended.

    Args:
        text: Raw input text.
        language: ``'ukrainian'`` selects the Ukrainian pipeline; any other
            value falls back to Russian.

    Returns:
        A list of ``(sentence_index, rank, sentence)`` triples sorted by
        rank, highest first. A text with fewer than two sentences is
        short-circuited with a dummy triple.
    """
    # Pick the language-specific tokenizer and stop-word list once.
    if language == 'ukrainian':
        sent_tokenizer, stop_words = sent_tokenizer_ua, stop_words_ua
    else:
        sent_tokenizer, stop_words = sent_tokenizer_ru, stop_words_ru

    sentences = sent_tokenizer(text)
    if len(sentences) < 2:
        # NOTE(review): shape differs from the (index, rank, sentence)
        # triples below — confirm callers expect this.
        return [(1, 0, sentences[0])]

    # One TF-IDF row per sentence. (The original also instantiated a
    # MorphAnalyzer here that was never used; that dead code is removed.)
    a = tfidf(text, language, sent_tokenizer, stop_words)

    # Score every sentence pair by TF-IDF row similarity; drop zero edges.
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(a[i, :], a[j, :])) for i, j in pairs]
    scores = [edge for edge in scores if edge[2]]

    pr = rank_graph(scores)
    # Sort the (index, rank, sentence) triples by rank, descending.
    return sorted(
        ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
        key=lambda x: pr[x[0]],
        reverse=True,
    )
def test(language):
    """Summarize every article in the language's gazeta CSV file.

    Reads ``(id, title, text)`` rows from the input CSV, summarizes each
    text to roughly 10% of its sentences (minimum 2), and writes
    ``result<filename>`` with columns Id, Title, Body, Summary.

    Args:
        language: ``'ukrainian'`` selects the Ukrainian file and tokenizer;
            any other value falls back to Russian.
    """
    # The two original language branches were identical except for the
    # input file and the sentence tokenizer — select both up front.
    if language == 'ukrainian':
        filename = 'ua_gazeta.csv'
        sent_tokenizer = sent_tokenizer_ua
    else:
        filename = 'ru_gazeta.csv'
        sent_tokenizer = sent_tokenizer_ru

    test_data = []
    with io.open(filename, 'r', encoding="utf-8") as file:
        for row in csv.reader(file):
            test_data.append({"id": row[0], "title": row[1], "text": row[2]})

    row_list = [["Id", "Title", "Body", "Summary"]]
    for entry in test_data:
        text = entry["text"]
        sentences = sent_tokenizer(text)
        # Target ~10% of the sentences, but never fewer than 2.
        sentence_count = 2 if len(sentences) < 20 else len(sentences) // 10
        entry["result"] = GeneralSummarizer.summarize(text, language,
                                                      sentence_count)
        row_list.append([entry["id"], entry["title"], entry["text"],
                         entry["result"]])

    with io.open('result' + filename, 'w', newline='',
                 encoding="utf-8") as file:
        writer = csv.writer(file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_ALL)
        writer.writerows(row_list)
def test(language):
    """Summarize every dataset article at two compression levels.

    NOTE(review): this redefines ``test`` and shadows the earlier
    gazeta-based implementation in this file — confirm which is intended.

    For each ``(id, title, text)`` row, the text is summarized to roughly
    20% and 40% of its sentences (with floors for short texts), cosine
    similarity of each summary against the original text is computed, and
    everything is written to ``result<filename>``.

    Args:
        language: ``'ukrainian'`` selects the Ukrainian file and tokenizer;
            any other value falls back to Russian.
    """
    # The two original language branches were identical except for the
    # input file and the sentence tokenizer — select both up front.
    if language == 'ukrainian':
        filename = 'UA_dataset_full.csv'
        sent_tokenizer = sent_tokenizer_ua
    else:
        filename = 'RU_dataset.csv'
        sent_tokenizer = sent_tokenizer_ru

    test_data = []
    with io.open(filename, 'r', encoding="utf-8") as file:
        for row in csv.reader(file):
            test_data.append({"id": row[0], "title": row[1], "text": row[2]})

    row_list = [[
        "Id", "Title", "Body", "Summary20", "Cosine20", "Summary40", "Cosine40"
    ]]
    for entry in test_data:
        text = entry["text"]
        n = len(sent_tokenizer(text))
        # ~20% / ~40% of the sentences, with floors so very short texts
        # still produce a non-trivial summary.
        if n < 10:
            count20 = 2
            count40 = 2 if n < 5 else 4
        elif n < 20:
            count20, count40 = 4, 8
        else:
            count20, count40 = n // 5, n // 3

        result20 = GeneralSummarizer.summarize(text, language, count20)
        result40 = GeneralSummarizer.summarize(text, language, count40)
        cosine20 = get_cosine(text_to_vector(result20), text_to_vector(text))
        cosine40 = get_cosine(text_to_vector(result40), text_to_vector(text))

        entry["result20"] = result20
        entry["result40"] = result40
        row_list.append([
            entry["id"], entry["title"], entry["text"],
            entry["result20"], str(cosine20),
            entry["result40"], str(cosine40),
        ])

    with io.open('result' + filename, 'w', newline='',
                 encoding="utf-8") as file:
        writer = csv.writer(file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_ALL)
        writer.writerows(row_list)