def summarize(text, language, n=5):
    """Return whichever of the TextRank and LSA summaries is closer to *text*.

    Both summaries are built with the same ``n`` and compared against the
    source text using ``get_cosine`` over ``text_to_vector`` vectors.

    Args:
        text: The source text to summarize.
        language: Language identifier forwarded to ``text_rank`` and
            ``LSASummarizer.summarize``.
        n: Number of leading ``text_rank`` entries to keep; also forwarded
            to ``LSASummarizer.summarize``.

    Returns:
        The LSA summary when its cosine score is strictly greater than the
        TextRank score, otherwise the TextRank summary (ties go to TextRank).
    """
    ranked = text_rank(text, language)
    # Keep the first n entries, sort them, and join the third element of each.
    # NOTE(review): presumably each entry's third element is the sentence text
    # and the sort restores a stable order — confirm against text_rank.
    text_rank_summary = ' '.join(entry[2] for entry in sorted(ranked[:n]))

    lsa_summary = LSASummarizer.summarize(text, language, n)

    text_rank_score = get_cosine(
        text_to_vector(text), text_to_vector(text_rank_summary)
    )
    lsa_score = get_cosine(text_to_vector(text), text_to_vector(lsa_summary))

    return lsa_summary if lsa_score > text_rank_score else text_rank_summary
def abstract():
    """Render the summarization page, filling in results for a submitted form.

    When the form has been submitted, the summary of ``form.body`` is stored
    on ``form.result`` and its cosine similarity to the source text is stored
    on ``form.cosine``. The same template is rendered in both cases.

    Returns:
        The rendered ``abstract.html`` template.
    """
    form = AbstractForm()
    if form.is_submitted():
        source_text = form.body.data
        form.result = summarize(
            source_text, form.language.data, int(form.rows.data)
        )
        # form.result now holds the value returned by summarize, not a form
        # field, so it is vectorized directly rather than through `.data`.
        form.cosine = get_cosine(
            text_to_vector(source_text), text_to_vector(form.result)
        )
    return render_template('abstract.html', form=form, template='form-template')
def _summary_sizes(sentence_count):
    """Return the sentence counts used for the Summary20 and Summary40 columns."""
    if sentence_count < 5:
        return 2, 2
    if sentence_count < 10:
        return 2, 4
    if sentence_count < 20:
        return 4, 8
    return sentence_count // 5, sentence_count // 3


def test(language):
    """Summarize every row of a dataset CSV and write the scored results.

    Reads ``UA_dataset_full.csv`` when ``language`` is ``'ukrainian'`` and
    ``RU_dataset.csv`` otherwise. Each row is expected to provide an id, a
    title and a text in its first three columns. For every row two summaries
    of different lengths are produced with ``GeneralSummarizer.summarize``
    and scored against the source text with ``get_cosine``. The results are
    written to ``'result' + filename`` with every field quoted.

    Args:
        language: Language identifier; selects the dataset file and the
            sentence tokenizer, and is forwarded to the summarizer.
    """
    is_ukrainian = language == 'ukrainian'
    filename = 'UA_dataset_full.csv' if is_ukrainian else 'RU_dataset.csv'
    # The two languages differ only in the sentence tokenizer used to size
    # the summaries; everything else is shared.
    tokenize = sent_tokenizer_ua if is_ukrainian else sent_tokenizer_ru

    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields are read back unchanged.
    with io.open(filename, 'r', newline='', encoding="utf-8") as file:
        records = [(row[0], row[1], row[2]) for row in csv.reader(file)]

    row_list = [[
        "Id", "Title", "Body", "Summary20", "Cosine20", "Summary40", "Cosine40"
    ]]
    for record_id, title, body in records:
        size20, size40 = _summary_sizes(len(tokenize(body)))
        result20 = GeneralSummarizer.summarize(body, language, size20)
        result40 = GeneralSummarizer.summarize(body, language, size40)
        cosine20 = get_cosine(text_to_vector(result20), text_to_vector(body))
        cosine40 = get_cosine(text_to_vector(result40), text_to_vector(body))
        row_list.append([
            record_id, title, body,
            result20, str(cosine20),
            result40, str(cosine40),
        ])

    with io.open('result' + filename, 'w', newline='', encoding="utf-8") as file:
        writer = csv.writer(file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_ALL)
        writer.writerows(row_list)