def get_plain_text(cleaned_html_node, summary_sentences_qty):
    """
    Summarizes text from an html element.

    :param cleaned_html_node: html node to extract text sentences from
    :param summary_sentences_qty: quantity of sentences of summarized text
    :return: summarized text, two-digit language code
    """
    # Tokenizer for splitting text into sentences.
    sent_tokenizer = PunktSentenceTokenizer()

    # Assemble text only from complete sentences, i.e. those ending with
    # terminal punctuation, skipping fragments that are just numbers
    # (e.g. page markers).
    clean_parts = []
    for node in cleaned_html_node.iter('p'):
        if node.text is not None and len(node.text.strip(' \n\b\t')) > 0:
            for sentence in sent_tokenizer.tokenize(node.text):
                sentence = sentence.strip(' \n\b\t')
                if (sentence
                        and sentence[-1] in ('.', '!', '?', '…')
                        and not sentence.strip(' .!?…').isdigit()):
                    clean_parts.append(sentence)

    # str.join avoids the quadratic cost of repeated string concatenation
    # (and the spurious leading space the old `text + ' ' + sentence`
    # accumulation produced).
    clean_text = ' '.join(clean_parts)

    # Creating summary, obtaining language code and total sentences quantity.
    final_result, lang_code, sent_qty = create_referat(clean_text, '',
                                                       summary_sentences_qty)
    return final_result, lang_code
def textrank(text, hdr):
    """
    Rank the sentences of *text* with the TextRank algorithm.

    :param text: full text whose sentences are to be ranked
    :param hdr: header/title, used together with the text for language detection
    :return: list of (sentence index, pagerank score, sentence) tuples sorted
             by score in descending order, and the detected language code
    """
    sentences = PunktSentenceTokenizer().tokenize(text)

    # Find the most probable language of header + body combined, and pick
    # the matching stemmer (falling back to English).
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]
    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))

    # Represent each sentence as the set of its stemmed word tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    stem_sets = []
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence.lower())
        stem_sets.append({stemmer.stemWord(token) for token in tokens})

    # Build the similarity graph: one weighted edge per sentence pair with a
    # non-zero similarity score.
    graph = nx.Graph()
    for i, j in combinations(range(len(sentences)), 2):
        weight = similarity(stem_sets[i], stem_sets[j])
        if weight:
            graph.add_edge(i, j, weight=weight)

    # PageRank only scores sentences that made it into the graph, hence the
    # membership guard below.
    ranks = nx.pagerank(graph)
    ranked = [(i, ranks[i], s) for i, s in enumerate(sentences) if i in ranks]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked, lang_code