def text_decomposition(text, lang='de'):
    """Split *text* into sentences using a spaCy pipeline chosen by *lang*.

    Args:
        text: Raw text to segment.
        lang: Language code, one of ``'de'``, ``'en'`` or ``'ru'``.

    Returns:
        A list holding the text of every sentence the pipeline found, or
        ``None`` (after printing a message) when *lang* is not supported.
    """
    if lang == 'de':
        nlp = spacy.load('de_core_news_md')
    elif lang == 'en':
        nlp = spacy.load("en_core_web_md")
    elif lang == 'ru':
        # Russian() is instantiated directly instead of going through
        # spacy.load, and a sentencizer component is added to it.
        # NOTE(review): create_pipe() + add_pipe(component) is the spaCy v2
        # calling convention; spaCy v3 expects nlp.add_pipe("sentencizer").
        # Confirm the pinned spaCy version before changing it.
        nlp = Russian()
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
    else:
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return None

    return [sent.text for sent in nlp(text).sents]
def spacy_sentence_scores(self) -> Dict[str, float]:
    """Score every sentence of ``self.text`` by relative word frequency.

    Tokens whose text is in ``STOP_WORDS`` are skipped when counting. The
    remaining tokens are reduced to the normalized form returned by
    ``MORPH.parse(...)[0].normalized`` (presumably a pymorphy2 analyzer --
    confirm), and forms tagged PREP, CONJ, PRCL or INTJ are dropped. Counts
    are divided by the highest count, and each sentence receives the sum of
    the relative frequencies of its tokens that made it into the table.

    Returns:
        A mapping from sentence to score. Sentences without any counted
        word are absent. An empty dict is returned when no word survives
        the filtering (for example for empty text).

        NOTE(review): the keys are the sentence objects yielded by
        ``doc.sents``, not plain strings, despite the annotation.
    """
    # NOTE(review): create_pipe() + add_pipe(component) is the spaCy v2
    # calling convention; confirm the pinned spaCy version before changing.
    nlp = Russian()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(self.text)

    stopwords = set(STOP_WORDS)  # set gives O(1) membership checks
    skipped_tags = ('PREP', 'CONJ', 'PRCL', 'INTJ')
    lemma_cache = {}

    def content_lemma(token_text):
        """Return the normalized word for *token_text*, or None if skipped."""
        # Every token is looked up once while counting and once while
        # scoring, so the morphological parse is memoised per token text.
        if token_text not in lemma_cache:
            normal = MORPH.parse(token_text)[0].normalized
            skipped = any(tag in normal.tag for tag in skipped_tags)
            lemma_cache[token_text] = None if skipped else normal.word
        return lemma_cache[token_text]

    word_frequencies = {}
    for token in doc:
        if token.text in stopwords:
            continue
        lemma = content_lemma(token.text)
        if lemma is not None:
            word_frequencies[lemma] = word_frequencies.get(lemma, 0) + 1

    # max() of an empty sequence raises ValueError; nothing to score then.
    if not word_frequencies:
        return {}

    maximum_frequency = max(word_frequencies.values())
    for lemma in word_frequencies:
        word_frequencies[lemma] = word_frequencies[lemma] / maximum_frequency

    sentence_scores = {}
    for sent in doc.sents:
        for token in sent:
            # The stop-word filter is intentionally not applied here, matching
            # the counting table lookup only (same as the previous behavior).
            lemma = content_lemma(token.text)
            if lemma in word_frequencies:
                sentence_scores[sent] = (
                    sentence_scores.get(sent, 0) + word_frequencies[lemma]
                )
    return sentence_scores