def text_rank(text, language):
    """Rank the sentences of *text* with a TextRank-style similarity graph.

    Sentences are tokenized per *language* ('ukrainian' selects the Ukrainian
    analyzer/tokenizer/stop-word set; anything else falls back to Russian),
    lemmatized, and scored via pairwise set similarity fed to ``rank_graph``.

    Returns a list of ``(sentence_index, score, sentence)`` tuples sorted by
    score, highest first.  Returns ``[]`` for text with no sentences
    (previously this crashed with IndexError on ``sentences[0]``).
    """
    # The two language paths differ only in these three objects; selecting
    # them up front removes the duplicated tokenize/lemmatize pipeline.
    if language == 'ukrainian':
        morph = MorphAnalyzer(lang='uk')
        sentences = sent_tokenizer_ua(text)
        stop_words = stop_words_ua
    else:
        morph = MorphAnalyzer()
        sentences = sent_tokenizer_ru(text)
        stop_words = stop_words_ru

    if not sentences:
        # Empty or whitespace-only input: nothing to rank.
        return []
    if len(sentences) < 2:
        # NOTE(review): result tuples elsewhere are (index, score, sentence);
        # (1, 0, s) looks like it should be (0, <score>, s) but is kept
        # byte-identical for caller compatibility — confirm intent.
        return [(1, 0, sentences[0])]

    # One lemma set per sentence; raw lowercased tokens matching a stop word
    # are dropped before lemmatization.
    words = [
        set(morph.parse(word)[0].normalized
            for word in word_tokenizer.tokenize(sentence.lower())
            if word not in stop_words)
        for sentence in sentences
    ]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    # Keep only edges with non-zero similarity.
    scores = [edge for edge in scores if edge[2]]
    pr = rank_graph(scores)

    # Sentences absent from the rank mapping are dropped.
    return sorted(
        ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
        key=lambda x: pr[x[0]],
        reverse=True,
    )
def create_matrix(text, sent_tokenizer, morph, dictionary):
    """Build a term-by-sentence weight matrix for *text*.

    Rows correspond to the lemmas in *dictionary* (lemma -> row index),
    columns to the sentences produced by *sent_tokenizer*.  Occurrences are
    counted first; every non-zero cell is then replaced by
    (row total / grand total), i.e. the lemma's share of all counted tokens.

    Returns a 1x1 zero matrix when either dimension would be empty.
    """
    sentences = sent_tokenizer(text)
    matrix = numpy.zeros((len(dictionary), len(sentences)))

    # Pass 1: raw occurrence counts of dictionary lemmas per sentence.
    for col, sentence in enumerate(sentences):
        for token in word_tokenizer.tokenize(sentence.lower()):
            lemma = morph.parse(token)[0].normalized
            row = dictionary.get(lemma)
            if row is not None:
                matrix[row, col] += 1

    rows, cols = matrix.shape
    if not (rows and cols):
        # Degenerate input: keep downstream shape expectations intact.
        return numpy.zeros((1, 1))

    # Pass 2: rewrite each non-zero cell as row-total / grand-total.
    # (If the grand total is zero every cell is zero, so no division occurs.)
    grand_total = numpy.sum(matrix)
    for row in range(rows):
        row_total = numpy.sum(matrix[row, :])
        for col in range(cols):
            if matrix[row, col]:
                matrix[row, col] = row_total / grand_total
    return matrix
def __call__(self, doc):
    """Lowercase *doc*, tokenize it, and return the lemma of each token.

    Uses the first (most probable) parse from ``self.morph`` per token.
    """
    parse = self.morph.parse  # hoist the bound-method lookup out of the loop
    lemmas = []
    for token in word_tokenizer.tokenize(doc.lower()):
        lemmas.append(parse(token)[0].normalized)
    return lemmas
def create_dictionary(text, morph, stop_words):
    """Map each distinct lemma of *text* to a unique integer index.

    Tokens are lowercased and filtered against *stop_words* BEFORE
    lemmatization (the stop-word test is on the raw token), then normalized
    via *morph*.  Returns a dict {lemma: index}; index order follows set
    iteration order and is therefore arbitrary but internally consistent.
    """
    lemmas = {
        morph.parse(token)[0].normalized
        for token in word_tokenizer.tokenize(text.lower())
        if token not in stop_words
    }
    return {lemma: index for index, lemma in enumerate(lemmas)}