# Example #1 (score: 0)
def text_rank(text, language):
    """Rank the sentences of *text* with the TextRank algorithm.

    Parameters:
        text: the raw document to summarize.
        language: 'ukrainian' selects the Ukrainian analyzer/tokenizer/stop
            words; any other value falls back to Russian.

    Returns:
        A list of (index, score, sentence) tuples sorted by score, highest
        first.  An empty text yields an empty list; a single-sentence text
        yields that sentence with a placeholder entry.
    """
    # Pick the language-specific resources once; the rest of the pipeline
    # is identical for both languages (the original duplicated it).
    if language == 'ukrainian':
        morph = MorphAnalyzer(lang='uk')
        sentences = sent_tokenizer_ua(text)
        stop_words = stop_words_ua
    else:
        morph = MorphAnalyzer()
        sentences = sent_tokenizer_ru(text)
        stop_words = stop_words_ru

    # Guard: an empty text used to crash on sentences[0] (IndexError).
    if not sentences:
        return []
    if len(sentences) < 2:
        # Nothing to rank against — return the lone sentence as-is.
        return [(1, 0, sentences[0])]

    # One set of normalized (lemmatized), stop-word-filtered tokens per
    # sentence; sets make the pairwise similarity computation cheap.
    words = [
        set(morph.parse(word)[0].normalized
            for word in word_tokenizer.tokenize(sentence.lower())
            if word not in stop_words)
        for sentence in sentences
    ]

    # Score every unordered sentence pair; keep only pairs with a non-zero
    # similarity so the graph contains no dead edges.
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = [edge for edge in scores if edge[2]]
    pr = rank_graph(scores)

    # Sentences absent from the graph (no similar neighbors) get no rank.
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)
# Example #2 (score: 0)
def create_matrix(text, sent_tokenizer, morph, dictionary):
    """Build a term-by-sentence weight matrix for *text*.

    Parameters:
        text: the raw document.
        sent_tokenizer: callable splitting text into sentences.
        morph: morphological analyzer; ``parse(w)[0].normalized`` lemmatizes.
        dictionary: mapping of normalized word -> row index.

    Returns:
        A ``(len(dictionary), n_sentences)`` numpy array where each cell
        occupied by a word holds that word's document-wide frequency
        (row total / grand total); a degenerate input yields ``zeros((1, 1))``.
    """
    sentences = sent_tokenizer(text)

    words_count = len(dictionary)
    sentences_count = len(sentences)

    # Raw occurrence counts: rows = dictionary words, columns = sentences.
    matrix = numpy.zeros((words_count, sentences_count))
    for col, sentence in enumerate(sentences):
        for word in word_tokenizer.tokenize(sentence.lower()):
            word = morph.parse(word)[0].normalized
            if word in dictionary:
                matrix[dictionary[word], col] += 1

    # Empty dictionary or no sentences: keep the original 1x1 placeholder.
    if matrix.size == 0:
        return numpy.zeros((1, 1))

    total = matrix.sum()
    if total:
        # Replace each non-zero count with its word's overall frequency
        # (row total / grand total).  NOTE: the original called the row
        # total "unique_word_count", but it is the word's total occurrence
        # count across all sentences.  Vectorized form of the original
        # O(rows*cols) Python double loop — identical output.
        row_totals = matrix.sum(axis=1, keepdims=True)
        matrix = numpy.where(matrix > 0, row_totals / total, 0.0)
    return matrix
 def __call__(self, doc):
     """Tokenize *doc* (lower-cased) and return its normalized tokens.

     Each token is lemmatized via the instance's morphological analyzer
     (``self.morph``); the result preserves token order and duplicates.
     """
     tokens = word_tokenizer.tokenize(doc.lower())
     return [self.morph.parse(token)[0].normalized for token in tokens]
# Example #4 (score: 0)
def create_dictionary(text, morph, stop_words):
    """Map each distinct normalized word of *text* to an integer index.

    Tokens appearing in *stop_words* (checked before lemmatization) are
    skipped.  Index assignment follows set iteration order, exactly as in
    the original implementation.
    """
    normalized = set()
    for token in word_tokenizer.tokenize(text.lower()):
        if token in stop_words:
            continue
        normalized.add(morph.parse(token)[0].normalized)
    return {word: index for index, word in enumerate(normalized)}