def text_rank(text, language):
    """Rank the sentences of *text* with a TextRank-style similarity graph.

    Sentences are tokenized per *language* ('ukrainian' selects the Ukrainian
    analyzer/tokenizer/stop-word set; anything else falls back to Russian),
    lemmatized, and scored via pairwise set similarity fed to ``rank_graph``.

    Returns a list of ``(sentence_index, score, sentence)`` tuples sorted by
    score, highest first.  Returns ``[]`` for text with no sentences
    (previously this crashed with IndexError on ``sentences[0]``).
    """
    # The two language paths differ only in these three objects; selecting
    # them up front removes the duplicated tokenize/lemmatize pipeline.
    if language == 'ukrainian':
        morph = MorphAnalyzer(lang='uk')
        sentences = sent_tokenizer_ua(text)
        stop_words = stop_words_ua
    else:
        morph = MorphAnalyzer()
        sentences = sent_tokenizer_ru(text)
        stop_words = stop_words_ru

    if not sentences:
        # Empty or whitespace-only input: nothing to rank.
        return []
    if len(sentences) < 2:
        # NOTE(review): result tuples elsewhere are (index, score, sentence);
        # (1, 0, s) looks like it should be (0, <score>, s) but is kept
        # byte-identical for caller compatibility — confirm intent.
        return [(1, 0, sentences[0])]

    # One lemma set per sentence; raw lowercased tokens matching a stop word
    # are dropped before lemmatization.
    words = [
        set(morph.parse(word)[0].normalized
            for word in word_tokenizer.tokenize(sentence.lower())
            if word not in stop_words)
        for sentence in sentences
    ]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    # Keep only edges with non-zero similarity.
    scores = [edge for edge in scores if edge[2]]
    pr = rank_graph(scores)

    # Sentences absent from the rank mapping are dropped.
    return sorted(
        ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
        key=lambda x: pr[x[0]],
        reverse=True,
    )
def create_matrix(text, sent_tokenizer, morph, dictionary):
    """Build a term-by-sentence weight matrix for *text*.

    Rows correspond to the lemmas in *dictionary* (lemma -> row index),
    columns to the sentences produced by *sent_tokenizer*.  Occurrences are
    counted first; every non-zero cell is then replaced by
    (row total / grand total), i.e. the lemma's share of all counted tokens.

    Returns a 1x1 zero matrix when either dimension would be empty.
    """
    sentences = sent_tokenizer(text)
    matrix = numpy.zeros((len(dictionary), len(sentences)))

    # Pass 1: raw occurrence counts of dictionary lemmas per sentence.
    for col, sentence in enumerate(sentences):
        for token in word_tokenizer.tokenize(sentence.lower()):
            lemma = morph.parse(token)[0].normalized
            row = dictionary.get(lemma)
            if row is not None:
                matrix[row, col] += 1

    rows, cols = matrix.shape
    if not (rows and cols):
        # Degenerate input: keep downstream shape expectations intact.
        return numpy.zeros((1, 1))

    # Pass 2: rewrite each non-zero cell as row-total / grand-total.
    # (If the grand total is zero every cell is zero, so no division occurs.)
    grand_total = numpy.sum(matrix)
    for row in range(rows):
        row_total = numpy.sum(matrix[row, :])
        for col in range(cols):
            if matrix[row, col]:
                matrix[row, col] = row_total / grand_total
    return matrix
def __call__(self, doc):
    """Lowercase *doc*, tokenize it, and return the lemma of each token.

    Uses the first (most probable) parse from ``self.morph`` per token.
    """
    parse = self.morph.parse  # hoist the bound-method lookup out of the loop
    lemmas = []
    for token in word_tokenizer.tokenize(doc.lower()):
        lemmas.append(parse(token)[0].normalized)
    return lemmas
def create_dictionary(text, morph, stop_words):
    """Map each distinct lemma of *text* to a unique integer index.

    Tokens are lowercased and filtered against *stop_words* BEFORE
    lemmatization (the stop-word test is on the raw token), then normalized
    via *morph*.  Returns a dict {lemma: index}; index order follows set
    iteration order and is therefore arbitrary but internally consistent.
    """
    lemmas = {
        morph.parse(token)[0].normalized
        for token in word_tokenizer.tokenize(text.lower())
        if token not in stop_words
    }
    return {lemma: index for index, lemma in enumerate(lemmas)}