Example #1
def test_morph():
    # MorphTokenizer splits the hyphenated 'dvd-диски' and attaches
    # morphological analyses (OpenCorpora-style grammemes) to the Russian part.
    tokenizer = MorphTokenizer()
    tokens = list(tokenizer('dvd-диски'))
    assert tokens == [
        Token('dvd', (0, 3), LATIN),
        Token('-', (3, 4), PUNCT),
        MorphToken('диски', (4, 9),
                   RUSSIAN,
                   forms=[
                       Form('диск',
                            Grams({'NOUN', 'inan', 'masc', 'nomn', 'plur'})),
                       Form('диск',
                            Grams({'NOUN', 'accs', 'inan', 'masc', 'plur'}))
                   ])
    ]
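
A minimal usage sketch, assuming these tests come from the yargy library and MorphTokenizer is importable from yargy.tokenizer:

from yargy.tokenizer import MorphTokenizer

tokenizer = MorphTokenizer()
for token in tokenizer('dvd-диски'):
    # Each token carries its text, a (start, stop) span and a type; the
    # Russian token additionally carries its candidate morphological forms.
    print(token)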
Example #2
def test_types():
    # The plain Tokenizer labels every token with a type (RUSSIAN, LATIN,
    # PUNCT, INT, OTHER) and splits hyphenated words into their parts.
    tokenizer = Tokenizer()
    tokens = list(tokenizer('Ростов-на-Дону'))
    assert tokens == [
        Token('Ростов', (0, 6), RUSSIAN),
        Token('-', (6, 7), PUNCT),
        Token('на', (7, 9), RUSSIAN),
        Token('-', (9, 10), PUNCT),
        Token('Дону', (10, 14), RUSSIAN)
    ]

    tokens = list(tokenizer('vk.com'))
    assert tokens == [
        Token('vk', (0, 2), LATIN),
        Token('.', (2, 3), PUNCT),
        Token('com', (3, 6), LATIN)
    ]

    tokens = list(tokenizer('1 500 000$'))
    assert tokens == [
        Token('1', (0, 1), INT),
        Token('500', (2, 5), INT),
        Token('000', (6, 9), INT),
        Token('$', (9, 10), PUNCT)
    ]

    tokens = list(tokenizer('π'))
    assert tokens == [Token('π', (0, 1), OTHER)]
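
A sketch of consuming the typed tokens, assuming each Token exposes value, span and type attributes matching the constructor arguments asserted above, with type values spelled like the constants (e.g. 'INT'):

from yargy.tokenizer import Tokenizer

tokenizer = Tokenizer()
for token in tokenizer('1 500 000$'):
    # Keep only the integer tokens; '$' is typed as punctuation above.
    if token.type == 'INT':
        print(token.value, token.span)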
Example #3
def tokenize(self, doc):
    # Adapter: the idx/text attributes of the incoming tokens match spaCy's
    # Token API; each is wrapped in this library's Span/Token types.
    for token in doc:
        span = Span(token.idx, token.idx + len(token.text))
        token_type = self.get_token_type(token)
        yield token, Token(token.text, span, token_type)
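
A self-contained sketch of the adapter pattern above; Span, Token, the surrounding class and get_token_type are assumptions reconstructed from the snippet, while token.idx, token.text, token.is_punct and token.like_num are real spaCy Token attributes:

from collections import namedtuple

Span = namedtuple('Span', 'start stop')
Token = namedtuple('Token', 'value span type')

class SpacyTokenizerAdapter:
    def get_token_type(self, token):
        # Crude type guess, for illustration only.
        if token.is_punct:
            return 'PUNCT'
        if token.like_num:
            return 'INT'
        return 'WORD'

    def tokenize(self, doc):
        for token in doc:
            span = Span(token.idx, token.idx + len(token.text))
            token_type = self.get_token_type(token)
            yield token, Token(token.text, span, token_type)

# Hypothetical usage with spaCy (model name is illustrative):
#   import spacy
#   nlp = spacy.load('en_core_web_sm')
#   adapter = SpacyTokenizerAdapter()
#   for original, wrapped in adapter.tokenize(nlp('vk.com')):
#       print(wrapped)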