def test_change_rules():
    tokenizer = Tokenizer().add_rules(EMAIL_RULE)
    values = tokenizer.split('mailto:[email protected]')
    assert values == ['mailto', ':', '[email protected]']

    tokenizer = Tokenizer().remove_types(EOL)
    text = """
hi,

the
"""
    values = tokenizer.split(text)
    assert values == ['hi', ',', 'the']
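
EMAIL_RULE is not defined in the snippet above. A minimal sketch of how such a rule might be declared with TokenRule, the same class the later examples use; the type name and pattern here are assumptions, not yargy's built-in rule:

from yargy.tokenizer import TokenRule

# Hypothetical email rule; the actual EMAIL_RULE used above is defined elsewhere.
EMAIL_RULE = TokenRule('EMAIL', r'[\w.+-]+@[\w-]+\.[\w.]+')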
Example #2
def test_check_type():
    tokenizer = Tokenizer()
    with pytest.raises(ValueError):
        tokenizer.check_type('UNK')

    tokenizer.remove_types(EOL)
    with pytest.raises(ValueError):
        tokenizer.check_type(EOL)
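
check_type raises ValueError both for a name the tokenizer never knew ('UNK') and for a type that was valid until remove_types dropped it (EOL). A hedged sketch of the passing case, assuming check_type simply does not raise for known types:

# Assumed to pass silently while EOL is still registered.
Tokenizer().check_type(EOL)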
Example #3
def test_types():
    tokenizer = Tokenizer()
    tokens = list(tokenizer('Ростов-на-Дону'))
    assert tokens == [
        Token('Ростов', (0, 6), RUSSIAN),
        Token('-', (6, 7), PUNCT),
        Token('на', (7, 9), RUSSIAN),
        Token('-', (9, 10), PUNCT),
        Token('Дону', (10, 14), RUSSIAN)
    ]

    tokens = list(tokenizer('vk.com'))
    assert tokens == [
        Token('vk', (0, 2), LATIN),
        Token('.', (2, 3), PUNCT),
        Token('com', (3, 6), LATIN)
    ]

    tokens = list(tokenizer('1 500 000$'))
    assert tokens == [
        Token('1', (0, 1), INT),
        Token('500', (2, 5), INT),
        Token('000', (6, 9), INT),
        Token('$', (9, 10), PUNCT)
    ]

    tokens = list(tokenizer('π'))
    assert tokens == [Token('π', (0, 1), OTHER)]
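
Each Token above carries its text, a (start, stop) character span, and a type. A quick sketch for inspecting how the tokenizer segments arbitrary input, with the attribute names inferred from the assertions above:

tokenizer = Tokenizer()
for token in tokenizer('пример, example 42'):
    # e.g. ('пример', (0, 6), RUSSIAN), (',', (6, 7), PUNCT), ...
    print(token.value, token.span, token.type)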
Example #5
def token_fgos(self, text):
    tokenizer = Tokenizer()
    fgos_rule = TokenRule(
        'FOS', '[А-Я]+К+-+[0-9]+')  # no more than 3 letters, the last one 'К'
    tokenizer.remove_types('EOL', 'RU', 'PUNCT', 'OTHER', 'INT', 'LATIN')
    tokenizer.add_rules(fgos_rule)
    return list(tokenizer(text))
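
The rule keeps only FGOS competence codes: uppercase letters ending in 'К', a dash, then digits, as in 'УК-1' or 'ОПК-3'. A hypothetical call, where extractor stands in for whatever class defines the method:

# Hypothetical usage; the expected values follow from the pattern above.
tokens = extractor.token_fgos('выпускник осваивает УК-1 и ОПК-3')
print([t.value for t in tokens])  # ['УК-1', 'ОПК-3']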
Example #6
def __init__(self, rand_seed=42):
    self.execution_timer = Timer()
    self.environment_features = {
        'punct_between': 0,
        'preposition_between': 0,
        'preposition_before': 0,
        'conjunction_between': 0
    }
    # Pattern features counted twice (presumably once per token in a pair),
    # plus the context features above.
    self.feature_vector_size = len(feature_func_patterns) * 2 + len(
        self.environment_features)
    self.network = FeedforwardNetwork(self.feature_vector_size,
                                      2, (200, 50, 5),
                                      rand_seed=rand_seed)
    self.tokenizer = Tokenizer()
Example #8
def test_join_tokens():
    tokenizer = Tokenizer()
    tokens = tokenizer('pi =        3.14')
    assert join_tokens(tokens) == 'pi = 3.14'
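
join_tokens appears to rebuild the string from token spans: tokens that were adjacent in the source ('3', '.', '14') are joined directly, while any gap collapses to a single space. A hedged illustration of the same behavior:

tokens = tokenizer('2  +  2')
assert join_tokens(tokens) == '2 + 2'  # assumed: the run of spaces collapses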
Example #9
def tokenizer():
    # Presumably a pytest fixture in the original module (decorator not shown).
    return Tokenizer()
Example #10
def token_direction_of_preparation(self, text):
    # Matches degree-programme codes such as 09.03.01 (raw string, dots escaped).
    CODE_RULE = TokenRule('Code', r'\d{2}\.\d{2}\.\d{2}(?!\d)')
    tokenizer = Tokenizer()
    tokenizer.remove_types('EOL', 'LATIN', 'RU', 'INT', 'PUNCT', 'OTHER')
    tokenizer.add_rules(CODE_RULE)
    return list(tokenizer(text))
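
This mirrors the module-level script at the end of this listing: every built-in type is removed so only the programme-code tokens survive. A hypothetical call:

# Hypothetical usage; extractor stands in for the defining class.
tokens = extractor.token_direction_of_preparation('09.03.01 Информатика')
print([t.value for t in tokens])  # assumed output: ['09.03.01']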
Example #11
def __init__(self, typos=None):
    # Avoid a mutable default argument; fall back to an empty mapping.
    self.tokenizer = Tokenizer()
    self.typos = typos if typos is not None else {}
Example #12
def tokenize(string):
    from yargy.tokenizer import Tokenizer

    tokenizer = Tokenizer()
    return list(tokenizer(string))
import os

from yargy import Parser, rule, and_
from yargy.predicates import dictionary, is_title
from yargy.tokenizer import TokenRule, Tokenizer

baseDir = r'C:\Users\Katia\Desktop\рпд'


def GetDocuments():
    # Collect the .docx files found in the base directory.
    documents = []
    for filename in os.listdir(baseDir):
        if filename.endswith('.docx'):
            documents.append(filename)
    return documents


# Matches degree-programme codes such as 09.03.01 (raw string, dots escaped).
CODE_RULE = TokenRule('Code', r'\d{2}\.\d{2}\.\d{2}(?!\d)')
tokenizer = Tokenizer()
tokenizer.remove_types('EOL', 'LATIN', 'RU', 'INT', 'PUNCT', 'OTHER')
tokenizer.add_rules(CODE_RULE)

# 'рабочая программа' ('working programme') with a title-cased first word.
isRPD = rule(and_(dictionary({'рабочая'}), is_title()),
             dictionary({'программа'}))

isRPD2 = rule(dictionary({'дисциплина'}))

rpdRule = Parser(isRPD)

# print(rpdRule.find('Рабочая программа дисциплины'))

documents = GetDocuments()
path = os.path.join(baseDir, documents[0])
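
The script stops after building path; the original does not show how the document is read or parsed. A sketch of one plausible continuation using the python-docx package (an assumption; only the .docx extension check above hints at it):

# Sketch only: reading the .docx and running the parser are not in the original.
from docx import Document  # pip install python-docx

doc = Document(path)
text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
for match in rpdRule.findall(text):
    print([token.value for token in match.tokens])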