def test_change_rules():
    tokenizer = Tokenizer().add_rules(EMAIL_RULE)
    values = tokenizer.split('mailto:[email protected]')
    assert values == ['mailto', ':', '*****@*****.**']

    tokenizer = Tokenizer().remove_types(EOL)
    text = """
hi,
the
"""
    values = tokenizer.split(text)
    assert values == ['hi', ',', 'the']
def test_check_type():
    tokenizer = Tokenizer()
    with pytest.raises(ValueError):
        tokenizer.check_type('UNK')

    tokenizer.remove_types(EOL)
    with pytest.raises(ValueError):
        tokenizer.check_type(EOL)
def test_types():
    tokenizer = Tokenizer()
    tokens = list(tokenizer('Ростов-на-Дону'))
    assert tokens == [
        Token('Ростов', (0, 6), RUSSIAN),
        Token('-', (6, 7), PUNCT),
        Token('на', (7, 9), RUSSIAN),
        Token('-', (9, 10), PUNCT),
        Token('Дону', (10, 14), RUSSIAN)
    ]

    tokens = list(tokenizer('vk.com'))
    assert tokens == [
        Token('vk', (0, 2), LATIN),
        Token('.', (2, 3), PUNCT),
        Token('com', (3, 6), LATIN)
    ]

    tokens = list(tokenizer('1 500 000$'))
    assert tokens == [
        Token('1', (0, 1), INT),
        Token('500', (2, 5), INT),
        Token('000', (6, 9), INT),
        Token('$', (9, 10), PUNCT)
    ]

    tokens = list(tokenizer('π'))
    assert tokens == [Token('π', (0, 1), OTHER)]
def token_fgos(self, text):
    tokenizer = Tokenizer()
    fgos_rule = TokenRule(
        'FOS',
        '[А-Я]+К+-+[0-9]+')  # no more than 3 letters, the last one is К
    tokenizer.remove_types('EOL', 'RU', 'PUNCT', 'OTHER', 'INT', 'LATIN')
    tokenizer.add_rules(fgos_rule)
    return list(tokenizer(text))
def __init__(self, rand_seed=42):
    self.execution_timer = Timer()
    self.environment_features = {
        'punct_between': 0,
        'preposition_between': 0,
        'preposition_before': 0,
        'conjunction_between': 0
    }
    # two features per pattern plus the context (environment) features
    self.feature_vector_size = len(feature_func_patterns) * 2 + len(
        self.environment_features)
    self.network = FeedforwardNetwork(self.feature_vector_size, 2, (200, 50, 5),
                                      rand_seed=rand_seed)
    self.tokenizer = Tokenizer()
def test_join_tokens():
    tokenizer = Tokenizer()
    tokens = tokenizer('pi = 3.14')
    assert join_tokens(tokens) == 'pi = 3.14'
def tokenizer():
    return Tokenizer()
def token_direction_of_preparation(self, text):
    # matches programme codes in the XX.XX.XX format (dots escaped, raw string)
    CODE_RULE = TokenRule('Code', r'\d{2}\.\d{2}\.\d{2}(?!\d)')
    tokenizer = Tokenizer()
    tokenizer.remove_types('EOL', 'LATIN', 'RU', 'INT', 'PUNCT', 'OTHER')
    tokenizer.add_rules(CODE_RULE)
    return list(tokenizer(text))
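# A minimal standalone sketch (not part of the original class) of what the method
# above is assumed to do: with all default token types removed and only the custom
# rule left, the tokenizer should yield just the XX.XX.XX codes found in the text.
# The sample string and the expected output are assumptions.
from yargy.tokenizer import Tokenizer, TokenRule

code_rule = TokenRule('Code', r'\d{2}\.\d{2}\.\d{2}(?!\d)')
code_tokenizer = Tokenizer()
code_tokenizer.remove_types('EOL', 'LATIN', 'RU', 'INT', 'PUNCT', 'OTHER')
code_tokenizer.add_rules(code_rule)

tokens = list(code_tokenizer('направление подготовки 09.03.01 Информатика'))
print([token.value for token in tokens])  # expected (assumption): ['09.03.01']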
def __init__(self, typos=None):
    self.tokenizer = Tokenizer()
    # avoid a mutable default argument; fall back to an empty mapping
    self.typos = typos if typos is not None else {}
def tokenize(string):
    from yargy.tokenizer import Tokenizer

    tokenizer = Tokenizer()
    return list(tokenizer(string))
import os

from yargy import Parser, rule, and_
from yargy.predicates import dictionary, is_title
from yargy.tokenizer import TokenRule, Tokenizer


def GetDocuments():
    documents = []
    for filename in os.listdir(baseDir):
        if filename.endswith('.docx'):
            documents.append(filename)
    return documents


# matches programme codes in the XX.XX.XX format (dots escaped, raw string)
CODE_RULE = TokenRule('Code', r'\d{2}\.\d{2}\.\d{2}(?!\d)')
tokenizer = Tokenizer()
tokenizer.remove_types('EOL', 'LATIN', 'RU', 'INT', 'PUNCT', 'OTHER')
tokenizer.add_rules(CODE_RULE)

isRPD = rule(and_(dictionary({'рабочая'}), is_title()),
             dictionary({'программа'}))
isRPD2 = rule(dictionary({'дисциплина'}))
rpdRule = Parser(isRPD)
# print(rpdRule.find('Рабочая программа дисциплины'))

baseDir = r'C:\Users\Katia\Desktop\рпд'
documents = GetDocuments()
path = baseDir + '\\' + documents[0]
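# A hedged sketch (not in the original script) of how `path`, `tokenizer` and
# `rpdRule` might be used next: read the first .docx with python-docx (assumed to
# be installed) and scan each paragraph for programme codes and for the phrase
# matched by isRPD. The printed layout is an assumption, not the author's output.
from docx import Document

doc = Document(path)
for paragraph in doc.paragraphs:
    text = paragraph.text
    codes = [token.value for token in tokenizer(text)]  # XX.XX.XX codes, if any
    match = rpdRule.find(text)                          # 'Рабочая программа ...'
    if codes or match:
        print(text, codes, match is not None)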