def test_checks():
    tokenizer = MorphTokenizer()
    with pytest.raises(ValueError):
        gram('UNK').activate(tokenizer)
    with pytest.raises(ValueError):
        custom(lambda _: True, types='UNK').activate(tokenizer)
def __init__(self, logger=None, env='local'): self.env = env if logger is None: self.logger = logging.getLogger("OGRNExtractor") self.logger.setLevel(logging.DEBUG) handler = RotatingFileHandler("ogrn_extractor.log", mode='a', encoding='utf-8', backupCount=5, maxBytes=1 * 1024 * 1024) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) self.logger.addHandler(handler) else: self.logger = logger self.tokenizer = MorphTokenizer() OGRN = morph_pipeline([ 'огрн', 'основной государственный регистрационный номер', 'огрнип' ]) INT = type('INT') OGRN_NUMBER = rule(OGRN, INT) self.full_ogrn_parser = Parser(OGRN_NUMBER) self.ogrn_num_parser = Parser(rule(INT))
def __init__(self, rule, morph):
    # Wraps a pymorphy subclass that adds check_gram and normalized
    # helpers and uses a cached parse method.
    morph = MorphAnalyzer(morph)
    tokenizer = MorphTokenizer(morph=morph)
    YargyParser.__init__(self, rule, tokenizer=tokenizer)
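
# Hedged sketch (not the actual implementation referenced above): a pymorphy2
# wrapper of the kind the comment describes — a cached parse() plus
# check_gram/normalized helpers. The class name and methods are hypothetical.
from functools import lru_cache
import pymorphy2

class CachedMorph(pymorphy2.MorphAnalyzer):
    @lru_cache(maxsize=None)
    def parse(self, word):
        # cache morphological analyses per word form
        return super().parse(word)

    def check_gram(self, word, grammeme):
        # True if any analysis of the word carries the grammeme
        return any(grammeme in p.tag for p in self.parse(word))

    def normalized(self, word):
        # normal form of the most probable analysis
        return self.parse(word)[0].normal_form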
def prepare_for_dataset(sentences, create_function):
    tokenizer = MorphTokenizer()
    sentences = [
        tokenize_sentence(tokenizer, sentence)
        for sentence in sentences
    ]
    max_len = max(len(sentence) for sentence in sentences)
    words = list({word.lower() for sent in sentences for word in sent})
    dataset, id2word, word2id = create_function(words, sentences)
    return dataset, id2word, word2id, max_len
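
# tokenize_sentence is not defined in this snippet; a minimal hypothetical
# sketch consistent with how it is called above (one list of word strings
# per sentence):
def tokenize_sentence(tokenizer, sentence):
    # MorphTokenizer is callable and yields token objects with a .value
    return [token.value for token in tokenizer(sentence)]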
def __init__(self, logger = None, env = 'local'): self.env = env if logger is None: self.logger = logging.getLogger("AdsExtractor") self.logger.setLevel(logging.DEBUG) handler = RotatingFileHandler("ads_extractor.log", mode='a', encoding='utf-8', backupCount=5, maxBytes=1 * 1024 * 1024) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) self.logger.addHandler(handler) else: self.logger = logger self.texttools = texttools.TextTools(self.logger) self.tokenizer = MorphTokenizer() self.morph = pymorphy2.MorphAnalyzer() EXCLUDE = morph_pipeline([ 'без', 'не', 'вправе отказаться', 'может отказаться', 'услуга' ]) AGREEMENT = morph_pipeline([ 'соглашаться с получением' ]) SUBJECT = morph_pipeline([ 'рассылка', 'предложение' ]) KIND = morph_pipeline([ 'рекламный' ]) SPECIALS = morph_pipeline([ 'рекламныя цель' ]) ADS = or_( rule(KIND, SUBJECT), rule(SUBJECT, KIND), or_(SPECIALS, AGREEMENT) ) self.ads_parser = Parser(ADS) self.exclude_parser = Parser(rule(EXCLUDE))
def test_predicate():
    tokenizer = MorphTokenizer()
    predicate = or_(
        normalized('московским'),
        and_(gram('NOUN'), not_(gram('femn')))
    )
    predicate = predicate.activate(tokenizer)

    tokens = tokenizer('московский зоопарк')
    values = [predicate(_) for _ in tokens]
    assert values == [True, True]

    tokens = tokenizer('московская погода')
    values = [predicate(_) for _ in tokens]
    assert values == [True, False]
def test_activate():
    from yargy.pipelines import pipeline
    from yargy.predicates import gram
    from yargy.tokenizer import MorphTokenizer

    tokenizer = MorphTokenizer()

    A = pipeline(['a']).named('A')
    B = A.activate(tokenizer)
    assert_bnf(B, 'A -> pipeline')

    A = rule(gram('NOUN')).named('A')
    B = A.activate(tokenizer)
    assert_bnf(B, "A -> gram('NOUN')")
def test_morph():
    tokenizer = MorphTokenizer()
    tokens = list(tokenizer('dvd-диски'))
    assert tokens == [
        Token('dvd', (0, 3), LATIN),
        Token('-', (3, 4), PUNCT),
        MorphToken('диски', (4, 9), RUSSIAN, forms=[
            Form('диск', Grams({'NOUN', 'inan', 'masc', 'nomn', 'plur'})),
            Form('диск', Grams({'NOUN', 'accs', 'inan', 'masc', 'plur'}))
        ])
    ]
def __init__(self, logger=None, env='local'): self.env = env if logger is None: self.logger = logging.getLogger("ThirdPartyExtractor") self.logger.setLevel(logging.DEBUG) handler = RotatingFileHandler("thirdparty_extractor.log", mode='a', encoding='utf-8', backupCount=5, maxBytes=1 * 1024 * 1024) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) self.logger.addHandler(handler) else: self.logger = logger self.texttools = texttools.TextTools(self.logger) self.tokenizer = MorphTokenizer() self.morph = pymorphy2.MorphAnalyzer() EXCLUDE = morph_pipeline(['не передавать']) SUBJECT = morph_pipeline( ['передача третьим лицам', 'поручать аффилированным лицам']) SPECIALS = morph_pipeline([ # 'рекламныя цель', # 'получение сообщений', # 'рассылка', # 'предложение услуг', # 'продвижение товаров', # 'продвижение услуг' ]) ADS = or_(rule(SUBJECT), rule(SPECIALS)) self.thirdp_parser = Parser(ADS) self.exclude_parser = Parser(rule(EXCLUDE))
def get_tokenizer():
    from yargy.tokenizer import MorphTokenizer
    return MorphTokenizer()
    for span in tagged_text.spans:
        if span.type in ('ORG', 'PER'):
            name = span.text
            name = re.sub(r'[\n\r\t\x0c]+', ' ', name)
            orgnames.add(name)
    return orgnames


class IdTokenizer(Tokenizer):
    def __init__(self, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer

    def split(self, text):
        return self.tokenizer.split(text)

    def check_type(self, type):
        return self.tokenizer.check_type(type)

    @property
    def morph(self):
        return self.tokenizer.morph

    def __call__(self, tokens):
        # Identity pass: input is already tokenized, return it unchanged
        return tokens


TOKENIZER = MorphTokenizer().remove_types(EOL)
ID_TOKENIZER = IdTokenizer(TOKENIZER)
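
# Hedged sketch (not in the original): ID_TOKENIZER lets several parsers
# share one tokenization pass — the text is tokenized once with TOKENIZER,
# and the resulting tokens are handed to a Parser built on ID_TOKENIZER,
# whose __call__ returns them unchanged.
from yargy import Parser, rule
from yargy.predicates import type

text = 'Договор от 2020 года на сумму 1500 рублей'
tokens = list(TOKENIZER(text))                 # tokenize once
parser = Parser(rule(type('INT')), tokenizer=ID_TOKENIZER)
for match in parser.findall(tokens):           # reuse the same tokens
    print([_.value for _ in match.tokens])     # ['2020'], then ['1500']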
    syns = idx2syns[el[0]]
    for child in cohyps:
        for parent in json.loads(prestr(el[2])):
            ed = g.add_edge(child, idx2syns[parent], label="is a")
    plt.figure(figsize=(15, 15))
    pos = nx.nx_agraph.graphviz_layout(g)
    nx.draw(g, with_labels=True, pos=pos)
    # edge_labels = nx.draw_networkx_edge_labels(g, pos=pos)
    plt.show()

button.on_click(graphdraw)

from yargy.tokenizer import MorphTokenizer

tokenizer = MorphTokenizer()
text = '''Ростов-на-Дону
Длительностью 18ч. 10мин.
Яндекс.Такси
π ≈ 3.1415
1 500 000$
http://vk.com
'''
for line in text.splitlines():
    print([_.value for _ in tokenizer(line)])

from yargy import or_, rule
from yargy.predicates import normalized

RULE = or_(
    rule(normalized('dvd'), '-', normalized('диск')),
def __init__(self, logger=None, env='local'): self.env = env if logger is None: self.logger = logging.getLogger("LegalEntitiesExtractor") self.logger.setLevel(logging.DEBUG) handler = RotatingFileHandler("legal_entities_extractor.log", mode='a', encoding='utf-8', backupCount=5, maxBytes=1 * 1024 * 1024) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) self.logger.addHandler(handler) else: self.logger = logger self.tokenizer = MorphTokenizer() self.morph = pymorphy2.MorphAnalyzer() self.NOUNS_TO_NORMALIZE = [ 'общество', 'объединение', 'учреждение', 'предприятие', 'департамент', 'организация', 'союз', 'центр' ] self.ADJ_TO_NORMALIZE_TO_NEUT = [ 'акционерный', 'публичный', 'музейный', 'государственный', 'казенный', 'казённый', 'унитарный' ] # LegalName = fact('LegalName', ['shortname', 'fullname']) # LegalForm = fact('LegalForm', ['shortform', 'fullform']) # LegalEnity = fact('LegalEnity', ['LegalForm','LegalName']) LEGAL_FORM_FULL = morph_pipeline([ 'общество с ограниченной ответственностью', 'акционерное общество', 'закрытое акционерное общество', 'открытое акционерное общество', 'акционерное общество управляющая компания', 'управляющая компания', 'публичное акционерное общество', 'музейное объединение', 'государственное казенное учреждение', 'государственное унитарное предприятие', 'департамент' ]) LEGAL_FORM_SHORT = morph_pipeline(['ПАО', 'ЗАО', 'ОАО', 'АО', 'ООО']) LEGAL_FORM = or_(LEGAL_FORM_SHORT, LEGAL_FORM_FULL) OPEN_QUOTE = or_(eq('\"'), eq('«'), eq('\'')) CLOSE_QUOTE = or_(eq('\"'), eq('»'), eq('\'')) INT = type('INT') LATIN = type('LATIN') FULL_NAME_SIMBOLS = or_(eq('&'), OPEN_QUOTE) SHORT_NAME_SIMBOLS = or_(eq('+'), eq('!'), eq('№')) LATIN_NAME_SIMBOLS = or_(eq('.'), eq('&')) GEO_TAG = rule(gram('NOUN'), gram('Geox')) WORD_IN_NAME = or_(gram('NOUN'), gram('ADJF'), gram('ADJS')) WORD_NOT_IN_SHORT_NAME = or_(eq('ИНН'), eq('ОГРН')) WORD_IN_SHORT_NAME = or_(gram('NOUN'), gram('ADJF')) WORD_IN_SHORT_NAME_FINAL = and_(WORD_IN_SHORT_NAME, not_(WORD_NOT_IN_SHORT_NAME)) WORD_IN_LATIN_NAME = or_(LATIN, LATIN_NAME_SIMBOLS) LATIN_NAME = rule(WORD_IN_LATIN_NAME.repeatable(min=2)) FULL_LEGAL_ENTITY = rule(LEGAL_FORM, GEO_TAG.optional(), OPEN_QUOTE, WORD_IN_NAME.repeatable(), CLOSE_QUOTE) SIMPLE_LEGAL_ENTITY = rule(LEGAL_FORM_SHORT, WORD_IN_SHORT_NAME_FINAL) GOV_ENTITY = rule(LEGAL_FORM_FULL, WORD_IN_SHORT_NAME.repeatable(min=1)) LEGAL_ENTITY = or_(FULL_LEGAL_ENTITY, SIMPLE_LEGAL_ENTITY, GOV_ENTITY) self.full_legal_parser = Parser(LEGAL_ENTITY) self.legal_form_parser = Parser(LEGAL_FORM) self.legal_latin_parser = Parser(LATIN_NAME)
    or_,
    rule
)
from yargy.predicates import (
    eq,
    in_,
    dictionary,
    type,
    gram
)

INT = type('INT')
NOUN = gram('NOUN')
ADJF = gram('ADJF')
PRTF = gram('PRTF')
GENT = gram('gent')
NUMR = gram('NUMR')
DOT = eq('.')

TOKENIZER = MorphTokenizer()
morph_vocab = MorphVocab()

parser = Parser(DATE)
# dates_extractor = DATE(morph_vocab)

line = ' за квартал '
split_on_date = re.split(r'с |по | до ', line)
for split in split_on_date:
    date = ExtractDate()
    matches = parser.extract(split)
    viz(date, split, matches, len(split_on_date))

import re

str_num = 1
for line in all_tests:
from yargy.interpretation import fact, attribute
from yargy.relations import gnc_relation
from yargy.tokenizer import MorphTokenizer
from yargy.utils import Record

# Alias: .means(...) delegates to .interpretation(...)
Record.means = lambda self, *args, **kwargs: self.interpretation(
    *args, **kwargs)

TOKENIZER = MorphTokenizer()  # todo move to notebook

gnc = gnc_relation()
Array = fact('Array', [attribute('element').repeatable()])
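
# Hedged sketch (not in the original): minimal use of the Array fact and the
# .means alias defined above — repeated INT tokens are collected into the
# repeatable 'element' attribute. NUMBERS is a name introduced here.
from yargy import Parser, rule
from yargy.predicates import type

INT = type('INT')
NUMBERS = rule(
    INT.means(Array.element)
).repeatable().means(Array)

parser = Parser(NUMBERS, tokenizer=TOKENIZER)
match = parser.match('1 2 3')
print(match.fact)  # e.g. Array(element=['1', '2', '3'])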
def __init__(self):
    self.analyzer = pmh.MorphAnalyzer()
    self.price_rules = [PRICE_FROM, PRICE_TO]
    self.tokenizer = MorphTokenizer()
    self.dict = dict()