Example #1
def test_checks():
    tokenizer = MorphTokenizer()
    with pytest.raises(ValueError):
        gram('UNK').activate(tokenizer)

    with pytest.raises(ValueError):
        custom(lambda _: True, types='UNK').activate(tokenizer)
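For contrast, a minimal sketch of a valid activation (same API as the test above; 'NOUN' is a real pymorphy2 grammeme):

from yargy.predicates import gram
from yargy.tokenizer import MorphTokenizer

tokenizer = MorphTokenizer()
noun = gram('NOUN').activate(tokenizer)  # succeeds: the grammeme is known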
Example #2
    def __init__(self, logger=None, env='local'):

        self.env = env

        if logger is None:
            self.logger = logging.getLogger("OGRNExtractor")
            self.logger.setLevel(logging.DEBUG)
            handler = RotatingFileHandler("ogrn_extractor.log",
                                          mode='a',
                                          encoding='utf-8',
                                          backupCount=5,
                                          maxBytes=1 * 1024 * 1024)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        else:
            self.logger = logger

        self.tokenizer = MorphTokenizer()

        OGRN = morph_pipeline([
            'огрн', 'основной государственный регистрационный номер', 'огрнип'
        ])

        INT = type('INT')

        OGRN_NUMBER = rule(OGRN, INT)

        self.full_ogrn_parser = Parser(OGRN_NUMBER)
        self.ogrn_num_parser = Parser(rule(INT))
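A hypothetical run of the parsers built above (the OGRNExtractor class name and the sample number are assumptions, not part of the original):

extractor = OGRNExtractor()
for match in extractor.full_ogrn_parser.findall('ОГРН 1027700132195'):
    # each match covers the keyword plus the trailing integer token
    print([token.value for token in match.tokens])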
Example #3
    def __init__(self, rule, morph):
        # morph wraps a pymorphy2 analyzer subclass that adds
        # check_gram/normalized helpers and a cached parse method
        morph = MorphAnalyzer(morph)

        tokenizer = MorphTokenizer(morph=morph)
        YargyParser.__init__(self, rule, tokenizer=tokenizer)
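The subclass above only pre-builds a tokenizer around a custom morph; a sketch of the equivalent wiring with plain yargy objects:

from yargy import Parser, rule
from yargy.predicates import gram
from yargy.tokenizer import MorphTokenizer

# Parser takes an explicit tokenizer, just like the YargyParser.__init__ call above
parser = Parser(rule(gram('NOUN')), tokenizer=MorphTokenizer())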
Example #4
def prepare_for_dataset(sentences, create_function):
    tokenizer = MorphTokenizer()
    sentences = [
        tokenize_sentence(tokenizer, sentence) for sentence in sentences
    ]
    max_len = max([len(i) for i in sentences])
    words = list(set([word.lower() for sent in sentences for word in sent]))
    dataset, id2word, word2id = create_function(words, sentences)
    return dataset, id2word, word2id, max_len
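tokenize_sentence is not shown above; a minimal sketch of what it plausibly does, assuming it returns one string per token (the lower() call above expects strings):

def tokenize_sentence(tokenizer, sentence):
    # hypothetical helper: surface form of each token
    return [token.value for token in tokenizer(sentence)]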
Example #5
    def __init__(self, logger=None, env='local'):

        self.env = env

        if logger is None:
            self.logger = logging.getLogger("AdsExtractor")
            self.logger.setLevel(logging.DEBUG)
            handler = RotatingFileHandler("ads_extractor.log", mode='a', encoding='utf-8', backupCount=5,
                                     maxBytes=1 * 1024 * 1024)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        else:
            self.logger = logger

        self.texttools = texttools.TextTools(self.logger)

        self.tokenizer = MorphTokenizer()
        self.morph = pymorphy2.MorphAnalyzer()

        EXCLUDE = morph_pipeline([
            'без',
            'не',
            'вправе отказаться',
            'может отказаться',
            'услуга'
        ])

        AGREEMENT = morph_pipeline([
            'соглашаться с получением'
        ])

        SUBJECT = morph_pipeline([
            'рассылка',
            'предложение'
        ])

        KIND = morph_pipeline([
            'рекламный'
        ])

        SPECIALS = morph_pipeline([
            'рекламная цель'
        ])

        ADS = or_(
            rule(KIND, SUBJECT),
            rule(SUBJECT, KIND),
            or_(SPECIALS, AGREEMENT)
        )

        self.ads_parser = Parser(ADS)
        self.exclude_parser = Parser(rule(EXCLUDE))
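A hypothetical check combining the two parsers (the AdsExtractor class name and sample text are assumptions):

extractor = AdsExtractor()
text = 'вправе отказаться от получения рекламной рассылки'
is_ads = any(True for _ in extractor.ads_parser.findall(text))
is_excluded = any(True for _ in extractor.exclude_parser.findall(text))
print(is_ads, is_excluded)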
Example #6
def test_predicate():
    tokenizer = MorphTokenizer()
    predicate = or_(normalized('московским'),
                    and_(gram('NOUN'), not_(gram('femn'))))
    predicate = predicate.activate(tokenizer)

    tokens = tokenizer('московский зоопарк')
    values = [predicate(_) for _ in tokens]
    assert values == [True, True]

    tokens = tokenizer('московская погода')
    values = [predicate(_) for _ in tokens]
    assert values == [True, False]
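The same predicate can drive a grammar; a minimal sketch wrapping it in a rule and running the standard Parser:

from yargy import Parser, rule, and_, or_, not_
from yargy.predicates import gram, normalized

predicate = or_(normalized('московским'),
                and_(gram('NOUN'), not_(gram('femn'))))
parser = Parser(rule(predicate))
print([m.tokens[0].value for m in parser.findall('московский зоопарк')])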
Example #7
def test_activate():
    from yargy.pipelines import pipeline
    from yargy.predicates import gram
    from yargy.tokenizer import MorphTokenizer

    tokenizer = MorphTokenizer()

    A = pipeline(['a']).named('A')
    B = A.activate(tokenizer)
    assert_bnf(B, 'A -> pipeline')

    A = rule(gram('NOUN')).named('A')
    B = A.activate(tokenizer)
    assert_bnf(B, "A -> gram('NOUN')")
Example #8
def test_morph():
    tokenizer = MorphTokenizer()
    tokens = list(tokenizer('dvd-диски'))
    assert tokens == [
        Token('dvd', (0, 3), LATIN),
        Token('-', (3, 4), PUNCT),
        MorphToken('диски', (4, 9),
                   RUSSIAN,
                   forms=[
                       Form('диск',
                            Grams({'NOUN', 'inan', 'masc', 'nomn', 'plur'})),
                       Form('диск',
                            Grams({'NOUN', 'accs', 'inan', 'masc', 'plur'}))
                   ])
    ]
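Each Form pairs a normal form with a grammeme set; a short sketch inspecting them directly (attribute names as in the test above):

tokenizer = MorphTokenizer()
for token in tokenizer('диски'):
    for form in token.forms:
        print(form.normalized, form.grams)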
Example #9
    def __init__(self, logger=None, env='local'):

        self.env = env

        if logger is None:
            self.logger = logging.getLogger("ThirdPartyExtractor")
            self.logger.setLevel(logging.DEBUG)
            handler = RotatingFileHandler("thirdparty_extractor.log",
                                          mode='a',
                                          encoding='utf-8',
                                          backupCount=5,
                                          maxBytes=1 * 1024 * 1024)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        else:
            self.logger = logger

        self.texttools = texttools.TextTools(self.logger)

        self.tokenizer = MorphTokenizer()
        self.morph = pymorphy2.MorphAnalyzer()

        EXCLUDE = morph_pipeline(['не передавать'])

        SUBJECT = morph_pipeline(
            ['передача третьим лицам', 'поручать аффилированным лицам'])

        SPECIALS = morph_pipeline([
            # 'рекламныя цель',
            # 'получение сообщений',
            # 'рассылка',
            # 'предложение услуг',
            # 'продвижение товаров',
            # 'продвижение услуг'
        ])

        ADS = or_(rule(SUBJECT), rule(SPECIALS))

        self.thirdp_parser = Parser(ADS)
        self.exclude_parser = Parser(rule(EXCLUDE))
Example #10
def get_tokenizer():
    from yargy.tokenizer import MorphTokenizer
    return MorphTokenizer()
Example #11
    for span in tagged_text.spans:
        if span.type in ('ORG', 'PER'):
            name = span.text
            name = re.sub(r'[\n\r\t\x0c]+', ' ', name)
            orgnames.add(name)

    return orgnames


class IdTokenizer(Tokenizer):
    def __init__(self, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer

    def split(self, text):
        return self.tokenizer.split(text)

    def check_type(self, type):
        return self.tokenizer.check_type(type)

    @property
    def morph(self):
        return self.tokenizer.morph

    def __call__(self, tokens):
        return tokens


TOKENIZER = MorphTokenizer().remove_types(EOL)
ID_TOKENIZER = IdTokenizer(TOKENIZER)
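IdTokenizer.__call__ returns its input unchanged, so a parser built on ID_TOKENIZER can reuse tokens produced once by TOKENIZER instead of re-tokenizing raw text. A sketch of that pattern (the NOUN rule is a placeholder assumption):

from yargy import Parser, rule
from yargy.predicates import gram

tokens = list(TOKENIZER('московский зоопарк'))
parser = Parser(rule(gram('NOUN')), tokenizer=ID_TOKENIZER)
matches = list(parser.findall(tokens))  # the "text" is already a token list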
Example #12
        syns = idx2syns[el[0]]
        for child in cohyps:
            for parent in json.loads(prestr(el[2])):
                ed = g.add_edge(child, idx2syns[parent], label="is a")

    plt.figure(figsize=(15, 15))
    pos = nx.nx_agraph.graphviz_layout(g)
    nx.draw(g, with_labels=True, pos=pos)
    #     edge_labels=nx.draw_networkx_edge_labels(g,pos=pos)
    plt.show()


button.on_click(graphdraw)

from yargy.tokenizer import MorphTokenizer
tokenizer = MorphTokenizer()
text = '''Ростов-на-Дону
Длительностью 18ч. 10мин.
Яндекс.Такси
π ≈ 3.1415
1 500 000$
http://vk.com
'''
for line in text.splitlines():
    print([_.value for _ in tokenizer(line)])

from yargy import or_, rule
from yargy.predicates import normalized

RULE = or_(
    rule(normalized('dvd'), '-', normalized('диск'))
)
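The original example breaks off here; a minimal sketch of running the rule, assuming the standard Parser API:

from yargy import Parser

parser = Parser(RULE)
for match in parser.findall('купил dvd-диск'):
    print([token.value for token in match.tokens])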
Example #13
    def __init__(self, logger=None, env='local'):

        self.env = env

        if logger is None:
            self.logger = logging.getLogger("LegalEntitiesExtractor")
            self.logger.setLevel(logging.DEBUG)
            handler = RotatingFileHandler("legal_entities_extractor.log",
                                          mode='a',
                                          encoding='utf-8',
                                          backupCount=5,
                                          maxBytes=1 * 1024 * 1024)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        else:
            self.logger = logger

        self.tokenizer = MorphTokenizer()
        self.morph = pymorphy2.MorphAnalyzer()

        self.NOUNS_TO_NORMALIZE = [
            'общество', 'объединение', 'учреждение', 'предприятие',
            'департамент', 'организация', 'союз', 'центр'
        ]
        self.ADJ_TO_NORMALIZE_TO_NEUT = [
            'акционерный', 'публичный', 'музейный', 'государственный',
            'казенный', 'казённый', 'унитарный'
        ]

        # LegalName = fact('LegalName', ['shortname', 'fullname'])
        # LegalForm = fact('LegalForm', ['shortform', 'fullform'])
        # LegalEntity = fact('LegalEntity', ['LegalForm', 'LegalName'])

        LEGAL_FORM_FULL = morph_pipeline([
            'общество с ограниченной ответственностью', 'акционерное общество',
            'закрытое акционерное общество', 'открытое акционерное общество',
            'акционерное общество управляющая компания',
            'управляющая компания', 'публичное акционерное общество',
            'музейное объединение', 'государственное казенное учреждение',
            'государственное унитарное предприятие', 'департамент'
        ])

        LEGAL_FORM_SHORT = morph_pipeline(['ПАО', 'ЗАО', 'ОАО', 'АО', 'ООО'])

        LEGAL_FORM = or_(LEGAL_FORM_SHORT, LEGAL_FORM_FULL)

        OPEN_QUOTE = or_(eq('\"'), eq('«'), eq('\''))
        CLOSE_QUOTE = or_(eq('\"'), eq('»'), eq('\''))

        INT = type('INT')
        LATIN = type('LATIN')
        FULL_NAME_SYMBOLS = or_(eq('&'), OPEN_QUOTE)
        SHORT_NAME_SYMBOLS = or_(eq('+'), eq('!'), eq('№'))
        LATIN_NAME_SYMBOLS = or_(eq('.'), eq('&'))

        GEO_TAG = rule(gram('NOUN'), gram('Geox'))

        WORD_IN_NAME = or_(gram('NOUN'), gram('ADJF'), gram('ADJS'))

        WORD_NOT_IN_SHORT_NAME = or_(eq('ИНН'), eq('ОГРН'))

        WORD_IN_SHORT_NAME = or_(gram('NOUN'), gram('ADJF'))

        WORD_IN_SHORT_NAME_FINAL = and_(WORD_IN_SHORT_NAME,
                                        not_(WORD_NOT_IN_SHORT_NAME))

        WORD_IN_LATIN_NAME = or_(LATIN, LATIN_NAME_SYMBOLS)

        LATIN_NAME = rule(WORD_IN_LATIN_NAME.repeatable(min=2))

        FULL_LEGAL_ENTITY = rule(LEGAL_FORM, GEO_TAG.optional(), OPEN_QUOTE,
                                 WORD_IN_NAME.repeatable(), CLOSE_QUOTE)
        SIMPLE_LEGAL_ENTITY = rule(LEGAL_FORM_SHORT, WORD_IN_SHORT_NAME_FINAL)
        GOV_ENTITY = rule(LEGAL_FORM_FULL,
                          WORD_IN_SHORT_NAME.repeatable(min=1))

        LEGAL_ENTITY = or_(FULL_LEGAL_ENTITY, SIMPLE_LEGAL_ENTITY, GOV_ENTITY)

        self.full_legal_parser = Parser(LEGAL_ENTITY)
        self.legal_form_parser = Parser(LEGAL_FORM)
        self.legal_latin_parser = Parser(LATIN_NAME)
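A hypothetical run of the entity parser (the LegalEntitiesExtractor class name and sample string are assumptions):

extractor = LegalEntitiesExtractor()
for match in extractor.full_legal_parser.findall('ООО "Ромашка" заключило договор'):
    print([token.value for token in match.tokens])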
Example #14
import re

from natasha import MorphVocab  # MorphVocab lives in the natasha package
from yargy import (
    Parser,
    or_, rule
)
from yargy.tokenizer import MorphTokenizer
from yargy.predicates import (
    eq, in_, dictionary,
    type, gram
)

INT = type('INT')
NOUN = gram('NOUN')
ADJF = gram('ADJF')
PRTF = gram('PRTF')
GENT = gram('gent')
NUMR = gram('NUMR')
DOT = eq('.')

TOKENIZER = MorphTokenizer()
morph_vocab = MorphVocab()
parser = Parser(DATE)
#dates_extractor = DATE(morph_vocab)

line = ' за квартал '
split_on_date = re.split(r'с |по | до ', line)
for split in split_on_date:
    date = ExtractDate()
    matches = parser.extract(split)
    viz(date, split, matches, len(split_on_date))

str_num = 1
for line in all_tests:
Example #15
from yargy.interpretation import fact, attribute
from yargy.relations import gnc_relation
from yargy.tokenizer import MorphTokenizer
from yargy.utils import Record

Record.means = lambda self, *args, **kwargs: self.interpretation(
    *args, **kwargs)

TOKENIZER = MorphTokenizer()  # todo move to notebook
gnc = gnc_relation()

Array = fact('Array', [attribute('element').repeatable()])
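A minimal sketch of filling the Array fact through the .means alias defined above (the INT item rule is an assumption):

from yargy import Parser, rule
from yargy.predicates import type

INT = type('INT')
ITEM = rule(INT).means(Array.element)   # each match appends to Array.element
ITEMS = ITEM.repeatable().means(Array)  # repeatable attribute collects a list

parser = Parser(ITEMS, tokenizer=TOKENIZER)
match = parser.match('1 2 3')
print(match.fact)  # expected: Array(element=['1', '2', '3'])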
Example #16
    def __init__(self):
        self.analyzer = pmh.MorphAnalyzer()
        self.price_rules = [PRICE_FROM, PRICE_TO]
        self.tokenizer = MorphTokenizer()
        self.dict = dict()