Example #1
0
def get_rules():
    """Build the yargy rule that matches city names.

    Matches either a geographic entity (optionally preceded by a city
    designator) or the literal 'санкт-петербург'.

    Returns:
        A yargy rule suitable for ``Parser(...)``.
    """
    # Geographic token, excluding words that the morphology tags as
    # Geox but that this corpus uses as street/person names.
    GEO = rule(
        and_(
            gram('Geox'),
            not_(
                or_(
                    eq('артема'),
                    eq('фармана'),
                    eq('оскол'),
                    eq('мунарева'),
                ))))

    # Designators/words that may precede the geographic name proper.
    CITY = morph_pipeline(['город', 'Нижний', 'новгород'])

    # 'Санкт-Петербург' tokenizes into three tokens; match it explicitly.
    CITY_PITER = rule(eq('санкт'), eq('-'), eq('петербург'))

    COMPLICATED_CITY = or_(rule(CITY.optional(), GEO), CITY_PITER)

    return or_(COMPLICATED_CITY)
Example #2
0
def get_second_rules():
    """Build the yargy rule that matches (runs of) surnames.

    A surname is either a singular-only animate word that is not tagged
    as a first name or patronymic (minus a small stop list), or one of
    the explicitly whitelisted surnames.

    Returns:
        A yargy rule suitable for ``Parser(...)``.
    """
    ANIM = gram('anim')
    SGTM = gram('Sgtm')
    PATR = gram('Patr')
    NAME = gram('Name')

    # Heuristic surname: singular-only animate word that is not a first
    # name or patronymic; the stop list filters frequent false positives.
    SURNAME_CONST = rule(
        and_(
            SGTM,
            ANIM,
            not_(NAME),
            not_(PATR),
            not_(eq('по')),
            not_(eq('ленина')),
            not_(eq('ульянова'))
        )
    )

    # Surnames the heuristic misses, listed once each (the original
    # list contained 'удовенко' twice).
    KNOWN_SURNAMES = [
        'Иванов', 'левченко', 'эйхвальд', 'зимина', 'хитарьян',
        'моторин', 'рукавишников', 'деткино', 'буланцев', 'багров',
        'шерл', 'белоцерковский', 'степанов', 'шляхов', 'моисеев',
        'пузанков', 'попиченко', 'сергеев', 'удовенко', 'тютин',
    ]

    SURNAME = or_(
        SURNAME_CONST,
        *[rule(eq(name)) for name in KNOWN_SURNAMES]
    )

    COMPLICATED = rule(SURNAME.repeatable())

    return or_(COMPLICATED)
Example #3
0
def get_rules():
    """Build the yargy rule that matches street names.

    Combines pattern-based street matches (designator + words) with a
    verbatim list of known streets.

    Returns:
        A yargy rule suitable for ``Parser(...)``.
    """
    INT = type('INT')
    NOUN = gram('NOUN')
    ADJF = gram('ADJF')
    GEO = gram('Geox')
    PREP = gram('PREP')
    CONJ = gram('CONJ')

    # A personal name that is neither a preposition nor a place name.
    NAME = rule(and_(gram('Name'), not_(PREP), not_(GEO)))

    NOUN_NOT_CONJ = rule(and_(NOUN, not_(CONJ)))

    # Generic street designators.
    STREET_SUFFIXS = morph_pipeline([
        'улица', 'тракт', 'бульвар', 'проспект', 'микрорайон', 'проезд',
        'шоссе', 'парк'
    ])

    # Designators that typically follow the name ('алтуфьевское шоссе').
    SPECIAL_STREET_SUFFIXS = morph_pipeline(['шоссе', 'тракт'])

    # Known streets matched verbatim; each entry listed once (the
    # original list repeated 'зеленые аллеи').
    SIMPLE_STREETS_FROM_ARRAY = morph_pipeline([
        'краснопресненская', 'республике', 'маршала захарова', 'доватора',
        'мичурина', 'зеленые аллеи', 'бехтеева', 'октябрьская',
        'новогиреевская', 'югорская', 'артема', 'парковая',
        'алтуфьевское', 'горького', 'Кавказский', 'хамовнический вал',
        'Кусковская', 'марьинский парк', 'московская', 'береговая',
        'антонова овсиенко', 'школьная', 'юнтоловский', 'гагарина'
    ])

    # Token that is not a house/flat designator, number, or conjunction.
    NOUN_NOT_APPART = rule(not_(or_(eq('дом'), eq('квартира'), INT, CONJ)))

    COMPLICATED_STREETS = or_(
        rule(STREET_SUFFIXS, INT, NOUN, NOUN),
        rule(STREET_SUFFIXS, INT, ADJF, NOUN),
        rule(STREET_SUFFIXS, NOUN_NOT_CONJ, NOUN_NOT_APPART, NAME.optional()),
        rule(NAME, NOUN_NOT_APPART), rule(ADJF, NAME),
        rule(STREET_SUFFIXS, ADJF, NOUN_NOT_APPART),
        rule(STREET_SUFFIXS, CONJ, NOUN, NOUN))

    SIMPLE_STREETS_WITH_STREET_SUFFIX = rule(STREET_SUFFIXS, NOUN_NOT_APPART)
    SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX = rule(ADJF,
                                                     SPECIAL_STREET_SUFFIXS)

    SIMPLE_STREETS = or_(SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX,
                         SIMPLE_STREETS_WITH_STREET_SUFFIX,
                         SIMPLE_STREETS_FROM_ARRAY)

    return or_(COMPLICATED_STREETS, SIMPLE_STREETS)
Example #4
0
def get_first_rules():
    """Build the yargy rule that matches (runs of) first names.

    A first name is an animate word tagged 'Name' that is not
    singular-only and not in the stop list of known false positives.

    Returns:
        A yargy rule suitable for ``Parser(...)``.
    """
    ANIM = gram('anim')
    SGTM = gram('Sgtm')
    NAME = gram('Name')

    # Words the morphology wrongly tags as first names in this corpus.
    STATE = or_(
        eq('моторин'),
        eq('юрок'),
        eq('вакула'),
        eq('эйхвальд'),
        eq('иммуно'),
        eq('из'),
        eq('славы'),
        eq('хайбулаев'),
        eq('михална'),
        eq('валиде'),
        eq('шиян'),
        eq('сим'),
        eq('мазитов'),
        eq('хамидов')
    )

    NAME_CONST = rule(
        and_(
            NAME,
            ANIM,
            not_(SGTM),
            not_(STATE)
        )
    )

    COMPLICATED = rule(NAME_CONST.repeatable())

    return or_(COMPLICATED)
    def __init__(self, logger=None, env='local'):
        """Set up logging, the tokenizer, and the OGRN parsers.

        :param logger: externally supplied logger; when None a rotating
            file logger named "OGRNExtractor" is created.
        :param env: environment label; stored as-is on the instance.
        """
        self.env = env

        if logger is not None:
            self.logger = logger
        else:
            # Default logger: DEBUG level, rotating 1 MiB log file with
            # five backups.
            self.logger = logging.getLogger("OGRNExtractor")
            self.logger.setLevel(logging.DEBUG)
            log_handler = RotatingFileHandler("ogrn_extractor.log",
                                              mode='a',
                                              encoding='utf-8',
                                              backupCount=5,
                                              maxBytes=1 * 1024 * 1024)
            log_handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
            self.logger.addHandler(log_handler)

        self.tokenizer = MorphTokenizer()

        # Keyword introducing an OGRN, followed by the number itself.
        ogrn_keyword = morph_pipeline([
            'огрн', 'основной государственный регистрационный номер', 'огрнип'
        ])
        int_token = type('INT')

        self.full_ogrn_parser = Parser(rule(ogrn_keyword, int_token))
        self.ogrn_num_parser = Parser(rule(int_token))
Example #6
0
def get_rules():
    """Build the yargy rule that matches house/building numbers.

    Matches an optional designator ('дом', 'корпус', ...) followed by a
    number with optional letter/separator suffix and a second number.

    Returns:
        A yargy rule suitable for ``Parser(...)``.
    """
    INT = type('INT')
    ADJF = gram('ADJF')

    # Designators that may precede the number.
    HOUSE = morph_pipeline(['дом', 'корпус', 'квартира', 'строение', 'ст'])

    # Any non-adjective token (separators, stray letters, ...).
    HOUSE_NOT = rule(and_(not_(ADJF)))
    # Literal letter/separator suffixes: '12а', '12/1', ...
    HOUSE1 = morph_pipeline(['a', 'а', '/', 'б'])

    # A number (bare '3' excluded), optionally followed by a letter
    # suffix, a non-adjective token, and a second number.
    UNIT1 = or_(
        rule(and_(INT, not_(eq('3'))), HOUSE1.optional(), HOUSE_NOT.optional(),
             INT.optional()))

    UNIT = or_(rule(HOUSE.optional(), UNIT1))

    COMPLICATED_HOUSE = rule(UNIT.repeatable())

    return or_(COMPLICATED_HOUSE)
Example #7
0
def get_mid_rules():
    """Build the yargy rule that matches (runs of) patronymics.

    Returns:
        A yargy rule suitable for ``Parser(...)``.
    """
    PATR = gram('Patr')

    PATRONYMIC = rule(PATR)

    COMPLICATED = rule(PATRONYMIC.repeatable())

    return or_(COMPLICATED)
Example #8
0
# Interpretation facts for building and room references.
Building = fact(
    'Building',
    ['number', 'type']
)
Room = fact(
    'Room',
    ['number', 'type']
)


# Token-level shortcuts.
DASH = eq('-')
DOT = eq('.')

ADJF = gram('ADJF')  # adjective
NOUN = gram('NOUN')  # noun
INT = type('INT')    # integer token
TITLE = is_title()   # capitalized token

# Ordinal written as digits plus an adjectival ending: '3-я', '2 й', ...
ANUM = rule(
    INT,
    DASH.optional(),
    in_caseless({
        'я', 'й', 'е',
        'ое', 'ая', 'ий', 'ой'
    })
)


#########
#
#  STRANA
Example #9
0
from yargy.interpretation import (fact, const, attribute)
from yargy.predicates import (eq, length_eq, in_, in_caseless, type,
                              normalized, caseless, dictionary)

# A single extracted fragment.
Part = fact('Part', ['part'])

# Money/salary range fact: min amount is required; max amount,
# currency, scale multiplier, and pay period default to sentinel
# values that mark a missing field.
Money = fact('Money', [
    'integer_min',
    attribute('integer_max', -1),
    attribute('currency', '-'),
    attribute('multiplier', -1),
    attribute('period', '-')
])

DOT = eq('.')
INT = type('INT')

########
#
#   CURRENCY
#
##########

# Currency words/symbols, each interpreted as a constant ISO-like code.
EURO = or_(normalized('евро'), normalized('euro'), eq('€'),
           caseless('EUR')).interpretation(const('EUR'))

DOLLARS = or_(normalized('доллар'), normalized('дол'), normalized('dollar'),
              eq('$'), caseless('USD')).interpretation(const('USD'))

RUBLES = or_(
    rule(normalized('ruble')),
        if not min.currency:
            min.currency = max.currency
        # if (min.currency is not None) and (min.currency != 'RUB') and (max.currency is not None):
        #     max.currency
        elif min.currency != max.currency:
            min.currency = max.currency
        # для рублевых вилок типа 150-250 без указания тысяч домножаем на тысячу
        if (max.amount < 1000) and (min.amount < 1000) and (max.currency
                                                            == 'RUB'):
            min.amount *= 1000
            max.amount *= 1000
        return dsl.Range(min, max)


# NOTE(review): DOT/INT are re-declared here; identical definitions
# appear earlier in this file — verify which copy is actually used.
DOT = eq('.')
INT = type('INT')

########
#
#   CURRENCY
#
##########

# EURO = or_(
#     normalized('евро'),
#     #in_(['€', 'EUR'])
#     eq('€'),
#     #eq('EUR')
# ).interpretation(
#     const(dsl.EURO)
# )
Example #11
0
def yargy_parser(path):
    """Extract reliability metrics (MTBF/MTTR) from the text file at *path*.

    Two-pass approach: pass 1 trims each text window down to the spans
    between adjacent metric mentions; pass 2 parses the trimmed segments
    into RULE facts.

    :param path: path of the text file to parse.
    :returns: list of RULE facts with fields
        name  - the metric keyword matched,
        tresh - free text between the keyword and the value,
        num   - the numeric value (with optional unit).
    Side effect: prints the resulting list.
    """
    RULE = fact(
        'RULE',
        ['name', 'tresh', 'num']
    )
    INT = type('INT')

    DOT = or_(eq('.'), eq(','))

    NAME_mtbf = morph_pipeline(
        [
            'MTTF',
            'MTBF',
            'mean time between',
            'mean time between failures',
            'mean time between failure',
        ]
    ).interpretation(
        RULE.name
    )

    NAME_mttr = morph_pipeline(
        [
            'MTTR',
            'mean time to',
            'Mean Time To Repair',
            'repair time',
        ]
    ).interpretation(
        RULE.name
    )

    # Number shapes: '1.5', '1,5', '1.234.567', '10 000', '1 000 000', ...
    NUM_MTBF = or_(rule(INT, DOT, INT), rule(INT),
                   rule(INT, DOT, INT, DOT, INT),
                   rule(INT, INT), rule(INT, INT, INT))

    UNIT_mtbf = morph_pipeline(
        [
            'year',
            'years',
            'hour',
            'hours',
            'год',
            'час',
            'h',
            'ч',
            'тыс. часов'
        ]
    )

    UNIT_mttr = morph_pipeline(
        [
            'hour',
            'hours',
            'час',
            'h',
            'ч'
        ]
    )

    X_mtbf = rule(NUM_MTBF, UNIT_mtbf.optional()
                 ).interpretation(
                     RULE.num
                 )

    X_mttr = rule(INT, UNIT_mttr.optional()
                 ).interpretation(
                     RULE.num
                 )

    # Filler tokens between the metric name and its value.
    # NOTE(review): eq() is applied to rule objects here; verify yargy
    # treats these as intended — eq() normally compares token values.
    TRESH = rule(and_(not_(eq(NUM_MTBF)), or_(not_(eq(NAME_mttr)),
                                              not_(eq(NAME_mtbf))),
                      not_(eq(UNIT_mtbf)), not_(eq(DOT)),
                      not_(eq(INT)), not_(eq(X_mttr)), not_(eq(X_mtbf)))
                ).interpretation(
                    RULE.tresh
                )

    rule_1 = (rule(NAME_mtbf, (TRESH.optional()).repeatable(), X_mtbf).repeatable()
             ).interpretation(
                 RULE
             )

    rule_2 = (rule(NAME_mttr, (TRESH.optional()).repeatable(), X_mttr).repeatable()
             ).interpretation(
                 RULE
             )

    # Read the whole file; 'with' guarantees the handle is closed
    # (the original leaked it).
    with open(path, 'r') as f:
        text = f.read()
    # Remove line separators (raw string: '\s' is a regex class).
    line = re.sub(r"^\s+|\n|\r|\s+$", '', text)
    # Temporary workaround: split into overlapping windows so parsing
    # time stays bounded on large inputs. The upper bound is len(line)
    # (the original's len(line) - 1 dropped the final character).
    n = 10000
    windows = [line[i - 5 if i - 5 > 0 else 0:
                    i + n + 5 if i + n + 5 < len(line) else len(line)]
               for i in range(0, len(line), n)]

    MEASURE = rule(or_(X_mttr, X_mtbf, NAME_mttr, NAME_mtbf))
    new_line = []
    # Pass #1: keep only text between each adjacent pair of metric
    # mentions, separated by ' \n ' markers.
    parser = Parser(MEASURE)
    for window in windows:
        spans = [m.span for m in parser.findall(window)]
        if len(spans) >= 2:
            for first, second in zip(spans, spans[1:]):
                new_line.append(window[first[0]:second[1]])
                new_line.append(' \n ')
    segments = ''.join(new_line).split('\n')

    LIST = []
    MEASURE = or_(rule_1, rule_2).interpretation(
        RULE
    )
    # Pass #2: parse the reliability metrics proper. Failures on a
    # single segment are logged and skipped (narrowed from a bare
    # except: which also swallowed KeyboardInterrupt).
    parser = Parser(MEASURE)
    for segment in segments:
        try:
            for match in parser.findall(segment):
                LIST.append(match.fact)
        except Exception:
            print('Yargy failure: you normally don`t need to report that to us.')
    print(LIST)
    return LIST
    def __init__(self, logger=None, env='local'):
        """Configure logging and build the yargy grammars and parsers
        for recognizing Russian legal-entity names.

        :param logger: externally supplied logger; when None a rotating
            file logger named "LegalEntitiesExtractor" is created.
        :param env: environment label; stored as-is on the instance.
        """
        self.env = env

        # Default logger: DEBUG level, rotating 1 MiB file, 5 backups.
        if logger is None:
            self.logger = logging.getLogger("LegalEntitiesExtractor")
            self.logger.setLevel(logging.DEBUG)
            handler = RotatingFileHandler("legal_entities_extractor.log",
                                          mode='a',
                                          encoding='utf-8',
                                          backupCount=5,
                                          maxBytes=1 * 1024 * 1024)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        else:
            self.logger = logger

        self.tokenizer = MorphTokenizer()
        self.morph = pymorphy2.MorphAnalyzer()

        # Word lists used elsewhere for normalizing extracted names
        # (presumably by methods outside this view — confirm usage).
        self.NOUNS_TO_NORMALIZE = [
            'общество', 'объединение', 'учреждение', 'предприятие',
            'департамент', 'организация', 'союз', 'центр'
        ]
        self.ADJ_TO_NORMALIZE_TO_NEUT = [
            'акционерный', 'публичный', 'музейный', 'государственный',
            'казенный', 'казённый', 'унитарный'
        ]

        # LegalName = fact('LegalName', ['shortname', 'fullname'])
        # LegalForm = fact('LegalForm', ['shortform', 'fullform'])
        # LegalEnity = fact('LegalEnity', ['LegalForm','LegalName'])

        # Full legal-form phrases ('общество с ограниченной
        # ответственностью', ...) in any morphological form.
        LEGAL_FORM_FULL = morph_pipeline([
            'общество с ограниченной ответственностью', 'акционерное общество',
            'закрытое акционерное общество', 'открытое акционерное общество',
            'акционерное общество управляющая компания',
            'управляющая компания', 'публичное акционерное общество',
            'музейное объединение', 'государственное казенное учреждение',
            'государственное унитарное предприятие', 'департамент'
        ])

        # Abbreviated legal forms.
        LEGAL_FORM_SHORT = morph_pipeline(['ПАО', 'ЗАО', 'ОАО', 'АО', 'ООО'])

        LEGAL_FORM = or_(LEGAL_FORM_SHORT, LEGAL_FORM_FULL)

        # Straight, guillemet, or single quotes around the proper name.
        OPEN_QUOTE = or_(eq('\"'), eq('«'), eq('\''))
        CLOSE_QUOTE = or_(eq('\"'), eq('»'), eq('\''))

        INT = type('INT')
        LATIN = type('LATIN')
        FULL_NAME_SIMBOLS = or_(eq('&'), OPEN_QUOTE)
        SHORT_NAME_SIMBOLS = or_(eq('+'), eq('!'), eq('№'))
        LATIN_NAME_SIMBOLS = or_(eq('.'), eq('&'))

        # Geographic noun that may sit between the form and the name.
        GEO_TAG = rule(gram('NOUN'), gram('Geox'))

        WORD_IN_NAME = or_(gram('NOUN'), gram('ADJF'), gram('ADJS'))

        # Registry keywords that must not be swallowed into a name.
        WORD_NOT_IN_SHORT_NAME = or_(eq('ИНН'), eq('ОГРН'))

        WORD_IN_SHORT_NAME = or_(gram('NOUN'), gram('ADJF'))

        WORD_IN_SHORT_NAME_FINAL = and_(WORD_IN_SHORT_NAME,
                                        not_(WORD_NOT_IN_SHORT_NAME))

        WORD_IN_LATIN_NAME = or_(LATIN, LATIN_NAME_SIMBOLS)

        # Latin-script name: at least two latin tokens/symbols.
        LATIN_NAME = rule(WORD_IN_LATIN_NAME.repeatable(min=2))

        # Form [+ geo] + quoted proper name.
        FULL_LEGAL_ENTITY = rule(LEGAL_FORM, GEO_TAG.optional(), OPEN_QUOTE,
                                 WORD_IN_NAME.repeatable(), CLOSE_QUOTE)
        # Short form + single unquoted word.
        SIMPLE_LEGAL_ENTITY = rule(LEGAL_FORM_SHORT, WORD_IN_SHORT_NAME_FINAL)
        # Full form + unquoted words (typical for state bodies).
        GOV_ENTITY = rule(LEGAL_FORM_FULL,
                          WORD_IN_SHORT_NAME.repeatable(min=1))

        LEGAL_ENTITY = or_(FULL_LEGAL_ENTITY, SIMPLE_LEGAL_ENTITY, GOV_ENTITY)

        self.full_legal_parser = Parser(LEGAL_ENTITY)
        self.legal_form_parser = Parser(LEGAL_FORM)
        self.legal_latin_parser = Parser(LATIN_NAME)
Example #13
0
    GENT_GROUP,
    or_(
        rule(normalized('имя')),
        rule(caseless('им'),
             eq('.').optional()),
    ),
    or_(
        NAME,
        PERSON,
    ),
)

# Latin-script organisation: a TYPE designator followed by one or more
# capitalized LATIN tokens, or LATIN tokens joined by '&', '/' or '.'.
# (TYPE is defined elsewhere in this file.)
LATIN = rule(
    TYPE,
    or_(rule(and_(
        type('LATIN'),
        is_capitalized(),
    )), rule(
        type('LATIN'),
        in_({'&', '/', '.'}),
        type('LATIN'),
    )).repeatable())

# Word the morphology tags as an organisation ('Orgn'), followed by a
# genitive group (GENT_GROUP is defined elsewhere in this file).
KNOWN = rule(
    gram('Orgn'),
    GENT_GROUP,
)

ORGANISATION_ = or_(
    TRIPLE_QUOTED,
    QUOTED,
Example #14
0
    """
    def __call__(self, token):
        return len(token.value) > self.value


# Interpretation facts for lecturers and personal names.
Lector = fact('Lector', ['name', 'degree'])

Name = fact(
    'Name',
    [attribute('first', ''),
     attribute('middle', ''),
     attribute('last', '')])

DOT = eq('.')

# Capitalized Russian word longer than one letter -> last name,
# re-capitalized on interpretation.
LAST = and_(type('RU'), is_capitalized(),
            length_grt(1)).interpretation(Name.last.custom(str.capitalize))

# Word tagged as a first name, longer than one letter.
FIRST = and_(gram('Name'),
             length_grt(1)).interpretation(Name.first.custom(str.capitalize))

# Word tagged as a patronymic, longer than one letter.
MIDDLE = and_(gram('Patr'),
              length_grt(1)).interpretation(Name.middle.custom(str.capitalize))
# A single capital letter — an initial.
ABBR = and_(length_eq(1), is_capitalized())

FIRST_ABBR = ABBR.interpretation(Name.first.custom(str.upper))

MIDDLE_ABBR = ABBR.interpretation(Name.middle.custom(str.upper))

# NOTE(review): presumably names to exclude from parsing — verify
# against the code that consumes this set.
unsubstantial = {'Бен Режеб Т.Б.К.'}
Example #15
0
    or_(
        rule(normalized('имя')),
        rule(caseless('им'), eq('.').optional()),
    ),
    or_(
        NAME,
        PERSON,
    ),
)

# Latin-script organisation: a TYPE designator followed by one or more
# capitalized LATIN tokens, or LATIN tokens joined by '&', '/' or '.'.
# (TYPE is defined elsewhere in this file.)
LATIN = rule(
    TYPE,
    or_(
        rule(
            and_(
                type('LATIN'),
                is_capitalized(),
            )
        ),
        rule(
            type('LATIN'),
            in_({'&', '/', '.'}),
            type('LATIN'),
        )
    ).repeatable()
)

# Word the morphology tags as an organisation ('Orgn'), followed by a
# genitive group (GENT_GROUP is defined elsewhere in this file).
KNOWN = rule(
    gram('Orgn'),
    GENT_GROUP,
)
Example #16
0
    is_title
)
from yargy.interpretation import fact
from yargy.relations import gnc_relation
from yargy.pipelines import morph_pipeline
from yargy.tokenizer import QUOTES

# A few constants for cleaner code.
DASH = eq('-')
DOT = eq('.')
SEMICOLON = eq(';')
SPACEBAR = eq(' ')

ADJF = gram('ADJF')  # adjective
NOUN = gram('NOUN')  # noun
INT = type('INT')    # integer token
GEN = gram('gent')   # genitive case
APRO = gram('Apro')  # pronominal adjective
PREP = gram('PREP')  # preposition
CONJ = gram('CONJ')  # conjunction

# The construct below was made for 'критикалов' and 'береговой'-type
# names: optional repeated nouns, an optional adjective, an optional
# number.
SIMPLE_WILDCARD = rule(
  NOUN.repeatable().optional(),
  ADJF.optional(),
  INT.optional()
)

TITLE = is_title()
ANUM = rule(
    INT,