def get_rules():
    """Build the yargy grammar that extracts city names.

    Matches either an optional city keyword followed by a Geox-tagged
    token, or the hyphenated spelling of Saint Petersburg.
    Unused predicate locals from the original were removed.
    """
    # Geox-tagged tokens, excluding words that in this corpus are
    # street/person names rather than cities.
    GEO = rule(
        and_(
            gram('Geox'),
            not_(or_(
                eq('артема'),
                eq('фармана'),
                eq('оскол'),
                eq('мунарева'),
            )),
        )
    )

    # Optional leading city keyword, e.g. "город Нижний Новгород".
    CITY = morph_pipeline(['город', 'Нижний', 'новгород'])

    # "санкт-петербург" tokenizes as three tokens; match them explicitly.
    CITY_PITER = rule(eq('санкт'), eq('-'), eq('петербург'))

    COMPLICATED_CITY = or_(
        rule(CITY.optional(), GEO),
        CITY_PITER,
    )
    FINAL_CITY = or_(COMPLICATED_CITY)
    return FINAL_CITY
def get_second_rules():
    """Build the yargy grammar that extracts surnames.

    A surname is either a singular-only (Sgtm) animate token that is not
    a first name or patronymic (minus a few corpus-specific stop words),
    or one of the explicitly whitelisted surnames; the final rule matches
    one or more such tokens in a row.
    """
    ANIM = gram('anim')
    SGTM = gram('Sgtm')
    PATR = gram('Patr')
    NAME = gram('Name')

    SURNAME_CONST = rule(
        and_(
            SGTM,
            ANIM,
            not_(NAME),
            not_(PATR),
            # Frequent false positives in this corpus.
            not_(eq('по')),
            not_(eq('ленина')),
            not_(eq('ульянова')),
        )
    )

    # Surnames the morphology misses; the duplicated 'удовенко'
    # alternative from the original was removed.
    SURNAME = or_(
        SURNAME_CONST,
        rule(eq('Иванов')),
        rule(eq('левченко')),
        rule(eq('эйхвальд')),
        rule(eq('зимина')),
        rule(eq('хитарьян')),
        rule(eq('моторин')),
        rule(eq('рукавишников')),
        rule(eq('деткино')),
        rule(eq('буланцев')),
        rule(eq('багров')),
        rule(eq('шерл')),
        rule(eq('белоцерковский')),
        rule(eq('степанов')),
        rule(eq('шляхов')),
        rule(eq('моисеев')),
        rule(eq('пузанков')),
        rule(eq('попиченко')),
        rule(eq('сергеев')),
        rule(eq('удовенко')),
        rule(eq('тютин')),
    )

    COMPLICATED = rule(SURNAME.repeatable())
    FINAL = or_(COMPLICATED)
    return FINAL
def get_rules():
    """Build the yargy grammar that extracts street names.

    Combines suffix-driven patterns ("улица <name>", "<adj> шоссе"),
    name-based patterns and a verbatim whitelist of known streets.
    """
    INT = type('INT')
    NOUN = gram('NOUN')
    ADJF = gram('ADJF')
    GEO = gram('Geox')
    PREP = gram('PREP')
    CONJ = gram('CONJ')

    # A proper name that is neither a preposition nor a geo entity.
    NAME = rule(and_(gram('Name'), not_(PREP), not_(GEO)))
    NOUN_NOT_CONJ = rule(and_(NOUN, not_(CONJ)))

    STREET_SUFFIXS = morph_pipeline([
        'улица', 'тракт', 'бульвар', 'проспект', 'микрорайон',
        'проезд', 'шоссе', 'парк'
    ])
    # Suffixes that follow the adjective, e.g. "алтуфьевское шоссе".
    SPECIAL_STREET_SUFFIXS = morph_pipeline(['шоссе', 'тракт'])

    # Known streets matched verbatim; the duplicated 'зеленые аллеи'
    # entry from the original was removed.
    SIMPLE_STREETS_FROM_ARRAY = morph_pipeline([
        'краснопресненская', 'республике', 'маршала захарова',
        'доватора', 'мичурина', 'зеленые аллеи', 'бехтеева',
        'октябрьская', 'новогиреевская', 'югорская', 'артема',
        'парковая', 'алтуфьевское', 'горького', 'Кавказский',
        'хамовнический вал', 'Кусковская', 'марьинский парк',
        'московская', 'береговая', 'антонова овсиенко', 'школьная',
        'юнтоловский', 'гагарина'
    ])

    # A token that cannot start the house/apartment part of an address.
    NOUN_NOT_APPART = rule(not_(or_(eq('дом'), eq('квартира'), INT, CONJ)))

    COMPLICATED_STREETS = or_(
        rule(STREET_SUFFIXS, INT, NOUN, NOUN),
        rule(STREET_SUFFIXS, INT, ADJF, NOUN),
        rule(STREET_SUFFIXS, NOUN_NOT_CONJ, NOUN_NOT_APPART,
             NAME.optional()),
        rule(NAME, NOUN_NOT_APPART),
        rule(ADJF, NAME),
        rule(STREET_SUFFIXS, ADJF, NOUN_NOT_APPART),
        rule(STREET_SUFFIXS, CONJ, NOUN, NOUN),
    )

    SIMPLE_STREETS_WITH_STREET_SUFFIX = rule(STREET_SUFFIXS,
                                             NOUN_NOT_APPART)
    SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX = rule(
        ADJF, SPECIAL_STREET_SUFFIXS)
    SIMPLE_STREETS = or_(
        SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX,
        SIMPLE_STREETS_WITH_STREET_SUFFIX,
        SIMPLE_STREETS_FROM_ARRAY,
    )

    FINAL_STREET = or_(COMPLICATED_STREETS, SIMPLE_STREETS)
    return FINAL_STREET
def get_first_rules():
    """Build the yargy grammar that extracts first (given) names.

    A given name is an animate Name-tagged token that is not
    singularia tantum (those are treated as surnames elsewhere) and is
    not in the stop list; one or more in a row are matched.
    """
    ANIM = gram('anim')
    SGTM = gram('Sgtm')
    NAME = gram('Name')

    # Tokens wrongly tagged as given names in this corpus.
    STATE = or_(
        eq('моторин'), eq('юрок'), eq('вакула'), eq('эйхвальд'),
        eq('иммуно'), eq('из'), eq('славы'), eq('хайбулаев'),
        eq('михална'), eq('валиде'), eq('шиян'), eq('сим'),
        eq('мазитов'), eq('хамидов'),
    )

    NAME_CONST = rule(
        and_(
            NAME,
            ANIM,
            not_(SGTM),
            not_(STATE),
        )
    )

    COMPLICATED = rule(NAME_CONST.repeatable())
    FINAL = or_(COMPLICATED)
    return FINAL
def __init__(self, logger=None, env='local'):
    """Set up logging, the tokenizer and the OGRN parsers.

    :param logger: externally configured logger; when None a rotating
        file logger named "OGRNExtractor" is created.
    :param env: environment name, stored as-is for callers.
    """
    self.env = env
    if logger is None:
        self.logger = logging.getLogger("OGRNExtractor")
        self.logger.setLevel(logging.DEBUG)
        # Guard against attaching a second handler (and duplicating every
        # log line) when the extractor is instantiated more than once.
        if not self.logger.handlers:
            handler = RotatingFileHandler(
                "ogrn_extractor.log", mode='a', encoding='utf-8',
                backupCount=5, maxBytes=1 * 1024 * 1024)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
    else:
        self.logger = logger

    self.tokenizer = MorphTokenizer()

    # "ОГРН <number>" — keyword pipeline followed by an integer token.
    OGRN = morph_pipeline([
        'огрн',
        'основной государственный регистрационный номер',
        'огрнип'
    ])
    INT = type('INT')
    OGRN_NUMBER = rule(OGRN, INT)
    self.full_ogrn_parser = Parser(OGRN_NUMBER)
    # Fallback parser for a bare number without the keyword.
    self.ogrn_num_parser = Parser(rule(INT))
def get_rules():
    """Build the yargy grammar that extracts house/building numbers.

    Matches one or more units of the form
    "<keyword>? <int> <letter-suffix>? <non-adjective>? <int>?".
    Unused predicate locals from the original were removed.
    """
    INT = type('INT')
    ADJF = gram('ADJF')

    # Optional keyword in front of the number.
    HOUSE = morph_pipeline(['дом', 'корпус', 'квартира', 'строение', 'ст'])
    # Any non-adjective token may follow the number.
    HOUSE_NOT = rule(and_(not_(ADJF)))
    # Single-letter / slash suffixes: "12а", "7/2".
    HOUSE1 = morph_pipeline(['a', 'а', '/', 'б'])

    # A number ('3' excluded as a frequent false positive) with optional
    # suffix parts and an optional second number.
    UNIT1 = or_(
        rule(
            and_(INT, not_(eq('3'))),
            HOUSE1.optional(),
            HOUSE_NOT.optional(),
            INT.optional(),
        )
    )
    UNIT = or_(rule(HOUSE.optional(), UNIT1))

    COMPLICATED_HOUSE = rule(UNIT.repeatable())
    FINAL_HOUSE = or_(COMPLICATED_HOUSE)
    return FINAL_HOUSE
def get_mid_rules():
    """Build the yargy grammar that extracts patronymics.

    Matches one or more consecutive Patr-tagged tokens. Unused
    predicate locals from the original were removed.
    """
    PATR = gram('Patr')

    PATRONYMIC = rule(PATR)
    COMPLICATED = rule(PATRONYMIC.repeatable())
    FINAL = or_(COMPLICATED)
    return FINAL
# Interpretation facts for the address parts produced by the rules below.
Building = fact(
    'Building',
    ['number', 'type']
)
Room = fact(
    'Room',
    ['number', 'type']
)

# Single-token predicates shared by the rules below.
DASH = eq('-')
DOT = eq('.')
ADJF = gram('ADJF')
NOUN = gram('NOUN')
INT = type('INT')
TITLE = is_title()

# Ordinal numeral such as "3-я" / "2-ой": an integer, an optional dash,
# and a caseless adjective ending.
ANUM = rule(
    INT,
    DASH.optional(),
    in_caseless({
        'я', 'й', 'е', 'ое', 'ая', 'ий', 'ой'
    })
)

#########
#
# STRANA
from yargy.interpretation import (fact, const, attribute) from yargy.predicates import (eq, length_eq, in_, in_caseless, type, normalized, caseless, dictionary) Part = fact('Part', ['part']) Money = fact('Money', [ 'integer_min', attribute('integer_max', -1), attribute('currency', '-'), attribute('multiplier', -1), attribute('period', '-') ]) DOT = eq('.') INT = type('INT') ######## # # CURRENCY # ########## EURO = or_(normalized('евро'), normalized('euro'), eq('€'), caseless('EUR')).interpretation(const('EUR')) DOLLARS = or_(normalized('доллар'), normalized('дол'), normalized('dollar'), eq('$'), caseless('USD')).interpretation(const('USD')) RUBLES = or_( rule(normalized('ruble')),
if not min.currency: min.currency = max.currency # if (min.currency is not None) and (min.currency != 'RUB') and (max.currency is not None): # max.currency elif min.currency != max.currency: min.currency = max.currency # для рублевых вилок типа 150-250 без указания тысяч домножаем на тысячу if (max.amount < 1000) and (min.amount < 1000) and (max.currency == 'RUB'): min.amount *= 1000 max.amount *= 1000 return dsl.Range(min, max) DOT = eq('.') INT = type('INT') ######## # # CURRENCY # ########## # EURO = or_( # normalized('евро'), # #in_(['€', 'EUR']) # eq('€'), # #eq('EUR') # ).interpretation( # const(dsl.EURO) # )
def yargy_parser(path):
    """Extract MTBF/MTTR reliability metrics from the text file at *path*.

    :param path: path to a UTF-8 text file.
    :return: list of RULE facts (fields: name, tresh, num).
    """
    RULE = fact(
        'RULE',
        ['name', 'tresh', 'num']
    )
    INT = type('INT')
    DOT = or_(eq('.'), eq(','))

    # Metric-name pipelines.
    NAME_mtbf = morph_pipeline([
        'MTTF',
        'MTBF',
        'mean time between',
        'mean time between failures',
        'mean time between failure',
    ]).interpretation(RULE.name)
    NAME_mttr = morph_pipeline([
        'MTTR',
        'mean time to',
        'Mean Time To Repair',
        'repair time',
    ]).interpretation(RULE.name)

    # Numeric shapes: "10", "1.5", "1.234.567", "10 000", "1 000 000".
    NUM_MTBF = or_(
        rule(INT, DOT, INT),
        rule(INT),
        rule(INT, DOT, INT, DOT, INT),
        rule(INT, INT),
        rule(INT, INT, INT),
    )

    UNIT_mtbf = morph_pipeline([
        'year', 'years', 'hour', 'hours', 'год', 'час', 'h', 'ч',
        'тыс. часов'
    ])
    UNIT_mttr = morph_pipeline([
        'hour', 'hours', 'час', 'h', 'ч'
    ])

    X_mtbf = rule(NUM_MTBF, UNIT_mtbf.optional()).interpretation(RULE.num)
    X_mttr = rule(INT, UNIT_mttr.optional()).interpretation(RULE.num)

    # Filler tokens allowed between the metric name and its value.
    TRESH = rule(
        and_(
            not_(eq(NUM_MTBF)),
            or_(not_(eq(NAME_mttr)), not_(eq(NAME_mtbf))),
            not_(eq(UNIT_mtbf)),
            not_(eq(DOT)),
            not_(eq(INT)),
            not_(eq(X_mttr)),
            not_(eq(X_mtbf)),
        )
    ).interpretation(RULE.tresh)

    rule_1 = (
        rule(NAME_mtbf, (TRESH.optional()).repeatable(),
             X_mtbf).repeatable()
    ).interpretation(RULE)
    rule_2 = (
        rule(NAME_mttr, (TRESH.optional()).repeatable(),
             X_mttr).repeatable()
    ).interpretation(RULE)

    # Close the file deterministically (the original leaked the handle).
    with open(path, 'r') as f:
        text = f.read()
    # Remove line separators; raw string avoids invalid-escape warnings.
    text = re.sub(r"^\s+|\n|\r|\s+$", '', text)
    line = text

    # Temporary workaround: split the text into overlapping windows so
    # the parser does not choke on very long inputs. Remove it as the
    # performance grows.
    n = 10000
    text = [
        line[i - 5 if i - 5 > 0 else 0:
             i + n + 5 if i + n + 5 < len(line) else len(line) - 1]
        for i in range(0, len(line), n)
    ]

    MEASURE = rule(or_(X_mttr, X_mtbf, NAME_mttr, NAME_mtbf))
    new_line = []

    # Parser #1: text preprocessing — keep only the stretch of each
    # window between the first and last metric mention.
    parser = Parser(MEASURE)
    for line in text:
        matches = list(parser.findall(line))
        spans = [_.span for _ in matches]
        new_span = [0, 0]
        if spans != [] and len(spans) >= 2:
            for i in range(0, len(spans) - 1, 1):
                mini = 1000000
                maxi = 0
                if spans[i][0] < mini:
                    new_span[0] = spans[i][0]
                    mini = spans[i][0]
                if spans[i + 1][1] > maxi:
                    new_span[1] = spans[i + 1][1]
                    maxi = spans[i + 1][1]
            for i in range(new_span[0], new_span[1]):
                new_line.append(line[i])
            new_line.append(' \n ')
    new_line = ''.join(new_line)
    new_line = new_line.split('\n')

    LIST = []
    MEASURE = or_(rule_1, rule_2).interpretation(RULE)

    # Parser #2: parse the reliability metrics proper.
    parser = Parser(MEASURE)
    for line in new_line:
        try:
            matches = list(parser.findall(line))
            for match in matches:
                LIST.append(match.fact)
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        except Exception:
            print('Yargy failure: you normally don`t need to report that to us.')
    print(LIST)
    return LIST
def __init__(self, logger=None, env='local'):
    """Set up logging, morphology and the legal-entity parsers.

    :param logger: externally configured logger; when None a rotating
        file logger named "LegalEntitiesExtractor" is created.
    :param env: environment name, stored as-is for callers.
    """
    self.env = env
    if logger is None:
        self.logger = logging.getLogger("LegalEntitiesExtractor")
        self.logger.setLevel(logging.DEBUG)
        # Guard against attaching a second handler (and duplicating every
        # log line) when the extractor is instantiated more than once.
        if not self.logger.handlers:
            handler = RotatingFileHandler(
                "legal_entities_extractor.log", mode='a', encoding='utf-8',
                backupCount=5, maxBytes=1 * 1024 * 1024)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
    else:
        self.logger = logger

    self.tokenizer = MorphTokenizer()
    self.morph = pymorphy2.MorphAnalyzer()

    # Word lists used by normalization code elsewhere in the class.
    self.NOUNS_TO_NORMALIZE = [
        'общество', 'объединение', 'учреждение', 'предприятие',
        'департамент', 'организация', 'союз', 'центр'
    ]
    self.ADJ_TO_NORMALIZE_TO_NEUT = [
        'акционерный', 'публичный', 'музейный', 'государственный',
        'казенный', 'казённый', 'унитарный'
    ]

    LEGAL_FORM_FULL = morph_pipeline([
        'общество с ограниченной ответственностью',
        'акционерное общество',
        'закрытое акционерное общество',
        'открытое акционерное общество',
        'акционерное общество управляющая компания',
        'управляющая компания',
        'публичное акционерное общество',
        'музейное объединение',
        'государственное казенное учреждение',
        'государственное унитарное предприятие',
        'департамент'
    ])
    LEGAL_FORM_SHORT = morph_pipeline(['ПАО', 'ЗАО', 'ОАО', 'АО', 'ООО'])
    LEGAL_FORM = or_(LEGAL_FORM_SHORT, LEGAL_FORM_FULL)

    OPEN_QUOTE = or_(eq('\"'), eq('«'), eq('\''))
    CLOSE_QUOTE = or_(eq('\"'), eq('»'), eq('\''))
    LATIN = type('LATIN')
    LATIN_NAME_SIMBOLS = or_(eq('.'), eq('&'))

    GEO_TAG = rule(gram('NOUN'), gram('Geox'))
    WORD_IN_NAME = or_(gram('NOUN'), gram('ADJF'), gram('ADJS'))
    WORD_NOT_IN_SHORT_NAME = or_(eq('ИНН'), eq('ОГРН'))
    WORD_IN_SHORT_NAME = or_(gram('NOUN'), gram('ADJF'))
    WORD_IN_SHORT_NAME_FINAL = and_(
        WORD_IN_SHORT_NAME, not_(WORD_NOT_IN_SHORT_NAME))
    WORD_IN_LATIN_NAME = or_(LATIN, LATIN_NAME_SIMBOLS)
    LATIN_NAME = rule(WORD_IN_LATIN_NAME.repeatable(min=2))

    # ООО «Ромашка» — form, optional geo tag, then a quoted name.
    FULL_LEGAL_ENTITY = rule(
        LEGAL_FORM, GEO_TAG.optional(), OPEN_QUOTE,
        WORD_IN_NAME.repeatable(), CLOSE_QUOTE)
    # Short form followed by a single unquoted word.
    SIMPLE_LEGAL_ENTITY = rule(LEGAL_FORM_SHORT, WORD_IN_SHORT_NAME_FINAL)
    GOV_ENTITY = rule(LEGAL_FORM_FULL, WORD_IN_SHORT_NAME.repeatable(min=1))
    LEGAL_ENTITY = or_(FULL_LEGAL_ENTITY, SIMPLE_LEGAL_ENTITY, GOV_ENTITY)

    self.full_legal_parser = Parser(LEGAL_ENTITY)
    self.legal_form_parser = Parser(LEGAL_FORM)
    self.legal_latin_parser = Parser(LATIN_NAME)
GENT_GROUP, or_( rule(normalized('имя')), rule(caseless('им'), eq('.').optional()), ), or_( NAME, PERSON, ), ) LATIN = rule( TYPE, or_(rule(and_( type('LATIN'), is_capitalized(), )), rule( type('LATIN'), in_({'&', '/', '.'}), type('LATIN'), )).repeatable()) KNOWN = rule( gram('Orgn'), GENT_GROUP, ) ORGANISATION_ = or_( TRIPLE_QUOTED, QUOTED,
""" def __call__(self, token): return len(token.value) > self.value Lector = fact('Lector', ['name', 'degree']) Name = fact( 'Name', [attribute('first', ''), attribute('middle', ''), attribute('last', '')]) DOT = eq('.') LAST = and_(type('RU'), is_capitalized(), length_grt(1)).interpretation(Name.last.custom(str.capitalize)) FIRST = and_(gram('Name'), length_grt(1)).interpretation(Name.first.custom(str.capitalize)) MIDDLE = and_(gram('Patr'), length_grt(1)).interpretation(Name.middle.custom(str.capitalize)) ABBR = and_(length_eq(1), is_capitalized()) FIRST_ABBR = ABBR.interpretation(Name.first.custom(str.upper)) MIDDLE_ABBR = ABBR.interpretation(Name.middle.custom(str.upper)) unsubstantial = {'Бен Режеб Т.Б.К.'}
or_( rule(normalized('имя')), rule(caseless('им'), eq('.').optional()), ), or_( NAME, PERSON, ), ) LATIN = rule( TYPE, or_( rule( and_( type('LATIN'), is_capitalized(), ) ), rule( type('LATIN'), in_({'&', '/', '.'}), type('LATIN'), ) ).repeatable() ) KNOWN = rule( gram('Orgn'), GENT_GROUP, )
is_title ) from yargy.interpretation import fact from yargy.relations import gnc_relation from yargy.pipelines import morph_pipeline from yargy.tokenizer import QUOTES # Несколько констант для более чистого кода DASH = eq('-') DOT = eq('.') SEMICOLON = eq(';') SPACEBAR = eq(' ') ADJF = gram('ADJF') NOUN = gram('NOUN') INT = type('INT') GEN = gram('gent') APRO = gram('Apro') PREP = gram('PREP') CONJ = gram('CONJ') # А творение ниже сделано для критикалов и береговой SIMPLE_WILDCARD = rule( NOUN.repeatable().optional(), ADJF.optional(), INT.optional() ) TITLE = is_title() ANUM = rule( INT,