def test_pipeline():
    """Exercise pipeline, caseless_pipeline and morph_pipeline end to end."""
    # Plain pipeline: multi-token phrases are matched as units.
    grammar = rule(pipeline(['a b c', 'b c']), 'd')
    p = Parser(grammar)
    assert p.match('b c d')
    assert p.match('a b c d')

    # repeatable(): the phrase may occur several times in a row.
    grammar = rule(pipeline(['a b']).repeatable(), 'c')
    p = Parser(grammar)
    assert p.match('a b a b c')

    # caseless_pipeline: letter case is ignored.
    grammar = rule(caseless_pipeline(['A B']), 'c')
    p = Parser(grammar)
    assert p.match('A b c')

    # morph_pipeline: inflected forms of the phrases are matched.
    grammar = morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ])
    p = Parser(grammar)

    found = list(p.findall('текстом песни музыкальной группы'))
    assert len(found) == 1
    assert [token.value for token in found[0].tokens] == ['текстом', 'песни']

    found = list(p.findall('информационного материала под названием'))
    assert len(found) == 1
    assert [token.value for token in found[0].tokens] == [
        'информационного', 'материала']

    # Punctuation inside a pipeline phrase tokenizes the same as in input.
    grammar = morph_pipeline(['1 B.'])
    p = Parser(grammar)
    assert p.match('1 b .')
def update_rules(orgnames):
    """Rebuild the workplace grammar around a fresh organisation-name list.

    Returns a pair of yargy rules: (WORKPLACE_ELEM, WORKPLACE).
    """
    org = caseless_pipeline(orgnames).interpretation(Workplace.org_name)

    # A standalone element: any one of period / organisation / occupation.
    elem = rule(or_(PERIOD, org, OCCUPATION))

    # Organisation and occupation may appear in either order and either may
    # be omitted, but at least one of the two must be present.
    org_with_occupation = or_(
        rule(org, OCCUPATION.optional()),
        rule(org.optional(), OCCUPATION),
        rule(OCCUPATION, org.optional()),
        rule(OCCUPATION.optional(), org),
    )
    workplace = rule(PERIOD, org_with_occupation).interpretation(Workplace)

    return elem, workplace
def test_pipeline():
    """End-to-end checks for yargy's three pipeline variants.

    Covers plain `pipeline` (multi-token phrases matched as units),
    `repeatable()` pipelines, `caseless_pipeline` (case-insensitive), and
    `morph_pipeline` (matches inflected forms of the listed phrases).
    """
    # Plain pipeline: both phrases are recognised before the trailing 'd'.
    RULE = rule(
        pipeline(['a b c', 'b c']),
        'd'
    )
    parser = Parser(RULE)
    assert parser.match('b c d')
    assert parser.match('a b c d')

    # repeatable(): 'a b' may occur several times in a row.
    RULE = rule(
        pipeline(['a b']).repeatable(),
        'c'
    )
    parser = Parser(RULE)
    assert parser.match('a b a b c')

    # caseless_pipeline: 'A B' matches 'A b' regardless of case.
    RULE = rule(
        caseless_pipeline(['A B']),
        'c'
    )
    parser = Parser(RULE)
    assert parser.match('A b c')

    # morph_pipeline: inflected forms match ('текстом песни' for 'текст песни').
    RULE = morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ])
    parser = Parser(RULE)
    matches = list(parser.findall('текстом песни музыкальной группы'))
    assert len(matches) == 1
    match = matches[0]
    assert [_.value for _ in match.tokens] == ['текстом', 'песни']
    matches = list(parser.findall('информационного материала под названием'))
    assert len(matches) == 1
    match = matches[0]
    assert [_.value for _ in match.tokens] == ['информационного', 'материала']

    # Punctuation inside a phrase: '1 B.' still matches tokenized '1 b .'.
    RULE = morph_pipeline(['1 B.'])
    parser = Parser(RULE)
    assert parser.match('1 b .')
from yargy import (rule, or_) from yargy.interpretation import (fact) from yargy.predicates import (dictionary, is_capitalized, eq, caseless) from yargy.pipelines import caseless_pipeline, morph_pipeline from natasha.extractors import Extractor Position = fact('position', ['level', 'field', 'name']) LEVEL = rule( caseless_pipeline([ 'junior', 'middle', 'senior', 'lead', 'chief', 'head', 'team lead', "старший", "младший", "руководитель направления" ]).interpretation(Position.level)) # TODO: нужно учесть жаргонные варианты (датасаентолог, датасатанист и т.д.) Скорее всего, придется парсить регулярками NAME = rule( or_( caseless_pipeline([ 'data scientist', 'data engineer', 'engineer', 'analyst', 'data analyst', 'data manager', 'scientist', 'researcher', "developer", "intern" ]), rule(dictionary(['DS', 'DE']), is_capitalized()), morph_pipeline(["аналитик", "разработчик", "стажер"])).interpretation(Position.name.inflected())) FIELD = rule( caseless_pipeline([ 'ML', 'DL', 'CV', 'computer vision', 'NLP', 'bi', 'machine learning', 'deep learning', 'software', 'research', 'big data', 'python', 'c++', "scala", "java", 'ios', "android", 'devops', "backend", 'frontend'
DASH = eq('-') RANGE_MONEY = rule(CURRENCY.optional(), AMOUNT, CURRENCY.optional()).interpretation(Money) RANGE_MIN = rule(eq('от').optional(), RANGE_MONEY.interpretation(Range.min)) RANGE_MAX = rule( # eq('до').optional(), RANGE_MONEY.interpretation(Range.max)) # TODO: пока не интерпретируется TAXATION = rule( caseless_pipeline([ 'чистыми', "грязными", "до налогов", "после налогов", "на руки", "gross", "гросс", 'net', "нетто", "до НДФЛ", "после НДФЛ", "до вычета НДФЛ", "после вычета НДФЛ" ])) FORK = rule(dictionary({'fork', 'Вилка', 'ЗП', 'Оклад'}), eq(':').optional()) RANGE = rule( FORK.optional(), RANGE_MIN, or_(DASH, eq('до')), # раньше был DASH.optional(), RANGE_MAX, TAXATION.interpretation(Range.taxation).optional()).interpretation(Range) def parse_money_emojis(message: dict): big_money_emojis = { "moneyparrot", "moneys", "moneybag", "money_with_wings", "printing-money", "money_mouth_face"
'двести': 200, 'триста': 300, 'четыреста': 400, 'пятьсот': 500, 'шестьсот': 600, 'семьсот': 700, 'восемьсот': 800, 'девятьсот': 900, 'тысяча': 10**3, 'миллион': 10**6, 'миллиард': 10**9, 'триллион': 10**12, } DOT = eq('.') INT = type('INT') THOUSANDTH = rule(caseless_pipeline(['тысячных', 'тысячная'])).interpretation(const(10**-3)) HUNDREDTH = rule(caseless_pipeline(['сотых', 'сотая'])).interpretation(const(10**-2)) TENTH = rule(caseless_pipeline(['десятых', 'десятая'])).interpretation(const(10**-1)) THOUSAND = or_( rule(caseless('т'), DOT), rule(caseless('тыс'), DOT.optional()), rule(normalized('тысяча')), rule(normalized('тыща')) ).interpretation(const(10**3)) MILLION = or_( rule(caseless('млн'), DOT.optional()), rule(normalized('миллион')) ).interpretation(const(10**6)) MILLIARD = or_( rule(caseless('млрд'), DOT.optional()), rule(normalized('миллиард'))
from .helpers import TOKENIZER, ID_TOKENIZER, load_named_entities from .education import EducationExtractor from .workplace import WorkplaceExtractor from .hobby import HobbyExtractor from yargy.parser import Parser from yargy.pipelines import pipeline, caseless_pipeline EXP_TITLE = pipeline(['Опыт работы']) EDU_TITLE = pipeline(['Образование']) EXTRA_EDU_TITLE = caseless_pipeline(['Курсы', 'Сертификаты']) HOBBY_TITLE = caseless_pipeline(['Хобби', 'Увлечения']) def parse(text): named_entities = load_named_entities(text) exp_tokens = edu_tokens = hobby_tokens = tokens = list(TOKENIZER(text)) extra_edu_tokens = [] parser = Parser(EXP_TITLE, tokenizer=ID_TOKENIZER) exp_title = parser.find(tokens) parser = Parser(EDU_TITLE, tokenizer=ID_TOKENIZER) edu_title = parser.find(tokens) parser = Parser(HOBBY_TITLE, tokenizer=ID_TOKENIZER) hobby_title = parser.find(tokens) if exp_title:
from yargy import (rule, or_, Parser) from yargy.predicates import (eq, gram) from yargy.pipelines import (caseless_pipeline, pipeline) from yargy.interpretation import (fact, attribute) from .helpers import ID_TOKENIZER, select_span_tokens, show_matches Hobby = fact('Hobby', [attribute('name').repeatable()]) HYPHEN = rule(pipeline(['-', '—', '–'])) COLON = rule(eq(':')) COMMA = rule(eq(',')) DOT = rule(eq('.')) TITLES = caseless_pipeline(['Хобби', 'Увлечения']) TITLE = rule(TITLES, or_(COLON, HYPHEN)) ITEM = rule(or_(gram('NOUN'), gram('ADJF')).repeatable(max=3)).interpretation(Hobby.name) HOBBY_ITEMS = rule(or_(TITLE, ITEM, COMMA, DOT)) HOBBIES = rule( TITLE, rule(ITEM, or_(COMMA, DOT)).repeatable(), ).interpretation(Hobby) class HobbyExtractor:
def __init__(self, filename, university):
    """Parse an RPD (course working-programme) document into `documentText`.

    Builds yargy pipelines for the Russian section headers of an RPD
    document, then extracts each section (goals, learning outcomes,
    links, structure, lectures, practicals, self-study) into the
    `self.documentText` dict, keyed by Russian section names.

    NOTE(review): this body was reconstructed from whitespace-mangled
    source; the exact statement nesting (especially around the
    'результаты обучения' handling) should be confirmed against the
    original file.
    """
    self.filename = filename
    self.university = university

    # --- section-header grammars ------------------------------------
    # morph_pipeline matches inflected forms of the phrases;
    # caseless_pipeline matches the exact phrases ignoring case.
    self.rpd_task_and_goals = morph_pipeline([
        'цели и задачи', 'цели освоения', 'задачи освоения',
        'аннотация', 'краткое содержание', 'краткое описание'
    ])
    self.rpd_education_result = morph_pipeline(
        ['планируемый результат обучение', 'компетенции'])
    self.rpd_discipline_link = morph_pipeline(
        ['место учебный дисциплина', 'место дисциплины'])
    self.rpd_discipline_structure = caseless_pipeline(
        ['содержание дисциплины', 'структура дисциплины'])
    self.rpd_lecture_theme = morph_pipeline(['лекции'])
    self.rpd_practice_theme = morph_pipeline([
        'практические занятия', 'семинар', 'семинарские занятия',
        'лабораторные работы'
    ])
    self.rpd_selfwork_theme = morph_pipeline([
        'самостоятельная работа обучающихся по дисциплине',
        'самостоятельная работа студентов', 'домашняя работа'
    ])
    # ЗУН = знать/уметь/владеть (know / be able to / master) markers.
    self.rpd_education_zyn = rule(dictionary({'Знать', 'Уметь', 'Владеть'}))
    self.section_rule = rule(
        dictionary({"раздел", "тема", "дисциплина", "наименование"}))
    self.prd_lectures = rule(
        morph_pipeline([
            'тема лекций', 'содержание занятий',
            'содержание лекционного занятия'
        ]))
    self.prd_practices = rule(
        morph_pipeline(
            ['наименование', 'содержание практического занятия', 'тема']))
    self.rpd_srs = rule(
        morph_pipeline([
            'СРС', 'содержание занятий', 'содержание задания',
            'тема СРО', 'тема СРС'
        ]))
    self.rpd_name = rule(
        morph_pipeline([
            'рабочая программа дисциплины', 'дисциплина',
            'программа дисциплины'
        ]))
    self.table_rpd_name = rule(dictionary({'дисциплина'}))
    self.rpd_lectures_optional = rule(morph_pipeline(['содержание']))
    self.rpd_practices_optional = rule(
        morph_pipeline(['содержание', 'cодержание практического занятия']))
    self.rpd_srs_optional = rule(
        morph_pipeline(['содержание', 'содержание задания']))

    # Parsed output, raw headers and full text accumulators.
    self.documentText = dict()
    self.docs_headers = list()
    self.fullText = list()

    # --- one Parser per grammar -------------------------------------
    parser_RPD_task_and_goals = Parser(self.rpd_task_and_goals)
    parser_RPD_education_result = Parser(self.rpd_education_result)
    parser_RPD_discipline_link = Parser(self.rpd_discipline_link)
    parser_PRD_discipline_structure = Parser(self.rpd_discipline_structure)
    parser_PRD_lecture_theme = Parser(self.rpd_lecture_theme)
    parser_RPD_practice_theme = Parser(self.rpd_practice_theme)
    parser_RPD_selfwork_theme = Parser(self.rpd_selfwork_theme)
    parser_PRD_zyn_result = Parser(self.rpd_education_zyn)
    parser_PRD_themes = Parser(self.section_rule)
    parser_PRD_lectures = Parser(self.prd_lectures)
    parser_PRD_practices = Parser(self.prd_practices)
    parser_RPD_srs = Parser(self.rpd_srs)
    parser_RPD_name = Parser(self.rpd_name)
    self.parser_table_RPD_name = Parser(self.table_rpd_name)
    parser_RPD_lectures_desc = Parser(self.rpd_lectures_optional)
    parser_RPD_practices_desc = Parser(self.rpd_practices_optional)
    parser_RPD_srs_desc = Parser(self.rpd_srs_optional)

    # --- extraction --------------------------------------------------
    self.get_rpd_text(filename)
    self.documentText['университет'] = self.university
    self.documentText['название дисциплины'] = self.get_rpd_name(
        parser_RPD_name)
    self.documentText[
        'направление подготовки'] = self.get_direction_of_preparation()
    self.documentText['цели и задачи'] = "".join(
        self.find_boundries(parser_RPD_task_and_goals))
    self.documentText['результаты обучения'] = self.find_boundries(
        parser_RPD_education_result)

    # Look for an embedded table ("Таблица: ..."); `flag` records whether
    # one was found for get_zyn_results.
    fgos_table = ""
    flag = True
    if self.documentText['результаты обучения'] is not None:
        for item in self.documentText['результаты обучения']:
            if "Таблица: " in item:
                fgos_table = item[8:]  # strip the "Таблица: " prefix
                self.documentText['результаты обучения'] = item
        if fgos_table == "":
            # No table: fall back to the raw section text.
            fgos_table = self.documentText['результаты обучения']
            flag = False
        self.documentText['ЗУН'] = self.get_zyn_results(
            fgos_table, parser_PRD_zyn_result, flag)
        # Flatten the ЗУН mapping into a single space-separated string.
        temp = ""
        for key, value in self.documentText['ЗУН'].items():
            temp += key + " "
            for item in value:
                temp += "".join(item) + " "
        self.documentText['ЗУН'] = temp.replace("~", "")
        self.documentText['компетенции'] = self.search_place_fgos(
            "".join(fgos_table))
        # Flatten the competence mapping as well.
        temp = ""
        for key, value in self.documentText['компетенции'].items():
            temp += key + " " + value
        self.documentText['компетенции'] = temp
        # '~' and '@' are internal cell/row separators — render as tabs
        # and newlines.
        self.documentText['результаты обучения'] = "".join(
            self.documentText['результаты обучения']).replace(
                "~", '\t').replace("@", '\n')

    self.documentText['связь дисциплины'] = "".join(
        self.find_boundries(parser_RPD_discipline_link)).replace(
            "Таблица: ", "").replace("~", "\t").replace("@", "\n")

    # Discipline structure: keep the first embedded table for theme lookup.
    self.documentText['структура дисциплины'] = self.find_boundries(
        parser_PRD_discipline_structure)
    discipline_themes_table = ""
    for item in self.documentText['структура дисциплины']:
        if "Таблица: " in item:
            discipline_themes_table = item
            break
    self.documentText['структура дисциплины'] = "".join(
        self.documentText['структура дисциплины']).replace(
            "Таблица: ", '').replace("~", '\t').replace("@", "\n")
    self.documentText['темы структуры дисципилны'] = "".join(
        self.convert_string_to_table(discipline_themes_table[8:],
                                     parser_PRD_themes))

    # Lectures: themes and descriptions come from the embedded table.
    self.documentText['лекции'] = self.find_boundries(
        parser_PRD_lecture_theme)
    if self.documentText['лекции'] is not None:
        discipline_lectures_table = ""
        for item in self.documentText['лекции']:
            if "Таблица: " in item:
                discipline_lectures_table = item
                break
        self.documentText['темы лекций'] = "".join(
            self.convert_string_to_table(discipline_lectures_table[8:],
                                         parser_PRD_lectures))
        self.documentText['описание лекций'] = "".join(
            self.convert_string_to_table(discipline_lectures_table[8:],
                                         parser_RPD_lectures_desc))
        self.documentText['лекции'] = "".join(
            self.documentText['лекции']).replace("Таблица: ", '').replace(
                "~", '\t').replace("@", '\n')

    # Practicals: same pattern as lectures.
    self.documentText['практики'] = self.find_boundries(
        parser_RPD_practice_theme)
    if self.documentText['практики'] is not None:
        discipline_practises_table = ""
        for item in self.documentText['практики']:
            if "Таблица: " in item:
                discipline_practises_table = item
                break
        self.documentText['темы практик'] = "".join(
            self.convert_string_to_table(discipline_practises_table[8:],
                                         parser_PRD_practices))
        # Description extraction is disabled; original call kept for reference:
        self.documentText['описание практик'] = ""  # self.convert_string_to_table(discipline_lectures_table[8:],parser_RPD_practices_desc)
        self.documentText['практики'] = "".join(
            self.documentText['практики']).replace(
                "Таблица: ", '').replace("~", '\t').replace("@", '\n')

    # Self-study (СРС): same pattern as lectures/practicals.
    self.documentText['СРС'] = self.find_boundries(
        parser_RPD_selfwork_theme)
    if self.documentText['СРС'] is not None:
        discipline_srs_table = ""
        for item in self.documentText['СРС']:
            if "Таблица: " in item:
                discipline_srs_table = item
                break
        self.documentText['темы СРС'] = "".join(
            self.convert_string_to_table(discipline_srs_table[8:],
                                         parser_RPD_srs))
        # Description extraction is disabled; original call kept for reference:
        self.documentText['описание СРС'] = ""  # self.convert_string_to_table(discipline_srs_table[8:], parser_RPD_srs_desc)
        self.documentText['СРС'] = "".join(
            self.documentText['СРС']).replace("Таблица: ", '').replace(
                "~", '\t').replace("@", '\n')
YEAR = and_(gte(1900), lte(2100)).interpretation(Date.year.custom(int)) DATE = rule(MONTH_NAME, YEAR).interpretation(Date) Work_period = fact('Work_period', ['from_date', 'to_date']) FROM_DATE = DATE.interpretation(Work_period.from_date) TO_DATE = DATE.interpretation(Work_period.to_date) PERIOD = rule( FROM_DATE, HYPHEN.optional(), or_(TO_DATE, pipeline([ 'н.в.', 'настоящее время', 'работает сейчас' ]))).interpretation(Work_period).interpretation(Workplace.period) OCC_NAMES = caseless_pipeline(OCCUPATIONS) OCCUPATION = rule( or_( rule(OCC_NAMES), rule(OCC_NAMES, COMMA, not_(eq(OCC_NAMES)).repeatable(max=3), OCC_NAMES))).interpretation(Workplace.occupation) def update_rules(orgnames): ORGANIZATION = caseless_pipeline(orgnames).interpretation( Workplace.org_name) WORKPLACE_ELEM = rule(or_(PERIOD, ORGANIZATION, OCCUPATION))
'max_load_index', 'season', 'spikes' ]) Vendor = fact('Vendor', ['id', 'name']) # HELPERS SEP = in_({'-', '/', '|', ':', ';', '.'}) NUM = type_('INT') INT = NUM.interpretation(interp.custom(int)) FLOAT = rule(NUM.repeatable(), in_({',', '.'}), NUM, NUM.optional()).interpretation(interp.custom(to_float)) # TIRE_VENDORS VENDORS_NAME, VENDORS_ID = get_vendor_dict(tires_vendors_path) VENDOR = rule( caseless_pipeline(VENDORS_NAME).interpretation( Vendor.name.normalized().custom( VENDORS_NAME.get))).interpretation(Vendor) # TIRE_HELPERS DIAMETER_WITH_LETTER = rule(NUM, or_(eq('С'), eq('C')).optional()) STRUCTURE = or_( rule(or_(INT, FLOAT), SEP, or_(INT, FLOAT), SEP, or_(INT, DIAMETER_WITH_LETTER)), rule(or_(INT, FLOAT), SEP, or_(INT, FLOAT), eq('R'), or_(INT, DIAMETER_WITH_LETTER))) # TIRE_WIDTH WIDTH_PIPELINE = morph_pipeline(['шир', 'ширина', 'width', 'wid']) WIDTH = or_( rule(WIDTH_PIPELINE, SEP, or_(INT, FLOAT).interpretation(Tire.width), SEP),
# Grammar pieces for extracting education records (university name, year,
# specialization) with yargy.
from .helpers import load_lines, ID_TOKENIZER, select_span_tokens

# Facts produced by interpretation.
Date = fact('Date', ['month', 'year'])
Education = fact('Education', ['year', 'name', 'specialization'])
""" Creating dicts """
FOLDER = os.path.dirname(__file__)
# load_lines yields lines lazily (hence the _GEN suffix); materialize into
# a set for membership-style use.
UNI_NAMES_GEN = load_lines(os.path.join(FOLDER, 'dicts/VUZY.txt'))
UNI_NAMES = set(UNI_NAMES_GEN)
SPECIALIZATIONS = load_lines(os.path.join(FOLDER, 'dicts/specs_only.txt'))
# Case-insensitive match of any known specialization.
SPECIALIZATION = caseless_pipeline(SPECIALIZATIONS).interpretation(
    Education.specialization)
""" """
# Four-digit year in [1900, 2100], interpreted as int.
YEAR = and_(gte(1900), lte(2100)).interpretation(
    Date.year.custom(int)).interpretation(Education.year)
# University name: optionally prefixed by 'Филиал'/'филиал' (branch campus),
# matching inflected forms in that case, otherwise a caseless exact match.
UNI_NAME_RULE = rule(
    or_(rule(or_(eq('Филиал'), eq('филиал')), morph_pipeline(UNI_NAMES)),
        caseless_pipeline(UNI_NAMES))).interpretation(Education.name)
# Fallback name: up to five tokens that are neither 'имени' nor '.'.
ANON_COURSE_NAME = rule(
    and_(
        not_(eq('имени')),
        not_(eq('.')),
    ).repeatable(max=5)).interpretation(Education.name)
from yargy.interpretation import fact, attribute
from yargy.pipelines import caseless_pipeline

# Class-type fact; 'name' defaults to 'Практическое занятие'.
Type = fact('Type', [attribute('name', 'Практическое занятие')])

# Canonical class-type name -> the abbreviations that denote it in timetables.
_ABBREVIATIONS = {
    'Лабораторная': ('лаб', 'лаб.'),
    'Практическое занятие': ('пр.з.', 'пр. з.'),
    'Лекция': ('лек.', 'лек', 'лекц.', 'лекции'),
}

# Flat abbreviation -> canonical-name lookup table (same pairs and same
# insertion order as the original flat literal).
TYPES = {
    abbr: full_name
    for full_name, abbrs in _ABBREVIATIONS.items()
    for abbr in abbrs
}

# Match any abbreviation case-insensitively and resolve it through TYPES.
# NOTE(review): a caseless match may yield a token whose case differs from
# the TYPES keys, in which case TYPES.get returns None — verify upstream.
TYPE = caseless_pipeline(TYPES).interpretation(
    Type.name.custom(TYPES.get)
).interpretation(Type)
length_grt(1)).interpretation(Name.last.custom(str.capitalize)) FIRST = and_(gram('Name'), length_grt(1)).interpretation(Name.first.custom(str.capitalize)) MIDDLE = and_(gram('Patr'), length_grt(1)).interpretation(Name.middle.custom(str.capitalize)) ABBR = and_(length_eq(1), is_capitalized()) FIRST_ABBR = ABBR.interpretation(Name.first.custom(str.upper)) MIDDLE_ABBR = ABBR.interpretation(Name.middle.custom(str.upper)) unsubstantial = {'Бен Режеб Т.Б.К.'} UNSUBSTANIAL = caseless_pipeline(unsubstantial).interpretation(Name) NAME = or_( rule(UNSUBSTANIAL), rule(LAST, DOT, FIRST_ABBR, DOT, MIDDLE_ABBR, DOT), rule(LAST, FIRST_ABBR, DOT, MIDDLE_ABBR, DOT), rule(FIRST_ABBR, DOT, MIDDLE_ABBR, DOT, LAST), rule(LAST, FIRST_ABBR, DOT), rule(FIRST_ABBR, DOT, LAST), rule(FIRST, MIDDLE, LAST), rule(LAST, FIRST, MIDDLE), rule(FIRST, MIDDLE), ).interpretation(Name) """ degrees = { 'ст.преп.': 'старший перподаватель',
# Path of the plain-text subjects dictionary shipped next to this module.
subjectsPath = os.path.join(os.path.dirname(__file__), 'subjects')

# Map every known spelling (alias or full name) to the full subject name.
# File format: one subject per line, either 'full name' or 'alias \ full name'.
# Encoding is not specified — relies on the platform default; TODO confirm
# the dictionary file's encoding and pass encoding= explicitly.
subjects = {}
# Fix: the original called open() without ever closing the handle; the
# with-statement releases it deterministically.
with open(subjectsPath, 'r') as _subjects_file:
    _lines = set(_subjects_file.read().lower().strip().split('\n')) - {''}
for subj in _lines:
    subj = subj.strip()
    parts = subj.split('\\')
    if len(parts) == 2:
        # 'alias \ full' entry: register the alias -> full-name mapping.
        subj = parts[1].strip()
        subjects[parts[0].strip()] = subj
    # Every full name also maps to itself so lookups are uniform.
    subjects[subj] = subj

if __name__ == '__main__':
    # Round-trip dump of the dictionary in the source file's own format.
    for name in subjects:
        if name == subjects[name]:
            print(name)
        else:
            print(name, '\\', subjects[name])

# Grammar: match any known spelling case-insensitively and normalise it to
# the full subject name via subjects.get.
SUBJECT = rule(
    caseless_pipeline(subjects).interpretation(
        Subject.name.normalized().custom(
            subjects.get))).interpretation(Subject)