def sections(self, segment='all'):
    """Locate section columns in the curriculum .docx tables.

    attributes: <all> <themes> <lectures> <practices> <srs>

    Scans every cell of every table in ``self.docxdoc`` with four
    pre-built yargy parsers and, on a qualifying hit, delegates to
    ``self.lectures(table, column)``.
    """
    # One parser per section kind; the rules come from the instance.
    themes = Parser(self.section_rule)
    lectures = Parser(self.lectures_rule)
    practices = Parser(self.pract_rule)
    srs = Parser(self.srs_rule)
    found = False
    for table in self.docxdoc.tables:
        for column in table.columns:
            for cell in column.cells:
                # per-cell counter of theme matches
                index = 0
                if segment == 'all' or segment == 'themes':
                    cell_search_themes = themes.findall(cell.text)
                    for each in cell_search_themes:
                        index += 1
                        # more than two theme hits in one cell marks the column
                        if index > 2:
                            return self.lectures(table, column)
                        if segment != 'all':
                            found = True
                            print("this is theme")
                            break
                if segment == 'all' or segment == 'lectures':
                    cell_search_lectures = lectures.findall(cell.text)
                    for each in cell_search_lectures:
                        return self.lectures(table, column)
                        # NOTE(review): unreachable after the unconditional
                        # return above — confirm original intent.
                        if segment != 'all':
                            found = True
                            print("ЛЕКЦИИ")
                            break
                if segment == 'all' or segment == 'practices':
                    cell_search_practices = practices.findall(cell.text)
                    for each in cell_search_practices:
                        return self.lectures(table, column)
                        # NOTE(review): unreachable after the return above.
                        if segment != 'all':
                            found = True
                            print("практика")
                            break
                if segment == 'all' or segment == 'srs':
                    cell_search_srs = srs.findall(cell.text)
                    for each in cell_search_srs:
                        return self.lectures(table, column)
                        # NOTE(review): unreachable; message also duplicates
                        # the practices branch — confirm original intent.
                        if segment != 'all':
                            found = True
                            print("практика")
                            break
            # propagate the break out of the column loop
            if found:
                break
        # propagate the break out of the table loop
        if found:
            break
def test_pipeline():
    """Exercise pipeline, caseless_pipeline and morph_pipeline grammars."""
    grammar = rule(pipeline(['a b c', 'b c']), 'd')
    checker = Parser(grammar)
    assert checker.match('b c d')
    assert checker.match('a b c d')

    grammar = rule(pipeline(['a b']).repeatable(), 'c')
    checker = Parser(grammar)
    assert checker.match('a b a b c')

    grammar = rule(caseless_pipeline(['A B']), 'c')
    checker = Parser(grammar)
    assert checker.match('A b c')

    grammar = morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ])
    checker = Parser(grammar)

    hits = list(checker.findall('текстом песни музыкальной группы'))
    assert len(hits) == 1
    assert [token.value for token in hits[0].tokens] == ['текстом', 'песни']

    hits = list(checker.findall('информационного материала под названием'))
    assert len(hits) == 1
    assert [token.value for token in hits[0].tokens] == ['информационного', 'материала']

    grammar = morph_pipeline(['1 B.'])
    checker = Parser(grammar)
    assert checker.match('1 b .')
class OGRNExtractor:
    """Extract OGRN / OGRNIP registration numbers from free-form text."""

    def __init__(self, logger=None, env='local'):
        """Build parsers and (optionally) a rotating-file logger.

        logger: pre-configured logger to reuse; when None a DEBUG logger
            writing to ogrn_extractor.log is created.
        env: environment label, stored as-is.
        """
        self.env = env
        if logger is None:
            self.logger = logging.getLogger("OGRNExtractor")
            self.logger.setLevel(logging.DEBUG)
            # Guard against attaching duplicate handlers when several
            # extractors are constructed in the same process.
            if not self.logger.handlers:
                handler = RotatingFileHandler("ogrn_extractor.log", mode='a',
                                              encoding='utf-8', backupCount=5,
                                              maxBytes=1 * 1024 * 1024)
                formatter = logging.Formatter(
                    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
                handler.setFormatter(formatter)
                self.logger.addHandler(handler)
        else:
            self.logger = logger
        self.tokenizer = MorphTokenizer()
        OGRN = morph_pipeline([
            'огрн',
            'основной государственный регистрационный номер',
            'огрнип'
        ])
        # yargy token-type predicate: any integer token
        INT = type('INT')
        OGRN_NUMBER = rule(OGRN, INT)
        self.full_ogrn_parser = Parser(OGRN_NUMBER)
        self.ogrn_num_parser = Parser(rule(INT))

    def preprocess(self, line):
        """Flatten newlines and normalize typographic left double quotes.

        NOTE(review): the original quote literal was mangled in transit
        (rendered as three ASCII quotes, a syntax error); \u201c restores
        the most plausible original — confirm against upstream data.
        """
        line = line.replace("\n", " ").replace("\u201c", "\"")
        return line

    def extract(self, line):
        """Return the unique integer tokens that follow an OGRN keyword."""
        line = self.preprocess(line)
        matches = list(self.full_ogrn_parser.findall(line))
        spans = [_.span for _ in matches]
        result = []
        for span in spans:
            match = line[span.start:span.stop]
            # second pass: pull just the numeric token out of the match
            int_matches = list(self.ogrn_num_parser.findall(match))
            int_spans = [_.span for _ in int_matches]
            for int_span in int_spans:
                int_match = match[int_span.start:int_span.stop]
                result.append(int_match)
        # de-duplicate; order is not preserved
        result = list(set(result))
        return result

    def show_tokens(self, line):
        """Tokenize *line* after the same normalization as preprocess()."""
        line = line.replace("\n", " ").replace("\u201c", "\"")
        return list(self.tokenizer(line))
def find(self, tokens):
    """Two-pass search: keep only the tokens covered by WORKPLACE_ELEM
    matches, then run the full WORKPLACE grammar over that reduced stream."""
    elem_parser = Parser(self.WORKPLACE_ELEM, tokenizer=ID_TOKENIZER)
    elem_spans = [hit.span for hit in elem_parser.findall(tokens)]
    reduced = list(select_span_tokens(tokens, elem_spans))
    full_parser = Parser(self.WORKPLACE, tokenizer=ID_TOKENIZER)
    return list(full_parser.findall(reduced))
def find(tokens):
    """Narrow *tokens* to HOBBY_ITEMS matches, then parse HOBBIES over them."""
    item_parser = Parser(HOBBY_ITEMS, tokenizer=ID_TOKENIZER)
    item_spans = [hit.span for hit in item_parser.findall(tokens)]
    narrowed = list(select_span_tokens(tokens, item_spans))
    hobby_parser = Parser(HOBBIES, tokenizer=ID_TOKENIZER)
    return list(hobby_parser.findall(narrowed))
def find_feature(feature, RULE, RULE2, space=(40, 40)):
    """Set ``dict_symp[feature]`` to 1/0 depending on whether RULE2 matches
    inside a character window around the last RULE match in the global
    ``text``.

    space: (left, right) half-widths of the window around the end of the
        last primary match.  Changed from a mutable list default to an
        immutable tuple (shared-mutable-default pitfall); indexing
        behaviour is unchanged for callers that pass lists.
    """
    parser = Parser(RULE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if lst:
        # window is anchored on the *last* match left over from the loop
        add_text = text[list(match.span)[1] - space[0]:list(match.span)[1] + space[1]]
        parser = Parser(RULE2)
        lst = []
        for match in parser.findall(add_text):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if lst:
            dict_symp[feature] = 1
        else:
            dict_symp[feature] = 0
    # NOTE(review): when RULE never matches, dict_symp[feature] is left
    # untouched — confirm callers expect that.
def test(rule, *lines, tree=False, facts=False):
    """Run *rule* over each line and render matched vs expected spans.

    Each element of *lines* is either a bare string (no expected matches)
    or a sequence ``(text, substr, ...)`` where every substr is located
    via ``find`` and marked as an expected span.  Blue = expected hit,
    red = unexpected hit, grey = missed expectation.
    """
    is_at = lambda span, set: any((span == s) for s in set)
    parser = Parser(rule)
    for line in lines:
        if isinstance(line, str):
            text, expected = line, []
        else:
            text = line[0]
            expected = [find(text, substr) for substr in line[1:]]
        matches = list(sorted(parser.findall(text), key=lambda _: _.span))
        matched_spans = [_.span for _ in matches]
        spans = [(s[0], s[1], '#aec7e8' if is_at(s, expected) else '#ff9896')
                 for s in matched_spans] \
            + [(s[0], s[1], '#ccc')
               for s in expected
               if not is_at((s[0], s[1]), matched_spans)]
        show_markup(text, [s for s in spans if s[0] < s[1]], LineMarkup)
        if matches:
            for _ in matches:
                if tree:
                    # BUG FIX: previously rendered matches[0].tree on every
                    # iteration; show each match's own parse tree instead.
                    display(_.tree.as_dot)
                if facts:
                    display(_.fact)
class ConsoleGame:
    """yargy-based extractor for mentions of a single console game."""

    __game = fact(
        'Game',
        ['name', 'version_number', 'version_name', 'console']
    )
    # Class-level accumulator shared by ALL instances (original behaviour;
    # every matches() call appends its batch here).
    __amount_of_games = []

    def __init__(self, names: list = None, version_numbers: list = None,
                 version_names: list = None, consoles: list = None):
        """Build the parser for one game.

        names: phrases naming the game; the first entry is used as the
            canonical name (an empty list raises IndexError, as before).
        Defaults changed from mutable ``[]`` literals to None sentinels to
        avoid the shared-mutable-default pitfall; observable behaviour is
        unchanged.
        """
        names = [] if names is None else names
        version_numbers = [] if version_numbers is None else version_numbers
        version_names = [] if version_names is None else version_names
        consoles = [] if consoles is None else consoles
        rules = rule(
            morph_pipeline(names).interpretation(self.__game.name.const(names[0])),
            morph_pipeline(version_numbers).interpretation(self.__game.version_number).optional(),
            morph_pipeline(version_names).interpretation(self.__game.version_name).optional(),
            morph_pipeline(consoles).interpretation(self.__game.console).optional())
        game = or_(rules).interpretation(self.__game)
        self.parser = Parser(game)

    def matches(self, data):
        """Collect Game facts from the first 9000 sentences of ``data.text``
        and print them plus the running batch count."""
        matches = []
        for sent in data.text[:9000]:
            for match in self.parser.findall(sent):
                matches.append(match.fact)
        self.__amount_of_games.append(matches)
        for m in matches:
            print(m.name, m.version_number, m.version_name, m.console)
        print(len(self.__amount_of_games))
def test_samples(rules: Union[NamedRule, List[NamedRule]],
                 texts: List[str],
                 num: int = 20,
                 seed: Union[int, None] = None,
                 markup=None,
                 fact=False):
    """Run one or more named rules over a random sample of *texts* and
    render the matches; optionally display each match's fact.

    seed: forwarded to ``random.seed`` for a reproducible sample
        (annotation fixed: None is a legal value).
    """
    from random import seed as set_seed, sample
    set_seed(seed)
    # Sample only when there are more texts than requested.
    texts, num = (texts, len(texts)) if len(texts) < num else (sample(texts, num), num)
    results: Dict[int, Dict[int, List]] = defaultdict(dict)
    if not isinstance(rules, (list, tuple)):
        rules = [rules]
    # 'grammar' avoids shadowing the yargy ``rule`` helper.
    for rule_idx, grammar in enumerate(rules):
        parser = Parser(grammar)
        for text_idx in range(num):
            results[text_idx][rule_idx] = list(parser.findall(texts[text_idx]))
    for text_idx, rule_matches in results.items():
        spans = [(m.span[0], m.span[1], str(rules[rule_idx].name))
                 for rule_idx, matches in rule_matches.items()
                 for m in matches]
        show_markup(texts[text_idx], spans, markup or BoxLabelMarkup)
        if fact:
            for rule_idx, matches in rule_matches.items():
                for m in matches:
                    display(m.fact)
def show_matches(rule, *lines):
    """Highlight every match of *rule* in each of *lines*."""
    parser = Parser(rule)
    for text in lines:
        hits = parser.findall(text)
        show_markup(text, [hit.span for hit in hits])
class Extractor(object):
    """Callable wrapper: normalize the input text, then run a yargy parser
    over it and package the result."""

    def __init__(self, rule, tokenizer=TOKENIZER, tagger=None):
        # The parser is built once and reused for every call.
        self.parser = Parser(rule, tokenizer=tokenizer, tagger=tagger)

    def __call__(self, text):
        normalized = normalize_text(text)
        return Matches(normalized, self.parser.findall(normalized))
class Extractor(object):
    """Normalize text and extract matches with a pre-built parser."""

    def __init__(self, rule, tokenizer=TOKENIZER, tagger=None):
        self.parser = Parser(rule, tokenizer=tokenizer, tagger=tagger)

    def __call__(self, text):
        cleaned = normalize_text(text)
        found = self.parser.findall(cleaned)
        return Matches(cleaned, found)
def test_tagger():
    """MyTagger should produce 'I'-tag runs covering 'b c' and 'e f'."""
    text = 'a b c d e f g'
    grammar = tag('I').repeatable()
    parser = Parser(grammar, tagger=MyTagger())
    found_spans = [hit.span for hit in parser.findall(text)]
    assert [text[lo:hi] for lo, hi in found_spans] == ['b c', 'e f']
def find_factors(factor_types):
    """Scan the global ``text`` for each factor group.

    Appends every match span to the global ``factors_span`` and, for each
    group with at least one hit, its 1-based id to the global ``factors``.
    """
    # enumerate() replaces the index-into-list loop; start=1 preserves the
    # original ``i + 1`` group ids.
    for factor_id, phrases in enumerate(factor_types, start=1):
        factor_lst = []
        FACT_RULE = morph_pipeline(phrases)
        parser = Parser(FACT_RULE)
        for match in parser.findall(text):
            factor_lst.append(' '.join([_.value for _ in match.tokens]))
            factors_span.append(match.span)
        if factor_lst:
            factors.append(factor_id)
def totalNERPersons(self, string):
    """Extract middle, first and last person-name matches from *string*."""
    def run(rules):
        # Build a parser for the rule set and post-process its matches.
        return self.parseNameMatches(list(Parser(rules).findall(string)))

    res = PersonsResult()
    res.middle = run(persons_rules.get_mid_rules())
    res.first = run(persons_rules.get_first_rules())
    res.last = run(persons_rules.get_second_rules())
    return res
def test_pipeline():
    """Check exact, repeatable, caseless and morphological pipelines."""
    g = rule(pipeline(['a b c', 'b c']), 'd')
    p = Parser(g)
    assert p.match('b c d')
    assert p.match('a b c d')

    g = rule(pipeline(['a b']).repeatable(), 'c')
    p = Parser(g)
    assert p.match('a b a b c')

    g = rule(caseless_pipeline(['A B']), 'c')
    p = Parser(g)
    assert p.match('A b c')

    g = morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ])
    p = Parser(g)

    found = list(p.findall('текстом песни музыкальной группы'))
    assert len(found) == 1
    assert [t.value for t in found[0].tokens] == ['текстом', 'песни']

    found = list(p.findall('информационного материала под названием'))
    assert len(found) == 1
    assert [t.value for t in found[0].tokens] == ['информационного', 'материала']

    g = morph_pipeline(['1 B.'])
    p = Parser(g)
    assert p.match('1 b .')
class Extractor(Record):
    """Record-based extractor: normalize input, parse, wrap in Matches."""

    __attributes__ = ['parser']

    def __init__(self, rule, pipelines=()):
        self.parser = Parser(rule, pipelines=pipelines)

    def __call__(self, text):
        prepared = normalize_text(text)
        return Matches(prepared, self.parser.findall(prepared))
def findINN(text):
    """Return a list of ``{'num': ...}`` dicts, one per INN-org match in
    *text*, or None (implicit) when nothing matched — original falsy
    contract preserved for callers that test truthiness.
    """
    import data.inn
    parser = Parser(data.inn.INNORG)
    # comprehension replaces the quadratic ``ret = ret + [z]`` accumulation
    ret = [{'num': m.fact.num} for m in parser.findall(text)]
    if ret:
        return ret
def test_tagger():
    """The custom tagger should yield 'I' runs over 'b c' and 'e f'."""
    sample = 'a b c d e f g'
    grammar = tag('I').repeatable()
    parser = Parser(grammar, tagger=MyTagger())
    pieces = []
    for hit in parser.findall(sample):
        start, stop = hit.span
        pieces.append(sample[start:stop])
    assert pieces == ['b c', 'e f']
def process_temperature(txt):
    """Parse TEMPERATURE matches out of *txt* into a list of dicts."""
    parser = Parser(TEMPERATURE)
    extracted = []
    for match in parser.findall(txt):
        extracted.append({
            'min': match.fact.min,
            'max': match.fact.max,
            'singular': match.fact.singular,
        })
    return extracted
def get_hyperonyms(main_word):
    """Print token sequences of hyperonym patterns found in the Wikipedia
    summary of *main_word*."""
    HYPONYM = eq(utils.deaccent(main_word))
    grammar = or_(
        rule(HYPONYM, ATAKJE, START, MID, END),
        rule(HYPONYM, MID, END),
        rule(START_S, END, KAK, HYPONYM),
        rule(END, INCLUDING, HYPONYM),
    )
    parser = Parser(grammar)
    summary = utils.deaccent(wikipedia.summary(main_word))
    print(summary)
    # strip parenthesized asides, then normalize case / drop a known artifact
    summary = re.sub(r'\(.+?\)', '', summary)
    summary = summary.lower().replace('* сергии радонежскии* ', '')
    for idx, match in enumerate(parser.findall(summary.lower())):
        values = [tok.value for tok in match.tokens]
        print(values)
def findNCONTRACT(text):
    """Return ``[{'num': ...}]`` for every contract-number match in *text*,
    or None (implicit) when there were no matches — original falsy
    contract preserved.
    """
    import data.ncont
    parser = Parser(data.ncont.NCONTRACT)
    # comprehension replaces the quadratic ``ret = ret + [z]`` pattern
    ret = [{'num': m.fact.num} for m in parser.findall(text)]
    if ret:
        return ret
def process_plant_height(txt):
    """Collect plant-height attributes from SIZE_SENTENCE matches."""
    parser = Parser(SIZE_SENTENCE)
    rows = []
    for hit in parser.findall(txt):
        f = hit.fact
        rows.append({
            'height_min': f.min_height,
            'height_max': f.max_height,
            'height_class': f.height_class,
            'height_measurement': f.measurement,
        })
    return rows
def get(self):
    """Return the first distinct city mentioned in ``self.text``, or ''."""
    parser = Parser(CITY_PARSER)
    cities = []
    for match in parser.findall(self.text):
        if match is not None:
            try:
                cities.append(match.fact.city)
            except KeyError:
                # fact carries no city — skip it
                pass
    deduped = list(dict.fromkeys(cities))
    return next(iter(deduped), '')
def get(self):
    """Return the de-duplicated list of genders found in ``self.text``."""
    parser = Parser(GENDER_PARSER)
    genders = []
    for match in parser.findall(self.text):
        if match is not None:
            try:
                genders.append(match.fact.gender)
            except KeyError:
                # fact carries no gender — skip it
                pass
    return list(dict.fromkeys(genders))
def extract_AH(record):
    """Return True/False if the 'Диагноз' field mentions arterial
    hypertension, or None when the field is absent.

    record: mapping with an optional 'Диагноз' (diagnosis) text entry.
    """
    # idiomatic membership test replaces ``not 'x' in record.keys()``
    if 'Диагноз' not in record:
        return None
    AH_predator = morph_pipeline([
        'артериальная гипертония',
        'артериальная гипертензия',
        'гипертоническая болезнь',
        'АГ',
        'ГБ'
    ])
    parser_ee = Parser(AH_predator)
    line = record['Диагноз']
    matches = list(parser_ee.findall(line))
    # boolean expression replaces the if/return True/return False ladder
    return len(matches) > 0
def get(self):
    """Return the de-duplicated list of content types in ``self.text``."""
    parser = Parser(CONTENT_TYPE_PARSER)
    types = []
    for match in parser.findall(self.text):
        if match is not None:
            try:
                types.append(match.fact.contenttype)
            except KeyError:
                # fact carries no content type — skip it
                pass
    return list(dict.fromkeys(types))
def findDATECONT(text):
    """Return ``[{'day', 'mounth', 'year'}]`` dicts for contract-date
    matches, or None (implicit) when nothing matched.

    NOTE(review): the key 'mounth' and attribute 'mouth' are misspelled
    exactly as in the original fact definition — kept for compatibility
    with existing callers and the fact schema.
    """
    import data.datecont
    parser = Parser(data.datecont.DATECONT)
    # comprehension replaces the quadratic ``ret = ret + [z]`` pattern
    ret = [{'day': m.fact.day, 'mounth': m.fact.mouth, 'year': m.fact.year}
           for m in parser.findall(text)]
    if ret:
        return ret
def findNCOASTCASE(text):
    """Return ``[{'first', 'second', 'third'}]`` dicts for every coast-case
    number match, or None (implicit) when nothing matched — original
    falsy contract preserved.
    """
    import data.ncoast
    parser = Parser(data.ncoast.NCOASTCASE)
    # comprehension replaces the quadratic ``ret = ret + [z]`` pattern
    ret = [{'first': m.fact.first, 'second': m.fact.second, 'third': m.fact.third}
           for m in parser.findall(text)]
    if ret:
        return ret
def parse(self, file: Path, bracket: str):
    """Read *file* line by line, storing each line and every ATTRIBUTE
    token found in it into ``self.goods``, then de-duplicate.

    file: path to a UTF-8 text file.
    bracket: separator between name and description — currently unused;
        kept for interface compatibility.
    """
    parser = Parser(ATTRIBUTE)
    # open a distinct handle instead of shadowing the *file* parameter
    with file.open("r", encoding='utf-8') as fh:
        for line in fh:
            line = line.replace('\n', '')
            self.goods.append(line)
            for match in parser.findall(line):
                for token in match.tokens:
                    self.goods.append(
                        line[token.span.start:token.span.stop])
    # drop duplicates (order is not preserved, as before)
    self.goods = list(set(self.goods))
def process_flower(txt):
    """Collect flower attributes from FLOWER matches in *txt*."""
    parser = Parser(FLOWER)
    collected = []
    for hit in parser.findall(txt):
        f = hit.fact
        collected.append({
            'color': f.color,
            'size': f.size,
            'size_min_diam': f.size_diam_min,
            'size_max_diam': f.size_diam_max,
            'measurement': f.measurement,
            'other': f.other,
        })
    return collected
def find_side(parser, sidetext):
    """If *parser* matches anywhere in *sidetext*, classify the text
    against every entry of the global ``rozha_types`` and return the list
    of matched type ids ('2.1' for index 15, 1-based ints otherwise).
    """
    rozha = []
    primary_hits = []
    for match in parser.findall(sidetext):
        primary_hits.append((match.span, [_.value for _ in match.tokens]))
    if primary_hits:
        # enumerate() replaces the range(len(...)) loop
        for i, phrases in enumerate(rozha_types):
            # fresh parser per type; no longer clobbers the *parser* argument
            type_parser = Parser(morph_pipeline(phrases))
            rozha_lst = []
            for match in type_parser.findall(sidetext):
                rozha_lst.append(' '.join([_.value for _ in match.tokens]))
            if rozha_lst:
                # index 15 maps to the special subtype label '2.1'
                if i == 15:
                    rozha.append('2.1')
                else:
                    rozha.append(i + 1)
    return rozha
def getTags(text, tag_list):
    """Return the normal forms of tags from *tag_list* mentioned in *text*.

    Returns {} (not a list) when *text* is None — original, inconsistent
    contract preserved for existing callers.
    """
    # identity comparison replaces ``text == None``
    if text is None:
        return {}
    RULE = morph_pipeline(tag_list)
    mentioned_tags = []
    parser = Parser(RULE)
    morph = pymorphy2.MorphAnalyzer()
    for match in parser.findall(text):
        try:
            value = match.tokens[0].value
            normalized_value = morph.parse(value)[0].normal_form
            if normalized_value not in mentioned_tags:
                mentioned_tags.append(normalized_value)
        except Exception:
            # narrowed from a bare ``except:``; best-effort — skip tokens
            # pymorphy cannot handle, as the original did
            print('Salary parser error')
    return mentioned_tags