Example #1
    def sections(self, segment='all'):
        """
        Supported values for `segment`:
        'all', 'themes', 'lectures', 'practices', 'srs'
        """
        themes = Parser(self.section_rule)
        lectures = Parser(self.lectures_rule)
        practices = Parser(self.pract_rule)
        srs = Parser(self.srs_rule)

        for table in self.docxdoc.tables:
            for column in table.columns:
                for cell in column.cells:
                    if segment in ('all', 'themes'):
                        # A column is treated as a themes section only when
                        # the cell text yields more than two rule matches.
                        if sum(1 for _ in themes.findall(cell.text)) > 2:
                            print('this is a theme')
                            return self.lectures(table, column)

                    if segment in ('all', 'lectures'):
                        if any(lectures.findall(cell.text)):
                            print('lectures')
                            return self.lectures(table, column)

                    if segment in ('all', 'practices'):
                        if any(practices.findall(cell.text)):
                            print('practices')
                            return self.lectures(table, column)

                    if segment in ('all', 'srs'):
                        if any(srs.findall(cell.text)):
                            print('srs')
                            return self.lectures(table, column)
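The method above depends on attributes its class defines elsewhere (self.docxdoc, self.section_rule and friends). A minimal sketch of what that host class might look like, assuming python-docx for the document and yargy morph pipelines for the rules; every name below is hypothetical:

import docx  # python-docx
from yargy import Parser
from yargy.pipelines import morph_pipeline

class SyllabusDocument:
    """Hypothetical host class for the sections() method above."""

    def __init__(self, path):
        self.docxdoc = docx.Document(path)
        # Illustrative keyword rules; the real ones are not shown.
        self.section_rule = morph_pipeline(['тема'])
        self.lectures_rule = morph_pipeline(['лекция'])
        self.pract_rule = morph_pipeline(['практика'])
        self.srs_rule = morph_pipeline(['срс'])

    def lectures(self, table, column):
        ...  # section handler called by sections(); implementation not shown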
Example #2
def test_pipeline():
    RULE = rule(pipeline(['a b c', 'b c']), 'd')
    parser = Parser(RULE)
    assert parser.match('b c d')
    assert parser.match('a b c d')

    RULE = rule(pipeline(['a b']).repeatable(), 'c')
    parser = Parser(RULE)
    assert parser.match('a b a b c')

    RULE = rule(caseless_pipeline(['A B']), 'c')
    parser = Parser(RULE)
    assert parser.match('A b c')

    RULE = morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ])
    parser = Parser(RULE)
    matches = list(parser.findall('текстом песни музыкальной группы'))
    assert len(matches) == 1
    match = matches[0]
    assert [_.value for _ in match.tokens] == ['текстом', 'песни']

    matches = list(parser.findall('информационного материала под названием'))
    assert len(matches) == 1
    match = matches[0]
    assert [_.value for _ in match.tokens] == ['информационного', 'материала']

    RULE = morph_pipeline(['1 B.'])
    parser = Parser(RULE)
    assert parser.match('1 b .')
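The test above relies on yargy's public API; the imports it assumes are:

from yargy import Parser, rule
from yargy.pipelines import pipeline, caseless_pipeline, morph_pipeline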
Example #3
import logging
from logging.handlers import RotatingFileHandler

from yargy import Parser, rule
from yargy.pipelines import morph_pipeline
from yargy.predicates import type  # yargy's token-type predicate (shadows the builtin)
from yargy.tokenizer import MorphTokenizer

class OGRNExtractor:
    def __init__(self, logger=None, env='local'):

        self.env = env

        if logger is None:
            self.logger = logging.getLogger("OGRNExtractor")
            self.logger.setLevel(logging.DEBUG)
            handler = RotatingFileHandler("ogrn_extractor.log",
                                          mode='a',
                                          encoding='utf-8',
                                          backupCount=5,
                                          maxBytes=1 * 1024 * 1024)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        else:
            self.logger = logger

        self.tokenizer = MorphTokenizer()

        OGRN = morph_pipeline([
            'огрн', 'основной государственный регистрационный номер', 'огрнип'
        ])

        INT = type('INT')

        OGRN_NUMBER = rule(OGRN, INT)

        self.full_ogrn_parser = Parser(OGRN_NUMBER)
        self.ogrn_num_parser = Parser(rule(INT))

    def preprocess(self, line):
        line = line.replace("\n", " ").replace("&quot;", "\"")
        return line

    def extract(self, line):
        line = self.preprocess(line)

        matches = list(self.full_ogrn_parser.findall(line))
        spans = [_.span for _ in matches]

        result = []
        for span in spans:
            match = line[span.start:span.stop]
            int_matches = list(self.ogrn_num_parser.findall(match))
            int_spans = [_.span for _ in int_matches]
            for int_span in int_spans:
                int_match = match[int_span.start:int_span.stop]
                result.append(int_match)

        result = list(set(result))
        return result

    def show_tokens(self, line):
        line = line.replace("\n", " ").replace("&quot;", "\"")
        return list(self.tokenizer(line))
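A minimal usage sketch for the extractor above; the sample string and the expected result are illustrative only:

extractor = OGRNExtractor()
print(extractor.extract('ООО "Пример", ОГРН 1027700132195'))
# expected: ['1027700132195']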
Example #4
    def find(self, tokens):
        parser = Parser(self.WORKPLACE_ELEM, tokenizer=ID_TOKENIZER)
        matches = parser.findall(tokens)
        spans = [_.span for _ in matches]

        tokens = list(select_span_tokens(tokens, spans))
        # print([_.value for _ in tokens])

        parser = Parser(self.WORKPLACE, tokenizer=ID_TOKENIZER)

        matches = list(parser.findall(tokens))
        return matches
Example #5
    def find(tokens):
        parser = Parser(HOBBY_ITEMS, tokenizer=ID_TOKENIZER)
        matches = parser.findall(tokens)
        spans = [_.span for _ in matches]

        tokens = list(select_span_tokens(tokens, spans))
        # print([_.value for _ in tokens])

        parser = Parser(HOBBIES, tokenizer=ID_TOKENIZER)

        matches = list(parser.findall(tokens))
        return matches
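Examples #4 and #5 both rely on two helpers that are not shown: ID_TOKENIZER (presumably a pass-through tokenizer that lets Parser accept pre-tokenized input) and select_span_tokens. A plausible sketch of the latter, assuming yargy tokens and spans with start/stop attributes:

def select_span_tokens(tokens, spans):
    # Yield only the tokens that fall inside one of the matched spans.
    for token in tokens:
        if any(span.start <= token.span.start and token.span.stop <= span.stop
               for span in spans):
            yield token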
Example #6
def find_feature(feature, RULE, RULE2, space=(40, 40)):
    # `text` and `dict_symp` are globals defined elsewhere in the source.
    parser = Parser(RULE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if lst:
        # Search for RULE2 inside a window around the end of the last match
        # of RULE; `space` gives the characters kept before and after it.
        add_text = text[match.span.stop - space[0]:match.span.stop + space[1]]
        parser = Parser(RULE2)
        lst = []
        for match in parser.findall(add_text):
            lst.append((match.span, [_.value for _ in match.tokens]))
        dict_symp[feature] = 1 if lst else 0
Example #7
def test(rule, *lines, tree=False, facts=False):
    is_at = lambda span, spans: any(span == s for s in spans)
    parser = Parser(rule)

    for line in lines:
        if isinstance(line, str):
            text, expected = line, []
        else:
            text = line[0]
            expected = [find(text, substr) for substr in line[1:]]

        matches = sorted(parser.findall(text), key=lambda _: _.span)
        # display(matches)
        matched_spans = [_.span for _ in matches]
        spans = [(s[0], s[1], '#aec7e8' if is_at(s, expected) else '#ff9896')
                 for s in matched_spans] \
                + [(s[0], s[1], '#ccc')
                   for s in expected if not is_at((s[0], s[1]), matched_spans)]

        show_markup(text, [s for s in spans if s[0] < s[1]], LineMarkup)

        for _ in matches:
            if tree:
                display(_.tree.as_dot)
            if facts:
                display(_.fact)
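The find helper used above is not shown (show_markup and LineMarkup come from an older ipymarkup release, display from IPython). A plausible definition of find, returning the (start, stop) span of a substring:

def find(text, substr):
    # Hypothetical helper: span of the first occurrence of substr in text.
    start = text.index(substr)
    return start, start + len(substr)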
Example #8
class ConsoleGame:
    __game = fact(
        'Game',
        ['name', 'version_number', 'version_name', 'console']
    )
    __amount_of_games = []

    def __init__(self, names=(), version_numbers=(), version_names=(), consoles=()):
        rules = rule(morph_pipeline(names).interpretation(self.__game.name.const(names[0])),
                     morph_pipeline(version_numbers).interpretation(self.__game.version_number).optional(),
                     morph_pipeline(version_names).interpretation(self.__game.version_name).optional(),
                     morph_pipeline(consoles).interpretation(self.__game.console).optional())
        game = or_(rules).interpretation(self.__game)
        self.parser = Parser(game)

    def matches(self, data):
        matches = []

        for sent in data.text[:9000]:
            for match in self.parser.findall(sent):
                matches.append(match.fact)
                # tally every game mention seen so far (class-level counter)
                self.__amount_of_games.append(match.fact)

        for m in matches:
            print(m.name, m.version_number, m.version_name, m.console)

        print(len(self.__amount_of_games))
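A hypothetical invocation of the class above; matches() implies that its argument exposes a .text attribute holding an iterable of sentences, so a stand-in is defined here:

class Data:
    text = ['купил the legend of zelda breath of the wild на nintendo switch']

game = ConsoleGame(
    names=['the legend of zelda', 'зельда'],
    version_numbers=['2'],
    version_names=['breath of the wild'],
    consoles=['nintendo switch', 'switch'],
)
game.matches(Data())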
Example #9
def test_samples(rules: Union[NamedRule, List[NamedRule]],
                 texts: List[str],
                 num: int = 20,
                 seed: Optional[int] = None,
                 markup=None,
                 fact=False):
    from random import seed as sed, sample

    sed(seed)
    if len(texts) < num:
        num = len(texts)
    else:
        texts = sample(texts, num)
    results: Dict[int, Dict[int, List]] = defaultdict(dict)

    if not (isinstance(rules, list) or isinstance(rules, tuple)):
        rules = [rules]

    for rule_idx, rule in enumerate(rules):
        parser = Parser(rule)

        for text_idx in range(num):
            matches = parser.findall(texts[text_idx])
            results[text_idx][rule_idx] = list(matches)

    for text_idx, rule_matches in results.items():
        spans = [(m.span[0], m.span[1], str(rules[rule_idx].name))
                 for rule_idx, matches in rule_matches.items()
                 for m in matches]

        show_markup(texts[text_idx], spans, markup or BoxLabelMarkup)

        if fact:
            for rule_idx, matches in rule_matches.items():
                for m in matches:
                    display(m.fact)
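The helper above assumes several imports; the standard-library ones are certain, while the rest vary by version and are only noted:

from collections import defaultdict
from typing import Dict, List, Optional, Union
# Also required: Parser and NamedRule from yargy, display from IPython,
# and show_markup / BoxLabelMarkup from an older ipymarkup release.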
Example #10
def show_matches(rule, *lines):
    parser = Parser(rule)
    for line in lines:
        matches = parser.findall(line)
        spans = [_.span for _ in matches]

        show_markup(line, spans)
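A short usage sketch for show_matches; the rule is illustrative and show_markup is assumed to come from ipymarkup:

from yargy import rule
from yargy.predicates import eq

show_matches(rule(eq('a'), eq('b')), 'a b c', 'x a b y')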
Example #11
class Extractor(object):
    def __init__(self, rule, tokenizer=TOKENIZER, tagger=None):
        self.parser = Parser(rule, tokenizer=tokenizer, tagger=tagger)

    def __call__(self, text):
        text = normalize_text(text)
        matches = self.parser.findall(text)
        return Matches(text, matches)
Example #13
def test_tagger():
    text = 'a b c d e f g'
    A = tag('I').repeatable()
    parser = Parser(A, tagger=MyTagger())

    matches = parser.findall(text)
    spans = [_.span for _ in matches]
    substrings = [text[start:stop] for start, stop in spans]
    assert substrings == ['b c', 'e f']
Example #14
def find_factors(factor_types):
    # `text`, `factors` and `factors_span` are globals defined elsewhere
    # in the source.
    for i in range(len(factor_types)):
        factor_lst = []
        FACT_RULE = morph_pipeline(factor_types[i])
        parser = Parser(FACT_RULE)
        for match in parser.findall(text):
            factor_lst.append(' '.join(_.value for _ in match.tokens))
            factors_span.append(match.span)
        if factor_lst:
            factors.append(i + 1)
Example #15
    def totalNERPersons(self, string):
        res = PersonsResult()

        rules = persons_rules.get_mid_rules()
        middleParser = Parser(rules)
        middle = list(middleParser.findall(string))
        res.middle = self.parseNameMatches(middle)

        rules = persons_rules.get_first_rules()
        firstParser = Parser(rules)
        first = list(firstParser.findall(string))
        res.first = self.parseNameMatches(first)

        rules = persons_rules.get_second_rules()
        lastParser = Parser(rules)
        last = list(lastParser.findall(string))
        res.last = self.parseNameMatches(last)

        return res
Example #17
class Extractor(Record):
    __attributes__ = ['parser']

    def __init__(self, rule, pipelines=()):
        self.parser = Parser(rule, pipelines=pipelines)

    def __call__(self, text):
        text = normalize_text(text)
        matches = self.parser.findall(text)
        return Matches(text, matches)
Example #18
def findINN(text):
    import data.inn
    parser = Parser(data.inn.INNORG)
    ret = [{'num': match.fact.num} for match in parser.findall(text)]
    if ret:
        return ret
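The data.inn module is not shown. One plausible shape for INNORG, consistent with the .num attribute used above (entirely hypothetical):

from yargy import rule
from yargy.interpretation import fact
from yargy.predicates import caseless, type

Inn = fact('Inn', ['num'])
INNORG = rule(
    caseless('инн'),
    type('INT').interpretation(Inn.num),
).interpretation(Inn)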
Example #20
def process_temperature(txt):
    parser = Parser(TEMPERATURE)
    res = []
    for match in parser.findall(txt):
        res.append({
            'min': match.fact.min,
            'max': match.fact.max,
            'singular': match.fact.singular,
        })
    return res
Example #21
def get_hyperonyms(main_word):
    HYPONYM = eq(utils.deaccent(main_word))
    RULE = or_(rule(HYPONYM, ATAKJE, START, MID, END), rule(HYPONYM, MID, END),
               rule(START_S, END, KAK, HYPONYM), rule(END, INCLUDING, HYPONYM))
    parser = Parser(RULE)
    text = utils.deaccent(wikipedia.summary(main_word))
    print(text)
    text = re.sub(r'\(.+?\)', '', text)
    text = text.lower().replace('* сергии радонежскии* ', '')
    for idx, match in enumerate(parser.findall(text.lower())):
        k = [_.value for _ in match.tokens]
        print(k)
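Imports assumed by get_hyperonyms (utils.deaccent may be gensim.utils.deaccent, and the rule constants ATAKJE, START, START_S, MID, END, KAK and INCLUDING are defined elsewhere in the source):

import re
import wikipedia
from gensim import utils  # assumption: deaccent() is gensim's helper
from yargy import Parser, or_, rule
from yargy.predicates import eq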
Example #22
def findNCONTRACT(text):
    import data.ncont
    parser = Parser(data.ncont.NCONTRACT)
    ret = [{'num': match.fact.num} for match in parser.findall(text)]
    if ret:
        return ret
Example #23
def process_plant_height(txt):
    parser = Parser(SIZE_SENTENCE)
    res = []
    for match in parser.findall(txt):
        element = dict()
        element['height_min'] = match.fact.min_height
        element['height_max'] = match.fact.max_height
        element['height_class'] = match.fact.height_class
        element['height_measurement'] = match.fact.measurement
        
        res.append(element)
    
    return res
Example #24
    def get(self):
        result_list = []
        parser = Parser(CITY_PARSER)
        matches = list(parser.findall(self.text))
        for match in matches:
            if match is not None:
                try:
                    result_list.append(match.fact.city)
                except KeyError:
                    pass

        result_list = list(dict.fromkeys(result_list))
        return next(iter(result_list), '')
Example #25
    def get(self):
        result_list = []
        parser = Parser(GENDER_PARSER)
        matches = list(parser.findall(self.text))
        for match in matches:
            if match is not None:
                try:
                    result_list.append(match.fact.gender)
                except KeyError:
                    pass

        result_list = list(dict.fromkeys(result_list))
        return result_list
Example #26
def extract_AH(record):
    if 'Диагноз' not in record:
        return None
    AH_predator = morph_pipeline([
        'артериальная гипертония', 'артериальная гипертензия',
        'гипертоническая болезнь', 'АГ', 'ГБ'
    ])
    parser_ee = Parser(AH_predator)
    line = record['Диагноз']
    matches = list(parser_ee.findall(line))
    return len(matches) > 0
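A small usage sketch for extract_AH; the record is a dict whose 'Диагноз' (diagnosis) field holds free text, and the sample is illustrative:

record = {'Диагноз': 'Гипертоническая болезнь II стадии'}
print(extract_AH(record))  # should print True for this sample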
Example #27
    def get(self):
        result_list = []
        parser = Parser(CONTENT_TYPE_PARSER)
        matches = list(parser.findall(self.text))
        for match in matches:
            if match is not None:
                try:
                    result_list.append(match.fact.contenttype)
                except KeyError:
                    pass

        result_list = list(dict.fromkeys(result_list))
        return result_list
Example #28
def findDATECONT(text):
    import data.datecont
    parser = Parser(data.datecont.DATECONT)
    ret = []
    for match in parser.findall(text):
        i = match.fact
        # note: the DATECONT fact spells its month attribute 'mouth'
        ret.append({'day': i.day, 'month': i.mouth, 'year': i.year})
    if ret:
        return ret
Example #29
def findNCOASTCASE(text):
    import data.ncoast
    parser = Parser(data.ncoast.NCOASTCASE)
    ret = []
    for match in parser.findall(text):
        i = match.fact
        ret.append({'first': i.first, 'second': i.second, 'third': i.third})
    if ret:
        return ret
Example #30
    def parse(self, file: Path, bracket: str):
        # bracket: the character separating a name from its description
        with file.open("r", encoding='utf-8') as lines:
            parser = Parser(ATTRIBUTE)
            for line in lines:
                line = line.replace('\n', '')
                self.goods.append(line)
                # print(line)
                for match in parser.findall(line):
                    for token in match.tokens:
                        self.goods.append(
                            line[token.span.start:token.span.stop])

        # drop duplicates
        self.goods = list(set(self.goods))
Example #31
def process_flower(txt):
    parser = Parser(FLOWER)
    res = []
    for match in parser.findall(txt):
        element = dict()
        element['color'] = match.fact.color
        element['size'] = match.fact.size
        element['size_min_diam'] = match.fact.size_diam_min
        element['size_max_diam'] = match.fact.size_diam_max
        element['measurement'] = match.fact.measurement
        element['other'] = match.fact.other
        
        res.append(element)
        
    return res
Example #32
def find_side(parser, sidetext):
    # `rozha_types` is a global list of morph-pipeline keyword groups
    # defined elsewhere in the source.
    rozha = []
    lst = []
    for match in parser.findall(sidetext):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if lst:
        for i in range(len(rozha_types)):
            rozha_lst = []
            TYPE = morph_pipeline(rozha_types[i])
            parser = Parser(TYPE)
            for match in parser.findall(sidetext):
                rozha_lst.append(' '.join(_.value for _ in match.tokens))
            if rozha_lst:
                if i == 15:
                    rozha.append('2.1')
                else:
                    rozha.append(i + 1)
    return rozha
Example #33
def getTags(text, tag_list):
    if text is None:
        return []
    RULE = morph_pipeline(tag_list)
    mentioned_tags = []
    parser = Parser(RULE)
    morph = pymorphy2.MorphAnalyzer()
    for match in parser.findall(text):
        try:
            value = match.tokens[0].value
            normalized_value = morph.parse(value)[0].normal_form
            if normalized_value in mentioned_tags:
                continue
            mentioned_tags.append(normalized_value)
        except Exception:
            print('Salary parser error')
    return mentioned_tags
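A usage sketch for getTags; it requires pymorphy2, and both the sample text and the expected tags are illustrative:

tags = getTags('Требуется разработчик со знанием Python и Django', ['python', 'django'])
print(tags)  # expected: ['python', 'django']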