def test_pipeline():
    """Exercise pipeline, caseless_pipeline and morph_pipeline matching."""
    # Two overlapping keys: both 'b c d' and 'a b c d' must be accepted.
    grammar = rule(pipeline(['a b c', 'b c']), 'd')
    parser = Parser(grammar)
    assert parser.match('b c d')
    assert parser.match('a b c d')

    # A pipeline behaves like any rule, so it can be made repeatable.
    grammar = rule(pipeline(['a b']).repeatable(), 'c')
    parser = Parser(grammar)
    assert parser.match('a b a b c')

    # Caseless variant: 'A b' matches the key 'A B' despite the case mix.
    grammar = rule(caseless_pipeline(['A B']), 'c')
    parser = Parser(grammar)
    assert parser.match('A b c')

    # Morph variant: inflected forms ('текстом песни') match lemma keys.
    grammar = morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ])
    parser = Parser(grammar)
    hits = list(parser.findall('текстом песни музыкальной группы'))
    assert len(hits) == 1
    assert [token.value for token in hits[0].tokens] == ['текстом', 'песни']

    hits = list(parser.findall('информационного материала под названием'))
    assert len(hits) == 1
    assert [token.value for token in hits[0].tokens] == ['информационного', 'материала']

    # Keys are tokenized too: '1 B.' splits so '1 b .' still matches.
    grammar = morph_pipeline(['1 B.'])
    parser = Parser(grammar)
    assert parser.match('1 b .')
# Beispiel #2
# 0
def test_activate():
    """Check the BNF form a rule reports after tokenizer activation."""
    from yargy.pipelines import pipeline
    from yargy.predicates import gram
    from yargy.tokenizer import MorphTokenizer

    morph = MorphTokenizer()

    # Activating a named pipeline rule yields the 'A -> pipeline' production.
    raw = pipeline(['a']).named('A')
    activated = raw.activate(morph)
    assert_bnf(activated, 'A -> pipeline')

    # Activating a named predicate rule keeps the predicate in its BNF.
    raw = rule(gram('NOUN')).named('A')
    activated = raw.activate(morph)
    assert_bnf(activated, "A -> gram('NOUN')")
def update_rules(name):
    """Build socdem grammar rules for the given name pipeline keys.

    Returns a (SOCDEM_ELEMS, SOCDEM) pair: the first matches any single
    socdem element, the second matches a full socdem sequence.
    """
    name_rule = pipeline(name).interpretation(Socdem.name)

    # Any one of the socdem elements on its own.
    elems = rule(or_(name_rule, GENDER, date.DATE, AGE, LOCATION))

    # Age and birth date may appear in either order, each optional.
    age_then_date = rule(
        AGE.optional(),
        date.DATE.interpretation(Socdem.date_of_birth).optional(),
    )
    date_then_age = rule(
        date.DATE.interpretation(Socdem.date_of_birth).optional(),
        AGE.optional(),
    )

    socdem = rule(
        name_rule,
        GENDER.optional(),
        or_(age_then_date, date_then_age),
        LOCATION.optional(),
    ).interpretation(Socdem)

    return elems, socdem
# Beispiel #4
# 0
from .helpers import TOKENIZER, ID_TOKENIZER, load_named_entities
from .education import EducationExtractor
from .workplace import WorkplaceExtractor
from .hobby import HobbyExtractor

from yargy.parser import Parser
from yargy.pipelines import pipeline, caseless_pipeline


# Resume section titles used to locate section boundaries in the token
# stream; the caseless variants accept any letter case of their keys.
EXP_TITLE = pipeline(['Опыт работы'])
EDU_TITLE = pipeline(['Образование'])
EXTRA_EDU_TITLE = caseless_pipeline(['Курсы', 'Сертификаты'])
HOBBY_TITLE = caseless_pipeline(['Хобби', 'Увлечения'])


def parse(text):

    named_entities = load_named_entities(text)
    exp_tokens = edu_tokens = hobby_tokens = tokens = list(TOKENIZER(text))
    extra_edu_tokens = []

    parser = Parser(EXP_TITLE, tokenizer=ID_TOKENIZER)
    exp_title = parser.find(tokens)

    parser = Parser(EDU_TITLE, tokenizer=ID_TOKENIZER)
    edu_title = parser.find(tokens)

    parser = Parser(HOBBY_TITLE, tokenizer=ID_TOKENIZER)
    hobby_title = parser.find(tokens)

    if exp_title:
# Beispiel #5
# 0
from yargy import (rule, or_, Parser)

from yargy.predicates import (eq, gram)
from yargy.pipelines import (caseless_pipeline, pipeline)
from yargy.interpretation import (fact, attribute)

from .helpers import ID_TOKENIZER, select_span_tokens, show_matches

# Interpretation fact: a single 'name' attribute that may repeat, so one
# match can accumulate several hobby names.
Hobby = fact('Hobby', [attribute('name').repeatable()])

# Punctuation helpers; HYPHEN accepts hyphen, em dash and en dash.
HYPHEN = rule(pipeline(['-', '—', '–']))
COLON = rule(eq(':'))
COMMA = rule(eq(','))
DOT = rule(eq('.'))

# Section header keys, matched case-insensitively.
TITLES = caseless_pipeline(['Хобби', 'Увлечения'])

# A header followed by a colon or a dash.
TITLE = rule(TITLES, or_(COLON, HYPHEN))

# One hobby: up to three noun/adjective tokens, captured as Hobby.name.
ITEM = rule(or_(gram('NOUN'),
                gram('ADJF')).repeatable(max=3)).interpretation(Hobby.name)

# Any element that can occur inside the hobbies section.
HOBBY_ITEMS = rule(or_(TITLE, ITEM, COMMA, DOT))

# Full section: header, then one or more items each closed by ',' or '.'.
HOBBIES = rule(
    TITLE,
    rule(ITEM, or_(COMMA, DOT)).repeatable(),
).interpretation(Hobby)


class HobbyExtractor:
from yargy.pipelines import (caseless_pipeline, pipeline)
from yargy.interpretation import (fact)

from .helpers import load_lines, load_named_entities, select_span_tokens, ID_TOKENIZER

# Interpretation fact with employment period, organization and occupation.
Workplace = fact('Workplace', ['period', 'org_name', 'occupation'])
"""
Dicts
"""
# Occupation keys are loaded from dicts/occupations.txt next to this module.
FOLDER = os.path.dirname(__file__)
DICTS_FOLDER = os.path.join(FOLDER, 'dicts')
OCCUPATIONS = load_lines(os.path.join(DICTS_FOLDER, 'occupations.txt'))
"""
"""

# HYPHEN accepts hyphen, em dash and en dash; COMMA is a bare predicate.
HYPHEN = rule(pipeline(['-', '—', '–']))
COMMA = eq(',')

MONTHS = {
    'январь': 1,
    'февраль': 2,
    'март': 3,
    'апрель': 4,
    'май': 5,
    'июнь': 6,
    'июль': 7,
    'август': 8,
    'сентябрь': 9,
    'октябрь': 10,
    'ноябрь': 11,
    'декабрь': 12