Beispiel #1
0
def test_name():
    """Check parsing of a first name + surname pair bound by one gnc relation.

    Both tokens are interpreted through ``.inflected()`` attributes and are
    matched against the same ``gnc_relation()`` instance.
    """
    Name = fact('Name', ['first', 'last'])
    agreement = gnc_relation()

    first_name = gram('Name').interpretation(
        Name.first.inflected()
    ).match(agreement)
    surname = gram('Surn').interpretation(
        Name.last.inflected()
    ).match(agreement)
    full_name = rule(first_name, surname).interpretation(Name)

    parser = Parser(full_name)

    # Texts that must parse, paired with the fact each one is expected to yield.
    expected_facts = [
        ('саше иванову', Name(first='саша', last='иванов')),
        ('сашу иванову', Name(first='саша', last='иванова')),
    ]
    for text, expected in expected_facts:
        assert parser.match(text).fact == expected

    # This pair is expected to produce no match at all.
    assert not parser.match('сашу ивановой')
Beispiel #2
0
def test_person():
    """Parse 'position + first name + surname' into a nested Person fact.

    The position comes from a ``morph_pipeline``; the position, the first
    name and the surname are all matched against one ``gnc_relation()``.
    """
    Name = fact('Name', ['first', 'last'])
    Person = fact('Person', ['position', 'name'])

    # Surname / first-name tokens that are not tagged as abbreviations.
    surname = and_(gram('Surn'), not_(gram('Abbr')))
    first_name = and_(gram('Name'), not_(gram('Abbr')))

    position = morph_pipeline(['управляющий директор', 'вице-мэр'])

    agreement = gnc_relation()
    first_part = first_name.interpretation(Name.first).match(agreement)
    last_part = surname.interpretation(Name.last).match(agreement)
    name_rule = rule(first_part, last_part).interpretation(Name)

    position_part = position.interpretation(Person.position).match(agreement)
    person_rule = rule(
        position_part,
        name_rule.interpretation(Person.name)
    ).interpretation(Person)

    parser = Parser(person_rule)

    match = parser.match('управляющий директор Иван Ульянов')
    assert match

    expected = Person(
        position='управляющий директор',
        name=Name(first='Иван', last='Ульянов')
    )
    assert match.fact == expected
Beispiel #3
0
def get_rules():
    """Build the yargy grammar used to recognise a city mention.

    The grammar accepts either of:

    * an optional ``morph_pipeline`` keyword ('город', 'Нижний', 'новгород')
      followed by a token tagged ``Geox`` that is not equal to one of the
      excluded words, or
    * the three tokens 'санкт', '-', 'петербург'.

    Returns:
        The top-level ``or_`` rule wrapping both alternatives.
    """
    # Words tagged Geox that must not be accepted as a city name.
    excluded_words = ('артема', 'фармана', 'оскол', 'мунарева')
    geo = rule(
        and_(
            gram('Geox'),
            not_(or_(*[eq(word) for word in excluded_words]))
        )
    )

    city_keyword = morph_pipeline(['город', 'Нижний', 'новгород'])
    city_piter = rule(eq('санкт'), eq('-'), eq('петербург'))

    complicated_city = or_(rule(city_keyword.optional(), geo), city_piter)

    # Kept as a single-alternative or_ so the returned rule has the same
    # shape as before.
    return or_(complicated_city)
Beispiel #4
0
def get_second_rules():
    """Build the yargy grammar that recognises one or more surnames.

    A surname token is either

    * tagged ``Sgtm`` and ``anim`` while not tagged ``Name`` or ``Patr`` and
      not equal to one of the excluded words, or
    * equal to one of the explicitly listed surnames.

    Returns:
        An ``or_`` rule wrapping the repeatable surname rule.
    """
    # Words that satisfy the grammeme test but must not count as surnames.
    excluded_words = ('по', 'ленина', 'ульянова')
    surname_const = rule(
        and_(
            gram('Sgtm'),
            gram('anim'),
            not_(gram('Name')),
            not_(gram('Patr')),
            *[not_(eq(word)) for word in excluded_words]
        )
    )

    # Surnames accepted by exact token equality. Each word appears once:
    # 'удовенко' used to be listed twice, which only added a redundant
    # alternative.
    listed_surnames = (
        'Иванов',
        'левченко',
        'эйхвальд',
        'зимина',
        'хитарьян',
        'моторин',
        'рукавишников',
        'деткино',
        'буланцев',
        'багров',
        'шерл',
        'белоцерковский',
        'степанов',
        'шляхов',
        'моисеев',
        'пузанков',
        'попиченко',
        'сергеев',
        'удовенко',
        'тютин',
    )
    surname = or_(
        surname_const,
        *[rule(eq(word)) for word in listed_surnames]
    )

    complicated = rule(surname.repeatable())

    # Kept as a single-alternative or_ so the returned rule has the same
    # shape as before.
    return or_(complicated)
def req_preposition(preposition: str = None):
    """Build the constraint for the preposition slot.

    Args:
        preposition: the required preposition, or the string ``"None"``.

    Returns:
        ``y.empty()`` when ``preposition`` is the string ``"None"``;
        otherwise an ``or_`` of "a ``PREP`` token equal to ``preposition``"
        and "any token that is not ``PREP``".
    """
    # NOTE(review): the sentinel is the *string* "None"; the default value
    # None does not equal it and therefore takes the or_ path with
    # yp.eq(None) - confirm this is intended.
    if preposition != "None":
        matching_preposition = y.and_(yp.gram("PREP"), yp.eq(preposition))
        not_a_preposition = y.not_(yp.gram("PREP"))
        return y.or_(matching_preposition, not_a_preposition)
    return y.empty()
Beispiel #6
0
def test_checks():
    """Activating predicates built with 'UNK' on a tokenizer raises ValueError."""
    tokenizer = MorphTokenizer()

    # Factories are used so that each predicate is constructed inside the
    # pytest.raises block, exactly where the exception is expected.
    factories = (
        lambda: gram('UNK'),
        lambda: custom(lambda _: True, types='UNK'),
    )
    for build in factories:
        with pytest.raises(ValueError):
            build().activate(tokenizer)
def test_checks():
    """Activating predicates built with 'UNK' on a Context raises ValueError."""
    context = Context(MorphTokenizer())

    # Factories are used so that each predicate is constructed inside the
    # pytest.raises block, exactly where the exception is expected.
    factories = (
        lambda: gram('UNK'),
        lambda: custom(lambda _: True, types='UNK'),
    )
    for build in factories:
        with pytest.raises(ValueError):
            build().activate(context)
Beispiel #8
0
def test_predicate():
    """Activate a composite predicate on the tokenizer and apply it per token."""
    tokenizer = MorphTokenizer()

    by_normal_form = normalized('московским')
    non_feminine_noun = and_(gram('NOUN'), not_(gram('femn')))
    predicate = or_(by_normal_form, non_feminine_noun).activate(tokenizer)

    # Each text is paired with the expected per-token predicate values.
    cases = [
        ('московский зоопарк', [True, True]),
        ('московская погода', [True, False]),
    ]
    for text, expected in cases:
        observed = [predicate(token) for token in tokenizer(text)]
        assert observed == expected
def req_animacy(animacy: str = "любой"):
    """Build the animacy constraint for an argument.

    Args:
        animacy: one of ``"любой"`` (any), ``"одуш."`` (animate) or
            ``"неодуш."`` (inanimate).

    Returns:
        ``yp.true()`` for ``"любой"``; otherwise an ``or_`` predicate.
        For ``"одуш."`` it accepts tokens that are not tagged ``inan``, plus
        ``anim``, ``NPRO`` and ``ADJF`` tokens. For ``"неодуш."`` it is the
        mirror image: tokens that are not tagged ``anim``, plus ``inan``,
        ``NPRO`` and ``ADJF`` tokens.

    Raises:
        ValueError: if ``animacy`` is none of the three accepted values.
    """
    if animacy == "любой":
        return yp.true()
    if animacy == "одуш.":
        return y.or_(
            y.not_(yp.gram("inan")), yp.gram("anim"), yp.gram("NPRO"), yp.gram("ADJF")
        )
    if animacy == "неодуш.":
        # Mirrors the animate branch. This branch used to list gram("anim")
        # itself, so explicitly animate tokens satisfied the inanimate
        # requirement.
        return y.or_(
            y.not_(yp.gram("anim")), yp.gram("inan"), yp.gram("NPRO"), yp.gram("ADJF")
        )
    raise ValueError("Incorrect Animacy Type")
Beispiel #10
0
def get_first_rules():
    """Build the yargy grammar that recognises one or more first names.

    A first-name token is tagged ``Name`` and ``anim``, is not tagged
    ``Sgtm`` and is not equal to any of the excluded words.

    Returns:
        An ``or_`` rule wrapping the repeatable first-name rule.
    """
    # Words that satisfy the grammeme test but must not count as first names.
    excluded_words = (
        'моторин',
        'юрок',
        'вакула',
        'эйхвальд',
        'иммуно',
        'из',
        'славы',
        'хайбулаев',
        'михална',
        'валиде',
        'шиян',
        'сим',
        'мазитов',
        'хамидов',
    )
    excluded = or_(*[eq(word) for word in excluded_words])

    name_const = rule(
        and_(
            gram('Name'),
            gram('anim'),
            not_(gram('Sgtm')),
            not_(excluded)
        )
    )

    complicated = rule(name_const.repeatable())

    # Kept as a single-alternative or_ so the returned rule has the same
    # shape as before.
    return or_(complicated)
def test_predicate():
    """Activate a composite predicate on a Context and apply it per token."""
    tokenizer = MorphTokenizer()

    by_normal_form = normalized('московским')
    non_feminine_noun = and_(gram('NOUN'), not_(gram('femn')))
    predicate = or_(by_normal_form, non_feminine_noun)

    context = Context(tokenizer)
    predicate = predicate.activate(context)

    # Each text is paired with the expected per-token predicate values.
    cases = [
        ('московский зоопарк', [True, True]),
        ('московская погода', [True, False]),
    ]
    for text, expected in cases:
        observed = [predicate(token) for token in tokenizer(text)]
        assert observed == expected
Beispiel #12
0
def test_constant_attribute():
    """A ``.const(...)`` attribute yields the constant instead of the matched text."""
    count = gram('INT').interpretation(Money.count)
    # The word itself is discarded; Money.base always receives 10**3.
    base = dictionary({'тысяча'}).interpretation(Money.base.const(10**3))
    currency = dictionary({'рубль', 'доллар'}).interpretation(Money.currency)

    money_rule = rule(count, base, currency).interpretation(Money)

    parser = Parser(money_rule)
    first_match = list(parser.match('1 тысяча рублей'))[0]
    assert first_match.fact == Money(count=1, base=1000, currency='рублей')
Beispiel #13
0
def test_person():
    """Parse 'position + first name + surname' with a custom position pipeline.

    The position is recognised through ``PositionPipeline``, passed to the
    parser via ``pipelines=``, and referenced in the grammar as
    ``gram('Position')``.
    """
    Name = fact('Name', ['first', 'last'])
    Person = fact('Person', ['position', 'name'])

    # Surname / first-name tokens that are not tagged as abbreviations.
    surname = and_(gram('Surn'), not_(gram('Abbr')))
    first_name = and_(gram('Name'), not_(gram('Abbr')))

    class PositionPipeline(MorphPipeline):
        grammemes = {'Position'}
        keys = ['управляющий директор', 'вице-мэр']

    position = gram('Position')
    agreement = gnc_relation()

    name_rule = rule(
        first_name.match(agreement).interpretation(Name.first),
        surname.match(agreement).interpretation(Name.last)
    ).interpretation(Name)

    person_rule = rule(
        position.interpretation(Person.position),
        name_rule.interpretation(Person.name)
    ).interpretation(Person)

    parser = Parser(person_rule, pipelines=[PositionPipeline()])

    matches = list(parser.match('управляющий директор Иван Ульянов'))
    assert len(matches) == 1

    expected = Person(
        position='управляющий директор',
        name=Name(first='Иван', last='Ульянов')
    )
    assert matches[0].fact == expected
Beispiel #14
0
def test_activate():
    """Check the BNF of a named pipeline and a named rule after activation."""
    from yargy.pipelines import pipeline
    from yargy.predicates import gram
    from yargy.tokenizer import MorphTokenizer

    tokenizer = MorphTokenizer()

    # A named pipeline, activated with the tokenizer.
    named_pipeline = pipeline(['a']).named('A')
    activated_pipeline = named_pipeline.activate(tokenizer)
    assert_bnf(activated_pipeline, 'A -> pipeline')

    # A named single-predicate rule, activated with the tokenizer.
    named_rule = rule(gram('NOUN')).named('A')
    activated_rule = named_rule.activate(tokenizer)
    assert_bnf(activated_rule, "A -> gram('NOUN')")
def req_argument():
    """Build the part-of-speech constraint for an argument token.

    Returns:
        An ``and_`` of two conditions: the token carries none of the
        grammemes PREP, CONJ, PRCL, INTJ, ADJF, and it carries NOUN or NPRO.
    """
    # Parts of speech an argument is prohibited from being.
    forbidden_tags = ("PREP", "CONJ", "PRCL", "INTJ", "ADJF")
    forbidden = y.or_(*[yp.gram(tag) for tag in forbidden_tags])

    allowed = y.or_(yp.gram("NOUN"), yp.gram("NPRO"))

    return y.and_(y.not_(forbidden), allowed)
Beispiel #16
0
def test_main():
    """Check number + gender agreement between a name phrase and a verb.

    The name phrase is 'surname + first name' with the first name wrapped in
    ``main(...)``; the phrase and the verb are matched against one combined
    relation.
    """
    agreement = and_(number_relation(), gender_relation())

    subject = rule(
        gram('Surn'),
        main(gram('Name'))
    ).match(agreement)
    verb = gram('VERB').match(agreement)

    parser = Parser(rule(subject, verb))

    assert parser.match('иванов иван стал')
    assert not parser.match('иванов иван стали')
    assert parser.match('ивановы иван стал')
Beispiel #17
0
def get_rules():
    """Build the yargy grammar used to recognise a street mention.

    The grammar is the union of

    * "complicated" streets: multi-token patterns built around a street
      keyword (``street_suffixes``), numbers, nouns, adjectives and names;
    * "simple" streets: a street keyword followed by one token, an adjective
      followed by 'шоссе' / 'тракт', or one of the street names listed
      verbatim in ``simple_streets_from_array``.

    Returns:
        The top-level ``or_`` rule combining both groups.
    """
    # NOTE(review): `type` is presumably yargy's token-type predicate imported
    # under that name, not the builtin - confirm against the file's imports.
    INT = type('INT')
    NOUN = gram('NOUN')
    ADJF = gram('ADJF')
    GEO = gram('Geox')
    PREP = gram('PREP')
    CONJ = gram('CONJ')

    name = rule(and_(gram('Name'), not_(PREP), not_(GEO)))
    noun_not_conj = rule(and_(NOUN, not_(CONJ)))

    street_suffixes = morph_pipeline([
        'улица', 'тракт', 'бульвар', 'проспект', 'микрорайон', 'проезд',
        'шоссе', 'парк'
    ])
    special_street_suffixes = morph_pipeline(['шоссе', 'тракт'])

    # Street names accepted verbatim. 'зеленые аллеи' used to be listed
    # twice; the duplicate key added nothing.
    simple_streets_from_array = morph_pipeline([
        'краснопресненская', 'республике', 'маршала захарова', 'доватора',
        'мичурина', 'зеленые аллеи', 'бехтеева', 'октябрьская',
        'новогиреевская', 'югорская', 'артема', 'парковая',
        'алтуфьевское', 'горького', 'Кавказский', 'хамовнический вал',
        'Кусковская', 'марьинский parк'.replace('parк', 'парк'), 'московская',
        'береговая', 'антонова овсиенко', 'школьная', 'юнтоловский',
        'гагарина'
    ])

    # Any token that is not 'дом', 'квартира', a number or a conjunction.
    noun_not_appart = rule(not_(or_(eq('дом'), eq('квартира'), INT, CONJ)))

    complicated_streets = or_(
        rule(street_suffixes, INT, NOUN, NOUN),
        rule(street_suffixes, INT, ADJF, NOUN),
        rule(street_suffixes, noun_not_conj, noun_not_appart, name.optional()),
        rule(name, noun_not_appart),
        rule(ADJF, name),
        rule(street_suffixes, ADJF, noun_not_appart),
        rule(street_suffixes, CONJ, NOUN, NOUN))

    simple_streets_with_suffix = rule(street_suffixes, noun_not_appart)
    special_simple_streets_with_suffix = rule(ADJF, special_street_suffixes)

    simple_streets = or_(special_simple_streets_with_suffix,
                         simple_streets_with_suffix,
                         simple_streets_from_array)

    return or_(complicated_streets, simple_streets)
def req_deverbal(require_deverbal_noun: str = "?"):
    """Build the constraint that selects deverbal nouns and/or verbs.

    Args:
        require_deverbal_noun: ``"1"`` for strictly a deverbal noun, ``"0"``
            for strictly a regular verb, ``"?"`` for either.

    Returns:
        A predicate combining ``NOUN`` tokens found (caselessly) in
        ``deverbal_nouns`` and/or ``VERB`` / ``INFN`` tokens, depending on
        the mode.

    Raises:
        ValueError: if ``require_deverbal_noun`` is not one of the three modes.
    """
    def deverbal_noun():
        # Built lazily so that the "0" mode never touches deverbal_nouns.
        return y.and_(yp.gram("NOUN"), yp.in_caseless(deverbal_nouns))

    if require_deverbal_noun not in ("1", "0", "?"):
        raise ValueError("Incorrect deverbal status")

    if require_deverbal_noun == "1":
        return deverbal_noun()
    if require_deverbal_noun == "0":
        return y.or_(yp.gram("VERB"), yp.gram("INFN"))
    return y.or_(deverbal_noun(), yp.gram("VERB"), yp.gram("INFN"))
Beispiel #19
0
def get_rules():
    """Build the yargy grammar used to recognise a house / unit number.

    One unit is an optional keyword ('дом', 'корпус', 'квартира',
    'строение', 'ст') followed by a number other than '3', then optionally a
    letter or slash ('a', 'а', '/', 'б'), optionally one non-adjective token
    and optionally another number. The grammar accepts one or more units.

    Returns:
        An ``or_`` rule wrapping the repeatable unit rule.
    """
    # NOTE(review): `type` is presumably yargy's token-type predicate imported
    # under that name, not the builtin - confirm against the file's imports.
    INT = type('INT')
    ADJF = gram('ADJF')

    house_keyword = morph_pipeline(['дом', 'корпус', 'квартира', 'строение', 'ст'])

    not_adjective = rule(and_(not_(ADJF)))
    house_letter = morph_pipeline(['a', 'а', '/', 'б'])

    unit_number = or_(
        rule(and_(INT, not_(eq('3'))), house_letter.optional(),
             not_adjective.optional(), INT.optional()))

    unit = or_(rule(house_keyword.optional(), unit_number))

    complicated_house = rule(unit.repeatable())

    # Kept as a single-alternative or_ so the returned rule has the same
    # shape as before.
    return or_(complicated_house)
def req_predicate(word: str = "?", predicate_type: str = "глаг"):
    """Build the constraint for the predicate token.

    Args:
        word: ``"?"`` for no lexical restriction, a single word, or several
            alternatives separated by ``"|"``; words are compared through
            ``yp.normalized``.
        predicate_type: ``"глаг"`` (VERB or INFN), ``"сущ"`` (INFN or NOUN)
            or ``"любой"`` (VERB, INFN or NOUN).

    Returns:
        The part-of-speech predicate, and-ed with the lexical restriction
        when ``word`` is not ``"?"``.

    Raises:
        ValueError: if ``predicate_type`` is not one of the three values.
    """
    # Select the grammemes allowed for the requested predicate type.
    if predicate_type == "глаг":
        tags = ("VERB", "INFN")
    elif predicate_type == "сущ":
        tags = ("INFN", "NOUN")
    elif predicate_type == "любой":
        tags = ("VERB", "INFN", "NOUN")
    else:
        raise ValueError("predicate_type must be глаг or сущ or любой")
    predicate = y.or_(*[yp.gram(tag) for tag in tags])

    if word == "?":
        return predicate

    if "|" in word:
        # Multi-word scope: any of the alternatives may match.
        lexical = y.or_(*[yp.normalized(option) for option in word.split("|")])
    else:
        # Single-word scope.
        lexical = yp.normalized(word)
    return y.and_(lexical, predicate)
Beispiel #21
0
def get_mid_rules():
    """Build the yargy grammar that recognises one or more patronymics.

    A patronymic is any token tagged ``Patr``.

    Returns:
        An ``or_`` rule wrapping the repeatable patronymic rule.
    """
    patronymic = rule(gram('Patr'))
    complicated = rule(patronymic.repeatable())

    # Kept as a single-alternative or_ so the returned rule has the same
    # shape as before.
    return or_(complicated)
Beispiel #22
0
    'боец',
    'атлет',
    'футболист',
    'баскетболист',
    'агроном',
    'президент',
    'сопрезидент',
    'вице-президент',
    'экс-президент',
    'председатель',
    'руководитель',
    'директор',
    'глава',
])

# Predicate for tokens carrying the 'gent' grammeme.
GENT = gram('gent')

# A chain of one to five consecutive GENT tokens, one alternative per length.
WHERE = or_(*(rule(*([GENT] * length)) for length in range(1, 6)))

# A position is either the bare POSITION rule or POSITION followed by a WHERE
# chain; the whole match is interpreted as Person.position.
POSITION = or_(
    POSITION,
    rule(POSITION, WHERE)
).interpretation(Person.position)

# Both name rules are interpreted as Person.name.
NAME = NAME.interpretation(Person.name)
SIMPLE_NAME = SIMPLE_NAME.interpretation(Person.name)
Beispiel #23
0
# coding: utf-8
from __future__ import unicode_literals
from yargy import (rule, fact, not_, and_, or_, attribute,)
from yargy.predicates import (gram, caseless, normalized, is_title, dictionary, custom,)
#from yargy.relations import (gnc_relation, case_relation,)

## 1 - FACT INIT
# Fact type with a single attribute, 'name'.
DateRelative = fact('DateRelative', ['name'])
from natasha.dictionaries.daterelative import DATERELATIVE_DICT
###

### 2 - INIT GRAMS & GRAM RULES (pymorphy2)
# NOTE(review): none of these four predicates is referenced in the visible
# part of this module - confirm whether other modules import them.
ADJF = gram('ADJF')
NOUN = gram('NOUN')
INT = gram('INT')
TITLE = is_title()

###


### 1ST RING RULES
# One or more consecutive matches of DATERELATIVE_DICT.
R1_SIMPLE = rule(
   DATERELATIVE_DICT,
).repeatable()
###


### 2ND RING RULES

###
    
Beispiel #24
0
from yargy import (rule, and_, or_, not_, fact)

from yargy.predicates import (caseless, normalized, eq, length_eq, gram,
                              dictionary, is_single, is_title)

from yargy.relations import gnc_relation

# Fact type for a location; it has a single attribute, 'name'.
Location = fact('Location', ['name'])

# Relation instance shared by both parts of REGION.
gnc = gnc_relation()

# Nouns that close a region name.
_REGION_NOUNS = dictionary({'край', 'район', 'область', 'губерния', 'уезд'})

# An ADJF token followed by a region noun, both matched against `gnc`; the
# result is interpreted as Location.name through .inflected().
REGION = rule(
    gram('ADJF').match(gnc),
    _REGION_NOUNS.match(gnc)
).interpretation(Location.name.inflected())

# Two further, independent relation instances.
gnc1 = gnc_relation()
gnc2 = gnc_relation()

FEDERAL_DISTRICT = rule(
    rule(caseless('северо'), '-').optional(),
    dictionary({
Beispiel #25
0
    gram, dictionary,
    is_single, is_title
)
from yargy.relations import gnc_relation


# Fact type for a location; it has a single attribute, 'name'.
Location = fact('Location', ['name'])


# Relation instance shared by both parts of REGION.
gnc = gnc_relation()

# Nouns that close a region name.
_REGION_NOUNS = dictionary({'край', 'район', 'область', 'губерния', 'уезд'})

# An ADJF token followed by a region noun, both matched against `gnc`; the
# result is interpreted as Location.name through .inflected().
REGION = rule(
    gram('ADJF').match(gnc),
    _REGION_NOUNS.match(gnc)
).interpretation(Location.name.inflected())

# `gnc` is rebound to a fresh relation instance for the rules that follow.
gnc = gnc_relation()

FEDERAL_DISTRICT = rule(
    rule(caseless('северо'), '-').optional(),
    dictionary({
        'центральный',
        if filename.endswith('.docx'):
            documents.append(filename)
    return documents


# Open the first document returned by GetDocuments().
documents = GetDocuments()
path = baseDir + '\\' + documents[0]
print(path)
document = Document(path)

# `or_` and `is_upper` are used by the Name rule below, so they are imported
# here together with the other yargy names.
from yargy import Parser, rule, and_, or_
from yargy.predicates import gram, is_capitalized, is_upper, dictionary

# An ADJF token that is capitalized or upper-case, then any further ADJF
# tokens, then one of the organisation nouns.
# Grammeme reference: http://pymorphy2.readthedocs.io/en/latest/user/grammemes.html
Name = rule(
    and_(
        gram('ADJF'),
        or_(is_capitalized(), is_upper())
    ),
    gram('ADJF').optional().repeatable(),
    dictionary({'институт', 'кафедра', 'университет'}))

# Alternative word order: the organisation noun first, then one or more ADJF
# tokens, then a NOUN token.
Name1 = rule(
    dictionary({'институт', 'кафедра', 'университет'}),
    gram('ADJF'),
    gram('ADJF').optional().repeatable(),
    gram('NOUN'),
)

# Name is the active grammar; swap in Name1 to try the alternative order.
parser = Parser(Name)
# parser = Parser(Name1)
Beispiel #27
0
def extract(text):
    with open(os.path.join(os.getcwd(), 'list_diseases\\diseases'), encoding='UTF-8') as f:
        diseases = f.read().split('\n')


    text = text.replace('\ufeff', '')
    text = text.replace('\n', ' \n ')
    text = text.replace('\\', ' ')


    symptoms = ['Дата рождения', 'Дата осмотра','Дата заболевания', 'Возраст', 'Болен дней','Болен часов','Возраст в днях','Время поступления', 
                'Время заболевания', 'рост','вес', 'IMT', 'давление диаст', 'давление сист', 'температура поступления','мах температура', 'Т-Ан01', 'Т-Ан03', 
                'пол', 'др заболевания в анамнезе', 'кем направлен', 'побочное действие лекартсв','аллергическая реакция', 'озноб', 'слабость', 'вялость','головная боль', 
                'нарушение сна', 'нарушение аппетита', 'ломота','тошнота', 'нарушение сознания', 'Судороги', 'Парестезии', 'эритема', 
                'с четкими границами', 'валик', 'боль','Гиперемия', 'Отек', 'Лимфаденит', 'Лимфангит', 'квартира, дом','контакт с зараженными','речная рыба','провоцирущие факторы',
                'предрасполагающие факторы','кол-во сопут заболеваний','соц категория','сопутствующий диагноз','основной диагноз', 'контакт с зараженными', 'пищевой анамнез',
               'раневые ворота', 'аллергия на лекарства', 'клещ', 'географический анамнез', 'вредные привычки', 'домашние животные', 'условия труда','избыточное питание',
               'ППТ', 'ЛПТ', 'бытовые условия', 'питание', 'интоксикация', 'ЧСС']

    dict_symp = dict.fromkeys(symptoms)


    # In[5]:


    dates_lst = []

    DAY = and_(
        gte(1),
        lte(31)
    )
    MONTH = and_(
        gte(1),
        lte(12)
    )
    YEAR = and_(
        gte(1),
        lte(19)
    )
    YEARFULL = and_(
        gte(1900),
        lte(2020)
    )
    DATE = or_(
        rule(YEAR,'.',MONTH,'.',DAY),
        rule(DAY,'.',MONTH,'.',YEAR),
        rule(DAY,'.',MONTH,'.',YEARFULL),
        rule(DAY,'.',MONTH),
        rule(DAY,'.',MONTH,YEARFULL),
        rule(DAY,'.',MONTH,YEAR))

    parser = Parser(DATE)
    for match in parser.findall(text):
        dates_lst.append(''.join([_.value for _ in match.tokens]))

    if int(dates_lst[1][-2:])-int(dates_lst[0][-2:])<0:
        dict_symp['Дата рождения'] = dates_lst[0]
        dict_symp['Дата осмотра'] = dates_lst[1]
        dict_symp['Дата заболевания'] = dates_lst[2]
    else: 
        birth = None
        dict_symp['Дата осмотра'] = dates_lst[0]
        dict_symp['Дата заболевания'] = dates_lst[1]

    if len(dict_symp['Дата заболевания'])==5:
        dict_symp['Дата заболевания'] += dict_symp['Дата осмотра'][dict_symp['Дата осмотра'].rfind('.'):]

    TYPE = morph_pipeline(['дней'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))

    if len(lst)>0 and dict_symp['Дата заболевания'] is None:
        dict_symp['Дата заболевания'] = text[lst[0][0][0]-20:lst[0][0][0]+20]
        dict_symp['Дата заболевания'] = re.findall(r'\d+', dict_symp['Дата заболевания'])[0]
        dict_symp['Дата заболевания'] = str(int(dict_symp['Дата осмотра'][:2])-int(dict_symp['Дата заболевания']))
        dict_symp['Дата заболевания'] = dict_symp['Дата заболевания']+dict_symp['Дата осмотра'][2:]



    age_lst = []

    AGE = and_(
        gte(0),
        lte(100)
    )
    AGE_RULE = or_(rule("(",AGE,")"),
                  rule(gram('ADJF'),",",AGE))

    parser = Parser(AGE_RULE)
    for match in parser.findall(text):
        s = ''.join([_.value for _ in match.tokens])
        age_lst.append((re.findall(r'\d+', s)[0]))

    if len(age_lst)>0:
        dict_symp['Возраст'] = age_lst[-1]


    try:
        d1 = datetime.strptime(dict_symp['Дата осмотра'], '%d.%m.%Y')
    except:
        d1 = datetime.strptime(dict_symp['Дата осмотра'], '%d.%m.%y')
        d1 = d1.strftime('%d.%m.%Y')
        d1 = datetime.strptime(d1, '%d.%m.%Y')
    try:
        d2 = datetime.strptime(dict_symp['Дата заболевания'], '%d.%m.%Y')
    except:
        d2 = datetime.strptime(dict_symp['Дата заболевания'], '%d.%m.%y')
        d2 = d2.strftime('%d.%m.%Y')
        d2 = datetime.strptime(d2, '%d.%m.%Y')

    dict_symp['Болен дней'] = (d1 - d2).days
    dict_symp['Болен часов'] = (int(dict_symp['Болен дней'])-1)*24



    if dict_symp['Дата рождения'] is None:
        dict_symp['Возраст в днях'] = int(dict_symp['Возраст'])*365
    else:
        d1 = datetime.strptime(dict_symp['Дата осмотра'], '%d.%m.%Y')
        d2 = datetime.strptime(dict_symp['Дата рождения'], '%d.%m.%Y')
        dict_symp['Возраст в днях'] = (d1 - d2).days



    time_lst = []

    HOURS = and_(
        gte(0),
        lte(59)
    )

    MINUTES = and_(
        gte(0),
        lte(59)
    )

    TIME = or_(rule(HOURS,':',MINUTES),
               rule(not(normalized('.')),HOURS, normalized('час')),)

    parser = Parser(TIME)
    for match in parser.findall(text):
        s = (''.join([_.value for _ in match.tokens]))
        s = s.replace('часов', ':00')
        s = s.replace('час', ':00')
        time_lst.append(s)

    if len(time_lst)>0: 
        dict_symp['Время поступления'] = time_lst[0]
        dict_symp['Время заболевания'] = time_lst[0]
    if len(time_lst)>1: 
        dict_symp['Время заболевания'] = time_lst[1]

    t1 = dict_symp['Время поступления']
    t2 = dict_symp['Время заболевания']
    delta = int(t1[:t1.find(':')])+24-int(t2[:t2.find(':')])
    dict_symp['Болен часов'] = dict_symp['Болен часов'] + delta



    HEIGHT = and_(
        gte(50),
        lte(250)
    )
    WEIGHT = and_(
        gte(10),
        lte(150)
    )

    HEIGHT_RULE = or_(rule(normalized('рост'),'-',HEIGHT),
                      rule(normalized('рост'),':',HEIGHT),
                     rule(normalized('рост'),HEIGHT))

    WEIGHT_RULE = or_(rule(normalized('вес'),'-',WEIGHT),
                      rule(normalized('вес'),':',WEIGHT),
                     rule(normalized('вес'),WEIGHT))

    s=''
    parser = Parser(HEIGHT_RULE)
    for match in parser.findall(text):
        s = (''.join([_.value for _ in match.tokens]))
        s = re.findall(r'\d+', s)[0]

    if s != '':
        dict_symp['рост'] = int(s)

    s = ''
    parser = Parser(WEIGHT_RULE)
    for match in parser.findall(text):
        s = (''.join([_.value for _ in match.tokens]))
        s = re.findall(r'\d+', s)[0]

    if s != '':
        dict_symp['вес'] = int(s)

    if (dict_symp['рост'] is not None) and (dict_symp['вес'] is not None):
        dict_symp['IMT'] = round(dict_symp['вес']/(dict_symp['рост']/100*dict_symp['рост']/100),2)




    ADSIST = and_(
        gte(50),
        lte(250)
    )
    ADDIAST = and_(
        gte(20),
        lte(200)
    )

    PRES = or_(rule('АД', ADSIST,'/',ADDIAST),
               rule('АД', ADSIST,ADDIAST),
              rule('АД', ADSIST, ':',ADDIAST),
              rule('АД','-', ADSIST, '/',ADDIAST),
              rule('А/Д', ADSIST, '/',ADDIAST),
              rule('А/Д', ADSIST, ADDIAST),
              rule('А/Д',' ', ADSIST, '/',ADDIAST),
               rule(ADSIST, '/',ADDIAST))

    s = ''
    parser = Parser(PRES)
    for match in parser.findall(text):
        s = (''.join([_.value for _ in match.tokens]))
        s = re.findall(r'\d+', s)

    if len(s)>1:
        dict_symp['давление сист'] = s[0]
        dict_symp['давление диаст'] = s[1]


    PULSE = and_(
        gte(40),
        lte(150)
    )

    PRES = or_(rule('ЧСС','-',PULSE),
               rule('ЧСС',PULSE),
              rule('ЧСС','/',PULSE),
              rule('пульс',PULSE),)

    s = ''
    parser = Parser(PRES)
    for match in parser.findall(text):
        s = (''.join([_.value for _ in match.tokens]))
        s = re.findall(r'\d+', s)

    if len(s)>0:
        dict_symp['ЧСС'] = s[0]


    status = text[text.find('Объективный статус'): text.find('Объективный статус')+text[text.find('Объективный статус')+1:].find(' \n  \n')]

    DEGREES = and_(
        gte(34),
        lte(42)
    )
    SUBDEGREES = and_(
        gte(0),
        lte(9)
    )

    TEMP = or_(rule(DEGREES,',',SUBDEGREES),
               rule(DEGREES,'.',SUBDEGREES),
              rule(DEGREES))

    temp_lst = []
    parser = Parser(TEMP)
    for match in parser.findall(status):
        temp_lst.append(''.join([_.value for _ in match.tokens]))

    if len(temp_lst)>0:
        dict_symp['температура поступления'] = temp_lst[0]

    temp_lst = []
    parser = Parser(TEMP)
    for match in parser.findall(text):
        temp_lst.append(''.join([_.value for _ in match.tokens]))

    if len(temp_lst)>0:
        if dict_symp['температура поступления'] is None:
            dict_symp['температура поступления'] = temp_lst[0]
        dict_symp['мах температура'] = max([float(i.replace(',','.')) for i in temp_lst])



    if dict_symp['мах температура']>38:
        dict_symp['Т-Ан01'] = 1
    else: 
        dict_symp['Т-Ан01'] = 0

    if dict_symp['мах температура']>40:
        dict_symp['Т-Ан03'] = 3
    elif dict_symp['мах температура']>39: 
        dict_symp['Т-Ан03'] = 2
    elif dict_symp['мах температура']>38: 
        dict_symp['Т-Ан03'] = 1
    else:
        dict_symp['Т-Ан03'] = 0



    sex_lst = []
    SEX_RULE = or_(rule(normalized('женский')),
                     rule(normalized('мужской')))

    parser = Parser(SEX_RULE)
    for match in parser.findall(text):
        sex_lst.append(''.join([_.value for _ in match.tokens]))

    dict_symp['пол'] = sex_lst[0]
    dict_symp['пол'] = dict_symp['пол'].lower().replace('женский', '2')
    dict_symp['пол'] = dict_symp['пол'].lower().replace('мужской', '1')


    TYPE = morph_pipeline(diseases[:-1])

    anamnez = text[text.find('Анамнез'): text.find('Анамнез')+text[text.find('Анамнез')+1:].rfind('Анамнез')]
    anamnez = anamnez.replace('туберкулез',' ')
    anamnez = anamnez.replace('туберкулёз',' ')
    family = anamnez[anamnez.find('Семейный'):anamnez.find('Семейный')+60]
    anamnez = anamnez.replace(family,' ')
    anamnez = anamnez.replace('описторхоз',' ')
    dis_lst = []
    parser = Parser(TYPE)
    for match in parser.findall(anamnez):
        dis_lst.append(' '.join([_.value for _ in match.tokens]))

    op_rule = or_(rule(normalized('описторхоз'), not_(normalized('не'))))
    parser = Parser(op_rule)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        dis_lst.append(' описторхоз')

    tub_rule = rule(normalized('туберкулез'), not_(normalized('отрицает')))
    parser = Parser(tub_rule)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        dis_lst.append(' туберкулез')

    dict_symp['др заболевания в анамнезе'] = ', '.join(dis_lst)
    dict_symp['др заболевания в анамнезе'] = morph.parse(dict_symp['др заболевания в анамнезе'])[0].normal_form


    TYPE = morph_pipeline(['Поликлиника',"скорая помощь", "ск/помощь", 'СМП', "обратился"])

    napr = None
    napr_lst = []
    parser = Parser(TYPE)
    for match in parser.findall(text):
        napr_lst.append(' '.join([_.value for _ in match.tokens]))
    if len(napr_lst)>0:
        napr = napr_lst[-1]
        napr = morph.parse(napr)[0].normal_form
    if napr == "обратиться":
        dict_symp['кем направлен'] = 3
    elif napr == "скорая помощь" or napr == "ск/помощь" or napr == 'смп'or napr == "ск / помощь" or napr == "скорой помощь" or napr == "скорую помощь":
        dict_symp['кем направлен'] = 1
    elif napr == "поликлиника":
        dict_symp['кем направлен'] = 2


    ALLERG_RULE = or_(rule(normalized('Аллергическая'),normalized('реакция'), normalized('на'), gram('NOUN').optional().repeatable(), gram('ADJF').optional().repeatable()),
                     rule(normalized('Аллергическая'),normalized('реакция'), normalized('на'), gram('NOUN').optional().repeatable(), gram('ADJF').optional().repeatable(), 
                          '"', gram('ADJF').optional().repeatable(), gram('NOUN').optional().repeatable(), '"'),
                     rule(normalized('Аллергическая'),normalized('реакция'), normalized('на'), gram('NOUN').optional().repeatable(), gram('ADJF').optional().repeatable(),',',
                         gram('NOUN').optional().repeatable(), gram('ADJF').optional().repeatable(),',',gram('NOUN').optional().repeatable(), gram('ADJF').optional().repeatable()),
                     rule(normalized('Аллергическая'),normalized('реакция'), normalized('на'), gram('ADJF').optional(),gram('NOUN').optional().repeatable()))

    parser = Parser(ALLERG_RULE)
    for match in parser.findall(text):
        item = (' '.join([_.value for _ in match.tokens]))
        dict_symp['аллергическая реакция'] = item[item.find('на')+3:]
    if dict_symp['аллергическая реакция'] is not None:
        dict_symp['побочное действие лекартсв'] = 1
        dict_symp['аллергия на лекарства'] = 1


    symptoms = [['озноб', 'познабливание'], 'слабость', 'вялость','головная боль', 'нарушение сна', 'нарушение аппетита', 'ломота',
                'тошнота', 'нарушение сознания','Судороги', 'Парестезии', 'эритема', ['с четкими границами', 'границами четкими' , 
                'четкими неровными краями','с четкими краями', 'краями четкими' , 'четкими неровными краями'], 
                'валик', 'боль',['Гиперемия', 'гиперемирована'], 'Отек', 'Лимфаденит', 'Лимфангит']

    for i in symptoms:
        lst = []
        if isinstance(i, str):
            TYPE = morph_pipeline([i])
        else:
            TYPE = morph_pipeline(i)

        parser = Parser(TYPE)
        for match in parser.findall(text):
            lst.append(' '.join([_.value for _ in match.tokens]))
        if len(lst)>0:
            if isinstance(i, str):
                dict_symp[i]=1
            else:
                dict_symp[i[0]]=1
        else:
            if isinstance(i, str):
                dict_symp[i]=0
            else:
                dict_symp[i[0]]=0


    # In[20]:


    TYPE = morph_pipeline(['географический', 'выезжал'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        text_fish = text[match.span[1]-40:match.span[1]+40]
        geo_rule = rule(not_(normalized('не')),normalized('выезжал'))
        parser = Parser(geo_rule)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['географический анамнез'] = 1
        else:
            dict_symp['географический анамнез'] = 0



    TYPE = morph_pipeline(['бытовые'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        text_fish = text[match.span[1]-30:match.span[1]+30]
        cond_rule = rule(not_(normalized('не')),normalized('удовлетворительные'))
        parser = Parser(cond_rule)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['бытовые условия'] = 1
        else:
            dict_symp['бытовые условия'] = 0



    TYPE = morph_pipeline(['условия труда'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        text_fish = text[match.span[1]-20:match.span[1]+20]
        cond_rule = rule(not_(normalized('не')),normalized('удовлетворительные'))
        parser = Parser(cond_rule)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['условия труда'] = 1
        else:
            dict_symp['условия труда'] = 0


    TYPE = morph_pipeline(['питание'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        text_fish = text[match.span[1]-20:match.span[1]+20]
        food_rule = or_(rule(not_(normalized('не')),normalized('удовлетворительное')),
                       rule(not_(normalized('не')),normalized('полноценное')))
        parser = Parser(food_rule)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['питание'] = 1
        else:
            dict_symp['питание'] = 0

        food_rule = rule(not_(normalized('не')),normalized('избыточное'))
        parser = Parser(food_rule)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['избыточное питание'] = 1
        else:
            dict_symp['избыточное питание'] = 0


    TYPE = morph_pipeline(['рыба'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        text_fish = text[match.span[1]-40:match.span[1]+40]
        TYPE = morph_pipeline(['да', 'постоянно'])
        parser = Parser(TYPE)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['речная рыба'] = 1
        fish_rule = rule(not_(normalized('не')),normalized('употребляет'))
        parser = Parser(fish_rule)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['речная рыба'] = 1
        else:
            dict_symp['речная рыба'] = 0


    TYPE = morph_pipeline(['контакт'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        text_fish = text[match.span[1]-40:match.span[1]+40]
        TYPE = morph_pipeline(['да'])
        parser = Parser(TYPE)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['контакт с зараженными'] = 1
        else:
            dict_symp['контакт с зараженными'] = 0



    lst = []
    TYPE = morph_pipeline(['рана', "раневые ворота", "входные ворота"])

    parser = Parser(TYPE)
    for match in parser.findall(text):
        lst.append(' '.join([_.value for _ in match.tokens]))
    if len(lst)>0:
        dict_symp["раневые ворота"] = 1
    else:
        dict_symp["раневые ворота"] = 0



    lst = []
    TYPE = morph_pipeline(['интоксикация'])

    parser = Parser(TYPE)
    for match in parser.findall(text):
        lst.append(' '.join([_.value for _ in match.tokens]))
    if len(lst)>0:
        dict_symp["интоксикация"] = 1
    else:
        dict_symp["интоксикация"] = 0



    lst = []
    TYPE = morph_pipeline(['клещ', "присасывание"])

    parser = Parser(TYPE)
    for match in parser.findall(text):
        lst.append(' '.join([_.value for _ in match.tokens]))
    if len(lst)>0:
        dict_symp["клещ"] = 1
    else:
        dict_symp["клещ"] = 0


    TYPE = morph_pipeline(['сырой воды'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        text_fish = text[match.span[1]-80:match.span[1]+80]
        TYPE = morph_pipeline(['не было', 'отрицает', 'нет'])
        parser = Parser(TYPE)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['пищевой анамнез'] = 0
        else:
            dict_symp['пищевой анамнез'] = 1



    TYPE = morph_pipeline(['вредные привычки', 'алкоголь'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        text_fish = text[match.span[1]-80:match.span[1]+80]
        TYPE = morph_pipeline(['не было', 'отрицает', 'нет', 'не употребляет'])
        parser = Parser(TYPE)
        lst = []
        for match in parser.findall(text_fish):
            lst.append((match.span, [_.value for _ in match.tokens]))
        if len(lst)>0:
            dict_symp['вредные привычки'] = 0
        else:
            dict_symp['вредные привычки'] = 1

    smoke_rule = or_(rule(not_(normalized('не')),normalized('курит')),
                    rule(not_(normalized('не')),normalized('употребляет')))
    parser = Parser(smoke_rule)
    lst = []
    for match in parser.findall(text_fish):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        dict_symp['вредные привычки'] = 1


    home = None
    home_types = [['бездомный'],
                   ['дом благоустроенный'],
                   ['дом не благоустроенный','дом неблагоустроенный'],
                   ['квартира не благоустроенная', 'квартира неблагоустроенная'],
                   ['квартира благоустроенная'],]

    for i in range(len(home_types)):
        home_lst = []
        TYPE = morph_pipeline(home_types[i])
        parser = Parser(TYPE)
        for match in parser.findall(text):
            home_lst.append(' '.join([_.value for _ in match.tokens]))
        if len(home_lst)>0:
            home = i

    dict_symp['квартира, дом'] = home


    pets = []
    pet_types = [['кошка'],
                   ['собака'],
                   ['корова','коза']]

    for i in range(len(pet_types)):
        pet_lst = []
        TYPE = morph_pipeline(pet_types[i])
        parser = Parser(TYPE)
        for match in parser.findall(text):
            pet_lst.append(' '.join([_.value for _ in match.tokens]))
        if len(pet_lst)>0:
            pets.append(i+1)

    if len(pets)>1:
        pets = 4
    elif len(pets)>0:
        pets = pets[0]
    else:
        pets = 0
    dict_symp['домашние животные'] = pets


    factors = []
    factor_types = [['ссадины',"царапины", "раны", "расчесы", "уколы", "потертости", "трещины", 'вскрытие'],
                   ['ушибы'],
                   ['переохлаждение','перегревание','смена температуры'],
                   ['инсоляция'],
                   ['стресс'],
                   ['переутомление']]

    for i in range(len(factor_types)):
        factor_lst = []
        TYPE = morph_pipeline(factor_types[i])
        parser = Parser(TYPE)
        for match in parser.findall(text):
            factor_lst.append(' '.join([_.value for _ in match.tokens]))
        if len(factor_lst)>0:
            factors.append(i+1)

    dict_symp['провоцирущие факторы'] = factors


    factors = []
    factor_types = [['микоз',"диабет", "ожирение", "варикоз", "недостаточность", "лимфостаз", "экзема"],
                   ['тонзилит',"отит", "синусит", "кариес", "пародонтоз", "остеомиелит", "тромбофлебит", "трофические язвы"],
                   ['резиновая обувь','загрязнения кожных'],
                   ['соматические заболевания']]

    for i in range(len(factor_types)):
        factor_lst = []
        TYPE = morph_pipeline(factor_types[i])
        parser = Parser(TYPE)
        for match in parser.findall(text):
            factor_lst.append(' '.join([_.value for _ in match.tokens]))
        if len(factor_lst)>0:
            factors.append(i+1)

    dict_symp['предрасполагающие факторы'] = factors


    lst = []
    TYPE = morph_pipeline(['работает'])
    parser = Parser(TYPE)
    for match in parser.findall(text):
        lst.append(' '.join([_.value for _ in match.tokens]))
    if len(lst)>0:
        dict_symp['соц категория'] = 0

    soc_rule = rule(not_(normalized('не')),normalized('работает'))
    parser = Parser(soc_rule)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        dict_symp['соц категория'] = 1


    DIAGNOZ_RULE = or_(rule(normalized('сопутствующий'), not_(or_(gram('NOUN')))),
                      rule(normalized('сопутствующий'),normalized('диагноз')),
                      rule(normalized('диагноз'),normalized('сопутствующий')),)

    rules = ['сопутствующий', 'сопутствующий диагноз', 'диагноз сопутствующий']
    TYPE = morph_pipeline(rules)
    parser = Parser(DIAGNOZ_RULE)
    lst = []
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        dict_symp['сопутствующий диагноз'] = text[match.span[1]+2:match.span[1]+text[match.span[1]:].find(' \n  \n')]
        dict_symp['кол-во сопут заболеваний'] = dict_symp['сопутствующий диагноз'].count('\n')
        if dict_symp['кол-во сопут заболеваний']==0: dict_symp['кол-во сопут заболеваний']=1


    DIAGNOZ_RULE = or_(rule(normalized('диагноз'),normalized('при'),normalized('поступлении')),
                       rule(normalized('клинический'),normalized('диагноз')),
                       rule(normalized('диагноз'),normalized('клинический')),
                      rule(normalized('основной'),normalized('диагноз')),
                      rule(normalized('диагноз'),normalized('основной')),
                       rule(normalized('Ds')),
                      rule(not_(or_(gram('ADJF'),gram('NOUN'))),normalized('диагноз'),not_(or_(gram('ADJF'),gram('PREP')))))

    lst = []
    parser = Parser(DIAGNOZ_RULE)
    for match in parser.findall(text):
        lst.append((match.span, [_.value for _ in match.tokens]))
    last = match.span[1]+text[match.span[1]:].find(' \n  \n')
    if last == match.span[1]-1:
        last = len(text)-1
    dict_symp['основной диагноз'] = text[match.span[1]+1:last]


    # In[38]:


    TYPE = morph_pipeline(['левая', 'слева'])
    parser = Parser(TYPE)
    lst = []
    for match in parser.findall(dict_symp['основной диагноз']):
        lst.append((match.span, [_.value for _ in match.tokens]))

    TYPE = morph_pipeline(['правая', 'справа'])
    parser = Parser(TYPE)
    for match in parser.findall(dict_symp['основной диагноз']):
        lst.append((match.span, [_.value for _ in match.tokens]))

    part = dict_symp['основной диагноз']
    if len(lst) == 0:
        parser = Parser(DIAGNOZ_RULE)
        for match in parser.findall(text):
            lst.append((match.span, [_.value for _ in match.tokens]))
        match = lst[0][0][1]
        last = match+text[match:].find(' \n  \n')
        if last == match-1:
            last = len(text)-1
        dict_symp['основной диагноз'] = text[match+1:last]
        part = text[text.find('Диагноз'):]


    TYPE = morph_pipeline(['левая', 'слева'])
    parser = Parser(TYPE)
    left_rozha = []
    lst = []
    rozha_types = [['волосистая часть головы', 'волостистой части головы'], ['лицо', "ушная раковина"],
                   ['нос','губы'],['верняя часть туловища', 'верхняя конечность'],['нижняя часть туловища'],
                   ['пах', 'половые органы'],['верняя часть спины'],['нижняя часть спины'],
                   ['плечо'],['предплечье'],['кисть'],['бедро'],['голень'],['стопа'],['голеностоп']]

    for match in parser.findall(part):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        for i in range(len(rozha_types)):
            rozha_lst = []
            TYPE = morph_pipeline(rozha_types[i])
            parser = Parser(TYPE)
            for match in parser.findall(part):
                rozha_lst.append(' '.join([_.value for _ in match.tokens]))
            if len(rozha_lst)>0:
                left_rozha.append(i+1)
    dict_symp['ЛПТ'] = left_rozha

    TYPE = morph_pipeline(['правая', 'справа'])
    parser = Parser(TYPE)
    right_rozha = []
    lst = []
    for match in parser.findall(part):
        lst.append((match.span, [_.value for _ in match.tokens]))
    if len(lst)>0:
        for i in range(len(rozha_types)):
            rozha_lst = []
            TYPE = morph_pipeline(rozha_types[i])
            parser = Parser(TYPE)
            for match in parser.findall(part):
                rozha_lst.append(' '.join([_.value for _ in match.tokens]))
            if len(rozha_lst)>0:
                right_rozha.append(i+1)
    dict_symp['ППТ'] = right_rozha

    return dict_symp
Beispiel #28
0
        NUMBERS.repeatable().optional().interpretation(COD.n5),

        rule(normalized('подраздел')).repeatable().optional().interpretation(COD.subsection),

        NUMBERS.repeatable().optional().interpretation(COD.n6),

        rule(normalized('раздел')).repeatable().optional().interpretation(COD.section),

        NUMBERS.repeatable().optional().interpretation(COD.n7),

        rule(normalized('глава')).repeatable().optional().interpretation(COD.chapter),

        NUMBERS.repeatable().optional().interpretation(COD.n8),

        gram('ADJF').repeatable().interpretation(COD.type),

        morph_pipeline({
            'гк',
            'нк',
            'тк',
            'ук',
            'гпк',
            'упк',
            'апк',
            'жк',
            'ск',
            'уик',
            'кодекс'
        }).interpretation(COD.codex.const('Кодекс'))
     ).interpretation(COD)
def extract(text):

    with open(os.path.join(os.getcwd(), 'list_diseases\\diseases'), encoding='UTF-8') as f:
        diseases = f.read().split('\n')

    text = text.replace('\ufeff', '')
    text = text.replace('\n', ' \n ')
    text = text.replace('\\', ' ')

    symptoms = ['Дата рождения', 'Дата осмотра','Дата заболевания', 'Возраст', 'Болен дней','Болен часов','Возраст в днях','Время поступления', 
                'Время заболевания', 'рост','вес', 'IMT', 'давление диаст', 'давление сист', 'температура поступления','мах температура', 'Т-Ан01', 'Т-Ан03', 
                'пол', 'др заболевания в анамнезе', 'кем направлен', 'побочное действие лекартсв','аллергическая реакция', 'озноб', 'слабость', 'вялость','головная боль', 
                'нарушение сна', 'нарушение аппетита', 'ломота','тошнота', 'нарушение сознания', 'Судороги', 'Парестезии', 'эритема', 
                'с четкими границами', 'валик', 'боль','Гиперемия', 'Отек', 'Лимфаденит', 'Лимфангит', 'квартира, дом','контакт с зараженными','речная рыба','провоцирущие факторы',
                'предрасполагающие факторы','кол-во сопут заболеваний','соц категория','сопутствующий диагноз','основной диагноз', 'контакт с зараженными', 'пищевой анамнез',
                'раневые ворота', 'аллергия на лекарства', 'клещ', 'географический анамнез', 'вредные привычки', 'домашние животные', 'условия труда','избыточное питание',
                'ППТ', 'ЛПТ', 'бытовые условия', 'питание', 'интоксикация', 'ЧСС', 'болезненность лимфоузлов', 'увеличенность лимфоузлов','размер лимфоузлов', 'острое начало']

    dict_symp = dict.fromkeys(symptoms)
    dict_index = dict.fromkeys(symptoms)

    dates_lst = []
    dates_spans = []

    # Rule for detecting dates
    DAY = and_(gte(1),lte(31))
    MONTH = and_(gte(1),lte(12))
    YEAR = and_(gte(1),lte(19))
    YEARFULL = and_(gte(1900),lte(2020))
    DATE = or_(
        rule(YEAR,'.',MONTH,'.',DAY),
        rule(DAY,'.',MONTH,'.',YEAR),
        rule(DAY,'.',MONTH,'.',YEARFULL),
        rule(DAY,'.',MONTH),
        rule(DAY,'.',MONTH,YEARFULL),
        rule(DAY,'.',MONTH,YEAR))

    parser = Parser(DATE)
    for match in parser.findall(text):
        dates_lst.append(''.join([_.value for _ in match.tokens]))
        dates_spans.append(match.span)
    
    # Sometimes there is no information about the date of birth, so we compare the last two
    # digits of the first two dates to determine whether the first one is a date of birth
    if int(dates_lst[1][-2:])-int(dates_lst[0][-2:])<0:
        # According to the medical cards, the dates appear in this order
        dict_symp['Дата рождения'] = dates_lst[0]
        dict_symp['Дата осмотра'] = dates_lst[1]
        dict_symp['Дата заболевания'] = dates_lst[2]
        dict_index['Дата рождения'] = dates_spans[0]
        dict_index['Дата осмотра'] = dates_spans[1]
        dict_index['Дата заболевания'] = dates_spans[2]
    else: 
        birth = None
        dict_symp['Дата осмотра'] = dates_lst[0]
        dict_symp['Дата заболевания'] = dates_lst[1]
        dict_index['Дата осмотра'] = dates_spans[0]
        dict_index['Дата заболевания'] = dates_spans[1]
    
    # If the date was written without a year, take the year from the examination date
    if len(dict_symp['Дата заболевания'])==5:
        dict_symp['Дата заболевания'] += dict_symp['Дата осмотра'][dict_symp['Дата осмотра'].rfind('.'):]
    
    # Rule for detecting dates given as a duration, e.g. "болен 5 дней"
    DAY_RULE = morph_pipeline(['дней'])
    parser = Parser(DAY_RULE)
    day_lst = []
    for match in parser.findall(text):
        day_lst.append((match.span, [_.value for _ in match.tokens]))

    if day_lst and dict_symp['Дата заболевания'] is None:
        dict_symp['Дата заболевания'] = text[day_lst[0][0][0]-20:day_lst[0][0][0]+20]
        dict_symp['Дата заболевания'] = re.findall(r'\d+', dict_symp['Дата заболевания'])[0]
        dict_symp['Дата заболевания'] = str(int(dict_symp['Дата осмотра'][:2])-int(dict_symp['Дата заболевания']))
        dict_symp['Дата заболевания'] = dict_symp['Дата заболевания']+dict_symp['Дата осмотра'][2:]
        dict_index['Дата заболевания'] = day_lst[0][0]

    # Rule for detecting Age
    age_lst = []
    age_spans = []

    AGE = and_(gte(0),lte(100))
    AGE_RULE = or_(rule("(",AGE,")"),
                  rule(gram('ADJF'),",",AGE))

    parser = Parser(AGE_RULE)
    for match in parser.findall(text):
        s = ''.join([_.value for _ in match.tokens])
        age_lst.append((re.findall(r'\d+', s)[0]))
        age_spans.append(match.span)

    if age_lst:
        dict_symp['Возраст'] = int(age_lst[-1])
        dict_index['Возраст'] = age_spans[-1]
    
    # Transform dates to datetime format to make calculations
    try:
        d1 = datetime.strptime(dict_symp['Дата осмотра'], '%d.%m.%Y')
    except:
        d1 = datetime.strptime(dict_symp['Дата осмотра'], '%d.%m.%y')
        d1 = d1.strftime('%d.%m.%Y')
        d1 = datetime.strptime(d1, '%d.%m.%Y')
    try:
        d2 = datetime.strptime(dict_symp['Дата заболевания'], '%d.%m.%Y')
    except:
        d2 = datetime.strptime(dict_symp['Дата заболевания'], '%d.%m.%y')
        d2 = d2.strftime('%d.%m.%Y')
        d2 = datetime.strptime(d2, '%d.%m.%Y')

    dict_symp['Болен дней'] = (d1 - d2).days
    dict_symp['Болен часов'] = (int(dict_symp['Болен дней'])-1)*24

    if dict_symp['Дата рождения'] is None:
        dict_symp['Возраст в днях'] = int(dict_symp['Возраст'])*365
    else:
        d1 = datetime.strptime(dict_symp['Дата осмотра'], '%d.%m.%Y')
        d2 = datetime.strptime(dict_symp['Дата рождения'], '%d.%m.%Y')
        dict_symp['Возраст в днях'] = (d1 - d2).days

    # Rule for detecting times
    time_lst = []
    time_spans = []

    HOURS = and_(gte(0),lte(24))

    MINUTES = and_(gte(0),lte(59))

    TIME = or_(rule(HOURS,':',MINUTES),
               rule(HOURS, normalized('час')),)

    parser = Parser(TIME)
    for match in parser.findall(text):
        s = (''.join([_.value for _ in match.tokens]))
        time_spans.append(match.span)
        s = s.replace('часов', ':00')
        s = s.replace('час', ':00')
        time_lst.append(s)
    
    # If only one time was found, 'Время поступления' = 'Время заболевания'
    if time_lst: 
        dict_symp['Время поступления'] = time_lst[0]
        dict_symp['Время заболевания'] = time_lst[0]
        dict_index['Время поступления'] = time_spans[0]
        dict_index['Время заболевания'] = time_spans[0]
    if len(time_lst)>1: 
        dict_symp['Время заболевания'] = time_lst[1]
        dict_index['Время заболевания'] = time_spans[1]

    t1 = dict_symp['Время поступления']
    t2 = dict_symp['Время заболевания']
    delta = int(t1[:t1.find(':')])+24-int(t2[:t2.find(':')])
    dict_symp['Болен часов'] = dict_symp['Болен часов'] + delta

    # Rules for detecting Weight, Height and IMT
    HEIGHT = and_(gte(50),lte(250))
    WEIGHT = and_(gte(10),lte(150))

    HEIGHT_RULE = or_(rule(normalized('рост'),'-',HEIGHT),
                      rule(normalized('рост'),'–',HEIGHT),
                      rule(normalized('рост'),':',HEIGHT),
                      rule(normalized('рост'),HEIGHT))

    WEIGHT_RULE = or_(rule(normalized('вес'),'-',WEIGHT),
                      rule(normalized('вес'),'–',WEIGHT),
                      rule(normalized('вес'),':',WEIGHT),
                      rule(normalized('вес'),WEIGHT))

    height = None
    parser = Parser(HEIGHT_RULE)
    for match in parser.findall(text):
        height = (''.join([_.value for _ in match.tokens]))
        height_spans = match.span
        height = re.findall(r'\d+', height)[0]

    if height:
        dict_symp['рост'] = int(height)
        dict_index['рост'] = height_spans

    weight = None
    parser = Parser(WEIGHT_RULE)
    for match in parser.findall(text):
        weight = (''.join([_.value for _ in match.tokens]))
        weight = re.findall(r'\d+', weight)[0]
        weight_spans = match.span

    if weight:
        dict_symp['вес'] = int(weight)
        dict_index['вес'] = weight_spans

    if (dict_symp['рост'] is not None) and (dict_symp['вес'] is not None):
        dict_symp['IMT'] = round(dict_symp['вес']/(dict_symp['рост']/100*dict_symp['рост']/100),2)

    # Rules for detecting pressure
    ADSIST = and_(gte(50),lte(250))
    ADDIAST = and_(gte(20),lte(200))

    PRES_RULE = or_(rule('АД', ADSIST,'/',ADDIAST),
                    rule('АД', ADSIST,ADDIAST),
                    rule('АД', ADSIST, ':',ADDIAST),
                    rule('АД','-', ADSIST, '/',ADDIAST),
                    rule('А/Д', ADSIST, '/',ADDIAST),
                    rule('А/Д', ADSIST, ADDIAST),
                    rule('А/Д',' ', ADSIST, '/',ADDIAST),
                    rule(ADSIST, '/',ADDIAST))

    pres = None
    parser = Parser(PRES_RULE)
    for match in parser.findall(text):
        pres = (''.join([_.value for _ in match.tokens]))
        pres = re.findall(r'\d+', pres)
        pres_spans = match.span

    if pres:
        dict_symp['давление сист'] = int(pres[0])
        dict_symp['давление диаст'] = int(pres[1])
        dict_index['давление сист'] = pres_spans
        dict_index['давление диаст'] = pres_spans

    # Rule for detecting Pulse
    PULSE = and_(gte(40),lte(150))

    PULSE_RULE = or_(rule('ЧСС','-',PULSE),
                     rule('ЧСС',PULSE),
                     rule('ЧСС','-',PULSE),
                     rule('ЧСС','/',PULSE),
                     rule('пульс',PULSE),)

    pulse = None
    parser = Parser(PULSE_RULE)
    for match in parser.findall(text):
        pulse = (''.join([_.value for _ in match.tokens]))
        pulse = re.findall(r'\d+', pulse)
        pulse_spans = match.span

    if pulse:
        dict_symp['ЧСС'] = int(pulse[0])
        dict_index['ЧСС'] = pulse_spans

    #Rules for detecting temperatures
    DEGREES = and_(gte(34),lte(42))
    SUBDEGREES = and_(gte(0),lte(9))

    TEMP_RULE = or_(rule(DEGREES,',',SUBDEGREES),
                    rule(DEGREES,'.',SUBDEGREES),
                    rule(DEGREES))
    
    # Find 'Объективный статус', because this part contains information about 'температура поступления'
    status = text[text.find('Объективный статус'): 
                  text.find('Объективный статус')+text[text.find('Объективный статус')+1:].find(' \n  \n')]
    temp_lst = []
    temp_spans = []
    parser = Parser(TEMP_RULE)
    for match in parser.findall(status):
        temp_lst.append(''.join([_.value for _ in match.tokens]))
        temp_spans.append(match.span)

    if temp_lst:
        dict_symp['температура поступления'] = temp_lst[0]
        dict_index['температура поступления'] = temp_spans[0]

    # Find temperatures in the text from 'Жалобы' onwards
    temp_text = text[text.find('Жалобы'):]
    temp_lst = []
    temp_spans = []
    parser = Parser(TEMP_RULE)
    for match in parser.findall(temp_text):
        temp_lst.append(''.join([_.value for _ in match.tokens]))
        temp_spans.append(match.span)

    if temp_lst:
        if dict_symp['температура поступления'] is None:
            dict_symp['температура поступления'] = temp_lst[0]
            dict_index['температура поступления'] = temp_spans[0]
        dict_symp['мах температура'] = max([float(i.replace(',','.')) for i in temp_lst])

    if dict_symp['мах температура']>=38:
        dict_symp['Т-Ан01'] = 1
    else: 
        dict_symp['Т-Ан01'] = 0

    if dict_symp['мах температура']>=40:
        dict_symp['Т-Ан03'] = 3
    elif dict_symp['мах температура']>=39: 
        dict_symp['Т-Ан03'] = 2
    elif dict_symp['мах температура']>=38: 
        dict_symp['Т-Ан03'] = 1
    else:
        dict_symp['Т-Ан03'] = 0

    # Rule for detecting Sex
    sex_lst = []
    sex_spans = []
    SEX_RULE = or_(rule(normalized('женский')),
                     rule(normalized('мужской')))

    parser = Parser(SEX_RULE)
    for match in parser.findall(text):
        sex_lst.append(''.join([_.value for _ in match.tokens]))
        sex_spans.append(match.span)

    if sex_lst:
        dict_symp['пол'] = sex_lst[0]
        dict_index['пол'] = sex_spans[0]
        dict_symp['пол'] = dict_symp['пол'].lower().replace('женский', '2')
        dict_symp['пол'] = dict_symp['пол'].lower().replace('мужской', '1')
        dict_symp['пол'] = int(dict_symp['пол'])

    # Rule for detecting DISEASES
    DISEASES_RULE = morph_pipeline(diseases[:-1])

    # anamnez contains information about the patient's diseases, but its family ('Семейный') part
    # presumably describes diseases of relatives rather than the patient, so we remove that part
    anamnez = text[text.find('Анамнез'): text.find('Анамнез')+text[text.find('Анамнез')+1:].rfind('Анамнез')]
    family = anamnez[anamnez.find('Семейный'):anamnez.find('Семейный')+60]
    if family:
        anamnez = anamnez.replace(family,' ')
    anamnez = anamnez[:anamnez.rfind('Диагноз')]
    dis_lst = []
    dis_spans = []
    parser = Parser(DISEASES_RULE)
    for match in parser.findall(anamnez):
        dis_lst.append(' '.join([_.value for _ in match.tokens]))
        dis_spans.append(match.span)

    # Special rule for описторхоз
    OP_RULE = or_(rule(normalized('описторхоз'), not_(normalized('не'))))
    parser = Parser(OP_RULE)
    op_lst = []
    for match in parser.findall(anamnez):#text
        op_lst.append((match.span, [_.value for _ in match.tokens]))
    if op_lst:
        dis_lst.append(' описторхоз')
        dis_spans.append(match.span)

    # Special rule for туберкулез
    TUB_RULE = rule(normalized('туберкулез'), not_(normalized('отрицает')))
    parser = Parser(TUB_RULE)
    tub_lst = []
    for match in parser.findall(anamnez):#text
        tub_lst.append((match.span, [_.value for _ in match.tokens]))
    if tub_lst:
        dis_lst.append(' туберкулез')
        dis_spans.append(match.span)

    # Special rule for ВИЧ
    VICH_RULE = morph_pipeline(['ВИЧ'])
    parser = Parser(VICH_RULE)
    vich_lst = []
    for match in parser.findall(anamnez):#text
        vich_lst.append((match.span, [_.value for _ in match.tokens]))
    if vich_lst:
        text_vich = text[match.span[1]-30:match.span[1]+30]
        TYPE = morph_pipeline(['отрицает'])
        parser = Parser(TYPE)
        vich_lst = []
        for match in parser.findall(text_vich):
            vich_lst.append((match.span, [_.value for _ in match.tokens]))
        if not vich_lst:
            dis_lst.append(' ВИЧ')
            dis_spans.append(match.span)
    
    if dis_lst:
        dis_lst = list(set(dis_lst))
        dict_symp['др заболевания в анамнезе'] = ', '.join(dis_lst)
        dict_index['др заболевания в анамнезе'] = dis_spans
        dict_symp['др заболевания в анамнезе'] = morph.parse(dict_symp['др заболевания в анамнезе'])[0].normal_form
            
    # Rules for detecting information about л/у
    LU_RULE = morph_pipeline(['лимфатические узлы', "лимфоузлы", "лу", "л/у"])
    parser = Parser(LU_RULE)
    lu_lst = []
    lu_spans = []
    for match in parser.findall(text):
        lu_lst.append((match.span, [_.value for _ in match.tokens]))
    if lu_lst:
        dict_symp['Лимфаденит'] = 0
        dict_index['Лимфаденит'] = lu_spans
        text_lu = text[match.span[1]-70:match.span[1]+70]
        TYPE = morph_pipeline(["болезненны", "болезненные", "болезнены"])
        parser = Parser(TYPE)
        lu_lst = []
        for match in parser.findall(text_lu):
            lu_lst.append((match.span, [_.value for _ in match.tokens]))
        if lu_lst:
            dict_symp['болезненность лимфоузлов'] = 1
            dict_index['болезненность лимфоузлов'] = match.span
            dict_symp['Лимфаденит'] = 1
        else:
            dict_symp['болезненность лимфоузлов'] = 0
            
        TYPE = morph_pipeline(['Увеличены', 'увеличенные'])
        parser = Parser(TYPE)
        lu_lst = []
        for match in parser.findall(text_lu):
            lu_lst.append((match.span, [_.value for _ in match.tokens]))
        if lu_lst:
            dict_symp['увеличенность лимфоузлов'] = 1
            dict_index['увеличенность лимфоузлов'] = match.span
            dict_symp['Лимфаденит'] = 1
        else:
            dict_symp['увеличенность лимфоузлов'] = 0
        
        number = and_(gte(0),lte(9))
        
        LU_SIZE_RULE = or_(rule(number,'.',number),
               rule(number,',',number))
        
        lu_lst = []
        lu_spans = []
        parser = Parser(LU_SIZE_RULE)
        for match in parser.findall(text_lu):
            lu_lst.append(''.join([_.value for _ in match.tokens]))
            lu_spans.append(match.span)
        if lu_lst:
            dict_symp['размер лимфоузлов'] = lu_lst[0]
            dict_index['размер лимфоузлов'] = lu_spans[0]

    # Rule for 'кем направлен'
    NAPR_RULE = morph_pipeline(['Поликлиника',"скорая помощь", "ск/помощь", 'СМП', "обратился"])

    napr = None
    napr_lst = []
    napr_spans = []
    parser = Parser(NAPR_RULE)
    for match in parser.findall(text):
        napr_lst.append(' '.join([_.value for _ in match.tokens]))
        napr_spans.append(match.span)
    if napr_lst:
        dict_index['кем направлен'] = napr_spans[0]
        napr = napr_lst[-1]
        napr = morph.parse(napr)[0].normal_form
    if napr == "обратиться":
        dict_symp['кем направлен'] = 3
    elif napr == "скорая помощь" or napr == "ск/помощь" or napr == 'смп'or napr == "ск / помощь" or napr == "скорой помощь" or napr == "скорую помощь":
        dict_symp['кем направлен'] = 1
    elif napr == "поликлиника":
        dict_symp['кем направлен'] = 2
        
    # Rule for allergy
    ALLERG_RULE = or_(rule(normalized('Аллергическая'),normalized('реакция'), normalized('на')),
                      rule(normalized('не'),normalized('переносит')))

    all_lst = []
    parser = Parser(ALLERG_RULE)
    for match in parser.findall(text):
        all_lst.append((match.span, [_.value for _ in match.tokens]))
    if all_lst:
        index = all_lst[0][0][1]
        dict_symp['аллергическая реакция'] = text[index:text[index:].find('.')+index]
        dict_index['аллергическая реакция'] = [all_lst[0][0][0], text[index:].find('.')+index]

    # Rules for different symptoms
    symptoms = [['озноб', 'познабливание'], 'слабость', ['вялость', 'разбитость'],'головная боль', 'нарушение сна', 
                'нарушение аппетита', 'ломота','тошнота', 'нарушение сознания','Судороги', 'Парестезии', ['эритема', 
                'эритематозная', 'эритематозно'], ['с четкими границами', 'границами четкими', 'четкими неровными краями',
                'с четкими краями', 'краями четкими' , 'четкими неровными краями', 'четкими контурами', 'языков пламени'], 
                ['валик', 'вал'], 'боль',['Гиперемия', 'гиперемирована'], 'Отек', 'Лимфангит', ['рана', "раневые ворота", 
                "входные ворота"],['клещ', "присасывание"], 'интоксикация', 'острое начало']
                
    for i in symptoms:
        sym_lst = []
        sym_spans = []
        if isinstance(i, str):
            SYM_RULE = morph_pipeline([i])
            parser = Parser(SYM_RULE)
            for match in parser.findall(text):
                sym_lst.append(' '.join([_.value for _ in match.tokens]))
                sym_spans.append(match.span)
            if sym_lst:
                dict_symp[i] = 1
                dict_index[i] = sym_spans[0]
            else:
                dict_symp[i] = 0
        else:
            SYM_RULE = morph_pipeline(i)
            parser = Parser(SYM_RULE)
            for match in parser.findall(text):
                sym_lst.append(' '.join([_.value for _ in match.tokens]))
                sym_spans.append(match.span)
            if sym_lst:
                dict_symp[i[0]] = 1
                dict_index[i[0]] = sym_spans[0]
            else:
                dict_symp[i[0]] = 0

    # This function is used for features which have the same rule
    def find_feature(feature, RULE, RULE2, space=(40, 40)):
        """Flag ``feature`` by checking a window around a keyword match.

        ``RULE`` is searched in the enclosing ``text``. If it matches, the
        LAST match is used as the anchor and its span is stored in
        ``dict_index[feature]``. ``RULE2`` is then searched in the slice of
        ``text`` that runs from ``space[0]`` characters before the end of the
        anchor to ``space[1]`` characters after it. ``dict_symp[feature]`` is
        set to 1 when ``RULE2`` matches inside that window and to 0 otherwise.
        When ``RULE`` does not match at all, neither dict is modified.

        Args:
            feature: key written into ``dict_symp`` / ``dict_index``.
            RULE: rule locating the anchor keyword in ``text``.
            RULE2: rule looked for inside the window around the anchor.
            space: two-item sequence ``(before, after)`` giving the window
                size relative to the end of the anchor match.

        Returns:
            None. Results are written to the enclosing ``dict_symp`` and
            ``dict_index``.
        """
        # Only the last anchor match matters, so just keep the final one.
        anchor = None
        for anchor in Parser(RULE).findall(text):
            pass
        if anchor is None:
            return

        dict_index[feature] = anchor.span
        anchor_end = anchor.span[1]
        # Clamp at 0: a negative start would make the slice wrap around and
        # read from the END of the text instead of its beginning.
        window_start = max(0, anchor_end - space[0])
        add_text = text[window_start:anchor_end + space[1]]

        detail = None
        for detail in Parser(RULE2).findall(add_text):
            pass
        if detail is None:
            dict_symp[feature] = 0
        else:
            dict_symp[feature] = 1
            # NOTE(review): this span is relative to `add_text` (the window),
            # not to `text`; kept as-is to match the rest of the function.
            dict_index[feature] = detail.span
    
    GEO_RULE = morph_pipeline(['географический', 'выезжал'])
    GEO_RULE2 = rule(not_(normalized('не')),normalized('выезжал'))
    geo_space = [40,40]
    
    COND_RULE = morph_pipeline(['бытовые'])
    COND_RULE2 = rule(not_(normalized('не')),normalized('удовлетворительные'))
    cond_space = [0,60]
    SEC_COND_RULE = morph_pipeline(['Социально-бытовые'])
    sec_cond_space = [0,60]
    
    WORK_COND_RULE = morph_pipeline(['условия труда'])
    work_cond_space = [20,20]
    
    CONTACT_RULE = morph_pipeline(['контакт'])
    CONTACT_RULE2 = morph_pipeline(['да'])
    contact_space = [0,40]
    
    WATER_RULE = morph_pipeline(['сырой воды'])
    WATER_RULE2 = morph_pipeline(['не было', 'отрицает', 'нет'])
    water_space = [80,80]

    features = ['географический анамнез', 'бытовые условия', 'бытовые условия',
               'условия труда','контакт с зараженными','пищевой анамнез']
    rules = [GEO_RULE, COND_RULE, SEC_COND_RULE, WORK_COND_RULE,
            CONTACT_RULE, WATER_RULE]
    sec_rules = [GEO_RULE2, COND_RULE2, COND_RULE2, COND_RULE2,
            CONTACT_RULE2, WATER_RULE2]
    spaces = [geo_space, cond_space, sec_cond_space, work_cond_space,
             contact_space, water_space]
    
    for i in range(len(features)):
        find_feature(features[i],rules[i],sec_rules[i],spaces[i])

    # Rules for bad habits
    HAB_RULE = morph_pipeline(['вредные привычки', 'алкоголь'])
    parser = Parser(HAB_RULE)
    hab_lst = []
    for match in parser.findall(text):
        hab_lst.append((match.span, [_.value for _ in match.tokens]))
    if hab_lst:
        dict_index['вредные привычки'] = match.span
        text_hab = text[match.span[1]-80:match.span[1]+80]
        HAB_RULE = morph_pipeline(['не было', 'отрицает', 'нет', 'не употребляет'])
        parser = Parser(HAB_RULE)
        hab_lst = []
        for match in parser.findall(text_hab):
            hab_lst.append((match.span, [_.value for _ in match.tokens]))
        if hab_lst:
            dict_symp['вредные привычки'] = 0
            dict_index['вредные привычки'] = match.span
        else:
            dict_symp['вредные привычки'] = 1

    SMOKE_RULE = or_(rule(not_(normalized('не')),normalized('курит')),
                     rule(not_(normalized('не')),normalized('употребляет')))
    parser = Parser(SMOKE_RULE)
    hab_lst = []
    for match in parser.findall(text):
        hab_lst.append((match.span, [_.value for _ in match.tokens]))
    if hab_lst:
        dict_symp['вредные привычки'] = 1
        dict_index['вредные привычки'] = match.span
    
    # Rules for work
    work_lst = []
    WORK_RULE = morph_pipeline(['работает'])
    parser = Parser(WORK_RULE)
    for match in parser.findall(text):
        work_lst.append((match.span, [_.value for _ in match.tokens]))
    if work_lst:
        dict_symp['соц категория'] = 0
        dict_index['соц категория'] = match.span

    WORK_RULE = rule(not_(normalized('не')),normalized('работает'))
    parser = Parser(WORK_RULE)
    work_lst = []
    for match in parser.findall(text):
        work_lst.append((match.span, [_.value for _ in match.tokens]))
    if work_lst:
        dict_symp['соц категория'] = 1
        dict_index['соц категория'] = match.span
    
    # If patient has условия труда probably he has a job
    if dict_symp['условия труда'] is not None:
        dict_symp['соц категория'] = 1
        
    # Rule for food
    FOOD_RULE = morph_pipeline(['питание'])
    parser = Parser(FOOD_RULE)
    food_lst = []
    for match in parser.findall(text):
        food_lst.append((match.span, [_.value for _ in match.tokens]))
    if food_lst:
        dict_index['избыточное питание'] = match.span
        text_food = text[match.span[1]-20:match.span[1]+20]
        FOOD_RULE = or_(rule(not_(normalized('не')),normalized('удовлетворительное')),
                        rule(not_(normalized('не')),normalized('полноценное')))
        parser = Parser(FOOD_RULE)
        food_lst = []
        for match in parser.findall(text_food):
            food_lst.append((match.span, [_.value for _ in match.tokens]))
        if food_lst:
            dict_symp['питание'] = 1
            dict_index['питание'] = match.span
        else:
            dict_symp['питание'] = 0

        FOOD_RULE = rule(not_(normalized('не')),normalized('избыточное'))
        parser = Parser(FOOD_RULE)
        food_lst = []
        for match in parser.findall(text_food):
            food_lst.append((match.span, [_.value for _ in match.tokens]))
        if food_lst:
            dict_index['избыточное питание'] = match.span
            dict_symp['избыточное питание'] = 1
        else:
            dict_symp['избыточное питание'] = 0
            
    # Rule for fish
    FISH_RULE = morph_pipeline(['рыба'])
    parser = Parser(FISH_RULE)
    fish_lst = []
    for match in parser.findall(text):
        fish_lst.append((match.span, [_.value for _ in match.tokens]))
    if fish_lst:
        dict_index['речная рыба'] = match.span
        text_fish = text[match.span[1]-40:match.span[1]+40]
        FISH_RULE = morph_pipeline(['да', 'постоянно'])
        parser = Parser(FISH_RULE)
        fish_lst = []
        for match in parser.findall(text_fish):
            fish_lst.append((match.span, [_.value for _ in match.tokens]))
        if fish_lst:
            dict_symp['речная рыба'] = 1
        FISH_RULE = rule(not_(normalized('не')),normalized('употребляет'))
        parser = Parser(FISH_RULE)
        fish_lst = []
        for match in parser.findall(text_fish):
            fish_lst.append((match.span, [_.value for _ in match.tokens]))
        if fish_lst:
            dict_symp['речная рыба'] = 0
            dict_index['речная рыба'] = match.span

    # Rule for home
    home = None
    home_span = None
    home_types = [['бездомный'],
                   ['дом благоустроенный', 'частный дом'],
                   ['дом не благоустроенный','дом неблагоустроенный'],
                   ['квартира не благоустроенная', 'квартира неблагоустроенная'],
                   ['квартира благоустроенная', 'благоустроенная квартира'],]

    for i in range(len(home_types)):
        home_lst = []
        HOME_RULE = morph_pipeline(home_types[i])
        parser = Parser(HOME_RULE)
        for match in parser.findall(text):
            home_lst.append((match.span, [_.value for _ in match.tokens]))
        if home_lst:
            home = i
            home_span = match.span

    dict_symp['квартира, дом'] = home
    dict_index['квартира, дом'] = home_span

    pets = []
    pets_span = []
    pet_types = [['кошка'],
                 ['собака'],
                 ['корова','коза']]

    # Rule for pets
    for i in range(len(pet_types)):
        pet_lst = []
        PET_RULE = morph_pipeline(pet_types[i])
        parser = Parser(PET_RULE)
        for match in parser.findall(text):
            pet_lst.append(' '.join([_.value for _ in match.tokens]))
            pets_span.append(match.span)
        if pet_lst:
            pets.append(i+1)

    if len(pets)>1:
        pets = 4
    elif pets:
        pets = pets[0]
    else:
        pets = 0
    dict_symp['домашние животные'] = pets
    dict_index['домашние животные'] = pets_span

    # Rules for different factors
    factors = []
    factors_span = []
    factor_types = [['ссадины',"царапины", "раны", "расчесы", "уколы", "потертости", "трещины", 'вскрытие'],
                   ['ушибы'],
                   ['переохлаждение','перегревание','смена температуры'],
                   ['инсоляция'],
                   ['стресс'],
                   ['переутомление']]

    def find_factors(factor_types):
        """Record which groups of factor keywords occur in ``text``.

        ``factor_types`` is a sequence of keyword lists. For every group
        with at least one match, its 1-based position is appended to the
        enclosing ``factors`` list. The span of every single match is
        appended to the enclosing ``factors_span`` list.
        """
        for group_no, keywords in enumerate(factor_types, start=1):
            group_parser = Parser(morph_pipeline(keywords))
            seen = False
            for hit in group_parser.findall(text):
                factors_span.append(hit.span)
                seen = True
            if seen:
                factors.append(group_no)
                
    find_factors(factor_types)
    if factors:
        dict_symp['провоцирущие факторы'] = factors
        dict_index['провоцирущие факторы'] = factors_span

    factors = []
    factors_span = []
    factor_types = [['микоз',"диабет", "ожирение", "варикоз", "недостаточность", "лимфостаз", "экзема"],
                   ['тонзилит',"отит", "синусит", "кариес", "пародонтоз", "остеомиелит", "тромбофлебит", "трофические язвы"],
                   ['резиновая обувь','загрязнения кожных'],
                   ['соматические заболевания']]

    find_factors(factor_types)
    if factors:
        dict_symp['предрасполагающие факторы'] = factors
        dict_index['предрасполагающие факторы'] = factors_span

    # Rule for detecting the second diagnosis
    DIAGNOZ_RULE = or_(rule(normalized('сопутствующий'), not_(or_(gram('NOUN')))),
                       rule(normalized('сопутствующий'),normalized('диагноз')),
                       rule(normalized('диагноз'),normalized('сопутствующий')),)

    parser = Parser(DIAGNOZ_RULE)
    diag_lst = []
    for match in parser.findall(text):
        diag_lst.append((match.span, [_.value for _ in match.tokens]))
    if diag_lst:
        dict_symp['сопутствующий диагноз'] = text[match.span[1]+2:match.span[1]+text[match.span[1]:].find(' \n  \n')]
        dict_index['сопутствующий диагноз'] = [match.span[1]+2,match.span[1]+text[match.span[1]:].find(' \n  \n')]
        dict_symp['кол-во сопут заболеваний'] = dict_symp['сопутствующий диагноз'].count('\n')
        if dict_symp['кол-во сопут заболеваний']==0: dict_symp['кол-во сопут заболеваний']=1

    # Rule for detecting the first diagnosis
    DIAGNOZ_RULE = or_(rule(normalized('диагноз'),normalized('при'),normalized('поступлении')),
                       rule(normalized('клинический'),normalized('диагноз')),
                       rule(normalized('диагноз'),normalized('клинический')),
                       rule(normalized('основной'),normalized('диагноз')),
                       rule(normalized('диагноз'),normalized('основной')),
                       rule(normalized('Ds')),
                       rule(normalized('Ds:')),
                       rule(not_(or_(gram('ADJF'),gram('NOUN'))),normalized('диагноз'),not_(or_(gram('ADJF'),gram('PREP')))))

    diag_lst = []
    parser = Parser(DIAGNOZ_RULE)
    for match in parser.findall(text):
        diag_lst.append((match.span, [_.value for _ in match.tokens]))
    last = match.span[1]+text[match.span[1]:].find(' \n  \n')
    if last == match.span[1]-1:
        last = len(text)-1
    dict_symp['основной диагноз'] = text[match.span[1]+1:last]
    dict_index['основной диагноз'] = [match.span[1]+1,last]

    # Rules for detecting ЛПТ and ППТ
    LEFT_RULE = morph_pipeline(['левая', 'слева'])
    parser = Parser(LEFT_RULE)
    side_lst = []
    for match in parser.findall(dict_symp['основной диагноз']):
        side_lst.append((match.span, [_.value for _ in match.tokens]))

    RIGHT_RULE = morph_pipeline(['правая', 'справа'])
    parser = Parser(RIGHT_RULE)
    for match in parser.findall(dict_symp['основной диагноз']):
        side_lst.append((match.span, [_.value for _ in match.tokens]))
    
    # If we don't have information about the side in 'основной диагноз', check the other diagnoses
    DIAGNOZ_RULE = or_(rule(normalized('Обоснование'),normalized('Диагноза')))
    part = dict_symp['основной диагноз']
    if len(side_lst) == 0:
        part = text[text.find('Диагноз'):]
        side_lst = []
        parser = Parser(DIAGNOZ_RULE)
        for match in parser.findall(part):
            side_lst.append((match.span, [_.value for _ in match.tokens]))
        last = match.span[1]+part[match.span[1]:].find(' \n  \n')
        if last == match.span[1]-1:
            last = len(part)-1
        explaining = part[match.span[1]+1:last]
        if len(explaining)>1:
            part = part.replace(explaining,' ')
    
    # If we don't have information about the side in the diagnosis, check 'Жалобы'
    DIAGNOZ_RULE = or_(rule(normalized('Жалобы')))
    comp_lst = []
    parser = Parser(DIAGNOZ_RULE)
    for match in parser.findall(text):
        comp_lst.append((match.span, [_.value for _ in match.tokens]))
    last = comp_lst[0][0][1]+text[comp_lst[0][0][1]:].find(' \n  \n')
    if last == comp_lst[0][0][1]-1:
        last = len(text)-1
    zhalobi = text[comp_lst[0][0][1]+1:last]
    
    rozha_types = [['волосистая часть головы', 'волостистой части головы'], ['лицо','щека','лоб','глаз'],
                   ['нос','губы'],['верняя часть туловища', 'верхняя конечность'],['нижняя часть туловища'],
                   ['пах', 'половые органы'],['верняя часть спины'],['нижняя часть спины'],
                   ['плечо'],['предплечье'],['кисть'],['бедро'],['голень'],['стопа'],['голеностоп'], ["ушная раковина"]]
    
    def find_side(parser, sidetext):
        """Return the codes of body zones mentioned in ``sidetext``.

        ``parser`` is first run over ``sidetext``; if it finds nothing, an
        empty list is returned. Otherwise every keyword group of the
        enclosing ``rozha_types`` is searched in ``sidetext``, and for each
        group with at least one match its 1-based position is collected.
        The group at index 15 is reported as the string ``'2.1'`` instead.
        """
        rozha = []
        if not list(parser.findall(sidetext)):
            return rozha

        for idx, keywords in enumerate(rozha_types):
            zone_parser = Parser(morph_pipeline(keywords))
            if list(zone_parser.findall(sidetext)):
                rozha.append('2.1' if idx == 15 else idx + 1)
        return rozha
    
    parser = Parser(LEFT_RULE)
    dict_symp['ЛПТ'] = find_side(parser, part)
    
    parser = Parser(RIGHT_RULE)
    dict_symp['ППТ'] = find_side(parser, part)
    
    if not dict_symp['ППТ'] and not dict_symp['ЛПТ']:
        parser = Parser(LEFT_RULE)
        dict_symp['ЛПТ'] = find_side(parser, zhalobi)
        
        parser = Parser(RIGHT_RULE)
        dict_symp['ППТ'] = find_side(parser, zhalobi)
        
    # Special rule for detecting face
    face_lst = []
    FACE_RULE = morph_pipeline(['нос','губы'])
    parser = Parser(FACE_RULE)
    for match in parser.findall(part):
        face_lst.append((match.span, [_.value for _ in match.tokens]))
    if face_lst:
        dict_symp['ППТ'].append(3)
        dict_symp['ЛПТ'].append(3)

    dict_symp['ЛПТ'] = list(set(dict_symp['ЛПТ']))
    dict_symp['ППТ'] = list(set(dict_symp['ППТ']))
    if not dict_symp['ППТ']: dict_symp['ППТ'] = None
    if not dict_symp['ЛПТ']: dict_symp['ЛПТ'] = None
        
    return dict_symp, dict_index
Beispiel #30
0
    ['name', 'type']
)
# Fact schemas; each one carries a `number` and a `type` attribute.
Building = fact('Building', ['number', 'type'])
Room = fact('Room', ['number', 'type'])


# Literal punctuation tokens.
DASH = eq('-')
DOT = eq('.')

# Grammeme / token-type / capitalisation predicates.
ADJF = gram('ADJF')
NOUN = gram('NOUN')
INT = type('INT')
TITLE = is_title()

# An INT token, an optional dash, then one of the listed endings
# (matched case-insensitively).
ANUM = rule(
    INT,
    DASH.optional(),
    in_caseless({'я', 'й', 'е', 'ое', 'ая', 'ий', 'ой'}),
)


#########
Beispiel #31
0
    'футболист',
    'баскетболист',

    'агроном',

    'президент',
    'сопрезидент',
    'вице-президент',
    'экс-президент',
    'председатель',
    'руководитель',
    'директор',
    'глава',
])

# Predicate for tokens carrying the 'gent' grammeme.
GENT = gram('gent')

# A run of one to five consecutive GENT tokens.
WHERE = or_(*(rule(*[GENT] * length) for length in range(1, 6)))

# Extend the previously defined POSITION: either the bare position or the
# position followed by a WHERE run. The match fills Person.position.
POSITION = or_(POSITION, rule(POSITION, WHERE)).interpretation(Person.position)
# Fill idx2syns from the dataframe rows: val[0] -> val[1] for every row, plus
# the pairs obtained by zipping the JSON lists decoded from val[2] and val[3].
for val in df.values:
    idx2syns[val[0]] = val[1]
    try:
        pidxs = json.loads(prestr(val[2]))
        # Keep only the part before the first comma of each decoded entry.
        concp = [el.split(",")[0] for el in json.loads(prestr(val[3]))]
        idx2syns.update(dict(zip(pidxs, concp)))
    except Exception:
        # Best-effort: show the offending cells and move on to the next row.
        # `except Exception` (not a bare `except`) so that KeyboardInterrupt
        # and SystemExit still propagate.
        print(prestr(val[2]))
        print(prestr(val[3]))

# In[ ]:

# In[4]:

# An optional leading ADJF-or-NOUN token followed by a mandatory NOUN token.
START = rule(
    or_(
        rule(gram('ADJF')),
        rule(gram('NOUN')),
    ).optional(),
    gram('NOUN'),
)

# Either of the two literal word forms.
START_S = or_(eq('такой'), eq('такие'))

# The literal token 'как'.
KAK = eq('как')
INCLUDING = or_(
    or_(
        eq('в'),
        eq('том'),
        eq('числе'),
    ),
    eq('включающий'),
    or_(
Beispiel #33
0
from yargy.predicates import (eq, in_, gram, normalized, caseless, type)

# Fact produced by MONEY: the matched amount and the matched currency.
Money = fact('Money', ['amount', 'currency'])

EURO = normalized('евро')

DOLLARS = or_(normalized('доллар'), eq('$'))

# Tokens normalising to 'рубль', or the abbreviations 'руб' / 'р'
# (case-insensitive) with an optional trailing dot.
RUBLES = or_(rule(normalized('рубль')),
             rule(or_(caseless('руб'), caseless('р')),
                  eq('.').optional()))

CURRENCY = or_(rule(EURO), rule(DOLLARS),
               RUBLES).interpretation(Money.currency)

# 'INT' is a token type, not a morphological grammeme, so it has to be tested
# with `type(...)` (as the other grammars in this file do) rather than
# `gram(...)`.
INT = type('INT')

# Integer part of an amount: one to three INT tokens, or INT groups
# separated by dots.
AMOUNT_ = or_(
    rule(INT),
    rule(INT, INT),
    rule(INT, INT, INT),
    rule(INT, '.', INT),
    rule(INT, '.', INT, '.', INT),
)

# AMOUNT_ followed by ',' or '.' and one more INT token.
# (The name is kept as-is because other code may refer to it.)
FRACTION_AMOUN = rule(AMOUNT_, in_({',', '.'}), INT)

AMOUNT = or_(AMOUNT_, FRACTION_AMOUN).interpretation(Money.amount)

# An amount immediately followed by a currency.
MONEY = rule(AMOUNT, CURRENCY).interpretation(Money)
Beispiel #34
0
# Dictionary lookups built from word lists defined outside this fragment.
IN_MAYBE_FIRST = dictionary(MAYBE_FIRST_DICT)
IN_LAST = dictionary(LAST_DICT)

# Relation object created once at module level.
gnc = gnc_relation()


########
#
#   FIRST
#
########


TITLE = is_capitalized()

NOUN = gram('NOUN')
NAME_CRF = tag('I')

ABBR = gram('Abbr')
SURN = gram('Surn')

# 'Name' / 'Patr' grammeme tokens that are not also tagged 'Abbr'.
NAME = and_(gram('Name'), not_(ABBR))
PATR = and_(gram('Patr'), not_(ABBR))

FIRST = and_(
    NAME_CRF,
              rule(in_('мМmM'))).interpretation(const(10**6))

# Spellings of the thousand multiplier; any of them is interpreted as the
# constant 10**3.
THOUSAND = or_(
    rule(caseless('т'), DOT),
    rule(caseless('к')),
    rule(caseless('k')),
    rule(caseless('тыс'), DOT.optional()),
    rule(normalized('тысяча')),
).interpretation(const(10**3))

# Whichever multiplier rule matched fills Money.multiplier.
MULTIPLIER = or_(MILLIARD, MILLION, THOUSAND).interpretation(Money.multiplier)

########
#
#  NUMERAL
#
#######

# The two dictionary words are added explicitly; see
# https://github.com/OpenCorpora/opencorpora/issues/818
NUMR = or_(gram('NUMR'), dictionary({'ноль', 'один'}))
# TODO: fractional parts could be dropped to reduce false positives; they
#  never occur in real salary ranges anyway.
#  Although one vacancy at Tampere University of Technology really did have
#  fractions.
MODIFIER = in_caseless({'целых', 'сотых', 'десятых'})

# One building block of a numeral written in words.
PART = or_(
    rule(or_(INT, NUMR, MODIFIER)),
    MILLIARD,
    MILLION,
    THOUSAND,
    CURRENCY,
    COINS_CURRENCY,
)
# TODO: this could be adjusted here so that phone numbers are not parsed.
BOUND = in_('()//')

# A BOUND token, one or more PARTs, and a closing BOUND token.
NUMERAL = rule(BOUND, PART.repeatable(), BOUND)

#######
Beispiel #36
0
    MILLION,
    THOUSAND
).interpretation(
    Money.multiplier
)


########
#
#  NUMERAL
#
#######


# The two dictionary words are added explicitly; see
# https://github.com/OpenCorpora/opencorpora/issues/818
NUMR = or_(gram('NUMR'), dictionary({'ноль', 'один'}))

# Fraction words, matched case-insensitively.
MODIFIER = in_caseless({'целых', 'сотых', 'десятых'})

PART = or_(
    rule(
morph = pymorphy2.MorphAnalyzer()

# Fact schemas used by the rules below.
LimbState = fact('LimbState', ['state'])
Limb = fact('limb', ['name'])

# NOTE(review): this fact is registered under the name 'Person' although it
# is bound to `Disease`; confirm whether that is intentional.
Disease = fact('Person', ['limb', 'limbstate'])

# A single VERB token, stored as LimbState.state.
LIMBSTATE = rule(
    gram('VERB').interpretation(LimbState.state),
).interpretation(LimbState)

# A single NOUN token, stored as Limb.name.
LIMB = rule(
    gram('NOUN').interpretation(Limb.name),
).interpretation(Limb)

DISEASE = or_(
    rule(
        LIMB.interpretation(Disease.limb),
        or_(
            gram('ADVB'),
            gram('PRTF'),
            gram('PRED'),
            gram('NPRO'),
        ).repeatable(max=5).optional(),
        LIMBSTATE.interpretation(Disease.limbstate)),
    rule(
        LIMBSTATE.interpretation(Disease.limbstate),
    'госкорпорация',
    'профорганизация',
    'стартап',
    'нотариальная контора',
    'букмекерская контора',
    'авиазавод',
    'автозавод',
    'винзавод',
    'подстанция',
    'гидроэлектростанция',
])

gnc = gnc_relation()

# One or more adjective items, each optionally followed by 'и' or ','.
# An item is either
#   * a single ADJF token, e.g. "международное" (international), or
#   * any token, a hyphen and an ADJF token,
#     e.g. "историко-просветительское" (historical-educational).
# The ADJF tokens are all tied to the same `gnc` relation.
ADJF_PREFIX = rule(
    or_(
        rule(gram('ADJF').match(gnc)),
        rule(true(), eq('-'), gram('ADJF').match(gnc)),
    ),
    or_(caseless('и'), eq(',')).optional(),
).repeatable()

case = case_relation()

# Zero or more 'gent' tokens tied to the same `case` relation.
GENT_GROUP = rule(gram('gent').match(case)).repeatable().optional()

QUOTED = rule(