Beispiel #1
0
# coding: utf-8
from __future__ import unicode_literals
from yargy import (rule, fact, not_, and_, or_, attribute,)
from yargy.predicates import (gram, caseless, normalized, is_title, dictionary, custom,)
#from yargy.relations import (gnc_relation, case_relation,)

### 1 - FACT AND DICTIONARY SETUP
# DATERELATIVE_DICT comes from natasha; presumably it holds relative-date
# expressions -- TODO confirm its exact type against the natasha module.
from natasha.dictionaries.daterelative import DATERELATIVE_DICT

# Fact type with a single attribute, ``name``.
DateRelative = fact(
    'DateRelative',
    ['name'],
)
###

### 2 - GRAM PREDICATES (pymorphy2)
ADJF = gram('ADJF')  # 'ADJF' gram
NOUN = gram('NOUN')  # 'NOUN' gram
# NOTE(review): presumably matches integer tokens -- confirm that 'INT' is
# accepted by gram() in the installed yargy version.
INT = gram('INT')
TITLE = is_title()  # title-case predicate

###


### 1-ST RING RULES
# Rule built from DATERELATIVE_DICT alone and marked repeatable.
R1_SIMPLE = rule(DATERELATIVE_DICT).repeatable()
###


### 2-ND RING RULES

###
    
Beispiel #2
0
    'Building',
    ['number', 'type']
)
# Fact type for a room, with ``number`` and ``type`` attributes.
Room = fact('Room', ['number', 'type'])


# Punctuation tokens, matched by exact value.
DASH = eq('-')
DOT = eq('.')

# Gram predicates.
ADJF = gram('ADJF')
NOUN = gram('NOUN')
# NOTE(review): ``type`` is presumably yargy's token-type predicate imported
# above this view (it shadows the builtin) -- confirm against the imports.
INT = type('INT')
TITLE = is_title()

# INT token, an optional dash, then one of the listed endings
# (compared case-insensitively).
ANUM = rule(
    INT,
    DASH.optional(),
    in_caseless({'я', 'й', 'е', 'ое', 'ая', 'ий', 'ой'}),
)


#########
#
#  STRANA (COUNTRY)
#
Beispiel #3
0
##########
#
#  COMPONENTS
#
###########

# --- first name -----------------------------------------------------------

# Matches tokens found in FIRST_DICT.
IS_FIRST = dictionary(FIRST_DICT)

# Either tagged 'Name' without 'Abbr', or found in MAYBE_FIRST_DICT.
MAYBE_FIRST = or_(
    # 'Abbr' is excluded; original example: "А. Леонидов"
    and_(gram('Name'), not_(gram('Abbr'))),
    dictionary(MAYBE_FIRST_DICT),
)

# A (possible) first name that is also title-cased.
TITLE_FIRST = and_(
    or_(IS_FIRST, MAYBE_FIRST),
    is_title(),
)

# Title-cased token of length 1.
TITLE_FIRST_ABBR = and_(
    length_eq(1),
    is_title(),
)

# --- middle name ----------------------------------------------------------

# Title-cased token tagged 'Patr' and not 'Abbr'.
# The 'Abbr' exclusion covers cases like "Фил О’Рейли", where "О" is tagged Patr.
TITLE_MIDDLE = and_(gram('Patr'), not_(gram('Abbr')), is_title())

# Title-cased token of length 1 (same predicate shape as TITLE_FIRST_ABBR).
TITLE_MIDDLE_ABBR = and_(
    length_eq(1),
    is_title(),
)

# --- last name ------------------------------------------------------------

# Matches tokens found in LAST_DICT.
IS_LAST = dictionary(LAST_DICT)

# Either tagged 'Surn' or found in MAYBE_LAST_DICT.
MAYBE_LAST = or_(
    gram('Surn'),
    dictionary(MAYBE_LAST_DICT),
)

# A (possible) last name that is also title-cased.
TITLE_LAST = and_(
    or_(IS_LAST, MAYBE_LAST),
    is_title(),
)
Beispiel #4
0
# Fact types for the address grammar. Each call lists the fact's attributes;
# Address holds a single repeatable ``parts`` attribute.
Address = fact(
    'Address',
    [attribute('parts').repeatable()],
)
Index = fact(
    'Index',
    ['value'],
)
Country = fact(
    'Country',
    ['name'],
)
Region = fact(
    'Region',
    ['name', 'type'],
)
Settlement = fact(
    'Settlement',
    ['name', 'type'],
)
Street = fact(
    'Street',
    ['name', 'type'],
)
Building = fact(
    'Building',
    ['number', 'type'],
)
Room = fact(
    'Room',
    ['number', 'type'],
)

# Punctuation tokens, matched by exact value.
DASH = eq('-')
DOT = eq('.')

# Gram predicates.
ADJF = gram('ADJF')
NOUN = gram('NOUN')
# NOTE(review): presumably matches integer tokens -- confirm that 'INT' is
# accepted by gram() in the installed yargy version.
INT = gram('INT')
TITLE = is_title()

# INT token, an optional dash, then one of the listed endings
# (compared case-insensitively).
ANUM = rule(
    INT,
    DASH.optional(),
    in_caseless({'я', 'й', 'е', 'ое', 'ая', 'ий', 'ой'}),
)

#########
#
#  COUNTRY (STRANA)
#
##########

# TODO
# Country names matched through a dictionary predicate.
COUNTRY_VALUE = dictionary({'россия', 'украина'})

# Abbreviated country name, compared case-insensitively.
ABBR_COUNTRY_VALUE = in_caseless({'рф'})
    documents = []
    for filename in os.listdir(baseDir):
        if filename.endswith('.docx'):
            documents.append(filename)
    return documents


from yargy.tokenizer import TokenRule
from yargy.tokenizer import Tokenizer

# Token rule named 'Code': three two-digit groups with a one-character
# separator between them, not followed by another digit.
# The pattern is a raw string so that ``\d`` is passed to the regex engine
# verbatim; in a plain literal it is an invalid escape sequence that newer
# Python versions warn about. The resulting string value is unchanged.
# NOTE(review): the '.' separators are unescaped, so they match ANY character
# (e.g. "09-03-01" as well as "09.03.01"). If only literal dots are intended,
# use r'\d{2}\.\d{2}\.\d{2}(?!\d)' -- confirm against the real documents.
CODE_RULE = TokenRule('Code', r'\d{2}.\d{2}.\d{2}(?!\d)')

# Tokenizer with the listed token types removed and the 'Code' rule added.
# NOTE(review): ``tokenizer`` is not passed to the Parser below in the code
# visible here -- verify whether that is intended.
tokenizer = Tokenizer()
tokenizer.remove_types('EOL', 'LATIN', 'RU', 'INT', 'PUNCT', 'OTHER')
tokenizer.add_rules(CODE_RULE)

# Title-cased token matching 'рабочая', followed by a token matching
# 'программа'.
isRPD = rule(and_(dictionary({'рабочая'}), is_title()),
             dictionary({'программа'}))

# Single token matching 'дисциплина' (not used in the code visible here).
isRPD2 = rule(dictionary({'дисциплина'}))

# Parser built from the isRPD rule.
rpdRule = Parser(isRPD)

# Usage example:
# print(rpdRule.find('Рабочая программа дисциплины'))

# Folder that GetDocuments() scans for .docx files.
baseDir = r'C:\Users\Katia\Desktop\рпд'
documents = GetDocuments()
# Open the first document found.
# NOTE(review): raises IndexError when the folder has no .docx files.
path = baseDir + '\\' + documents[0]
print(path)
document = Document(path)