Example #1
0
class Person(Enum):
    """Grammar rules describing how person names appear as token sequences.

    Every member's value is a list of dicts, one dict per position in the
    sequence.  The keys used in this class are ``'labels'`` (objects built
    by helpers such as ``gram`` or ``gnc_match``), ``'normalization'``,
    ``'interpretation'`` (the ``PersonObject.Attributes`` field the token
    is mapped to), ``'optional'`` and ``'repeatable'``.  How the keys are
    consumed is decided by the matching engine, which is not part of this
    class.

    Several members are assembled from slices of earlier members, so the
    order and length of the base lists (``InitialsAndLastname`` above all)
    must stay in sync with the slice indices used further down.
    """

    # Surname, first name, patronymic: «Иванов Иван Иванович»
    Full = [
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
    ]

    # First name, patronymic, surname: «Иван Иванович Иванов»
    FullReversed = [
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # First name, one or more further given names stored as the middle
    # name, then the surname: «Фелипе Родригес Фернандес»
    # https://www.englishelp.ru/business-english/other/284-patronymic-vs-middle-name.html
    FullReversedWithLatinMiddlename = [
        {
            'labels': [
                gram('Name'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Name'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'repeatable': True,
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
        {
            'labels': [
                gram('Surn'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # Two initials and a surname: «Л. А. Раневская»
    # Positions (relied upon by the slices below):
    #   0 first-name initial, 1 dot, 2 middle-name initial, 3 dot, 4 surname
    InitialsAndLastname = [
        {
            'labels': [
                gram_in(['Name', 'Abbr']),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram('PUNCT'),
                eq('.'),
            ],
        },
        {
            'labels': [
                gram_in(['Patr', 'Abbr']),
                gnc_match(0, solve_disambiguation=True),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram('PUNCT'),
                eq('.'),
            ],
        },
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(0, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # First name and surname: «Иван Иванов»
    FirstnameAndLastname = [
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # First name, a single quoted token stored as the nickname, surname.
    # NOTE(review): the labels of the three middle rules are sets, whereas
    # every other rule in this class uses a list -- confirm this is intended.
    FirstnameAndLastnameWithQuotedNickname = [
        FirstnameAndLastname[0],
        {
            'labels': {
                gram('QUOTE'),
                gram_any({
                    'G-QUOTE',
                    'L-QUOTE',
                }),
            },
            'normalization': NormalizationType.Original,
        },
        {
            'labels': {
                gram_not_in({
                    'QUOTE',
                    'PUNCT',
                }),
            },
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': PersonObject.Attributes.Nickname,
            },
        },
        {
            'labels': {
                gram('QUOTE'),
            },
            'normalization': NormalizationType.Original,
        },
        InitialsAndLastname[-1],
    ]

    # Surname and first name: «Иванов Иван»
    LastnameAndFirstname = [
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
    ]

    # First name, middle-name initial, surname: «Александр Ф. Скляр»
    # The tail starts at index 2 so that it contains the middle initial,
    # its dot and the surname; starting at 3 would leave only the dot and
    # the surname and could never match the example above.
    FullReversedWithMiddlenameAsInitials = (
        FullReversed[:1] + InitialsAndLastname[2:]
    )

    # Surname followed by two initials: «Раневская Л. А.»
    LastnameAndInitials = [
        LastnameAndFirstname[0],
    ] + InitialsAndLastname[:4]

    # Surname followed by one initial: «Раневская Л.»
    LastnameAndFirstnameAsInitials = [
        LastnameAndFirstname[0],
    ] + InitialsAndLastname[:2]

    # One initial followed by the surname: «Л. Раневская»
    FirstnameAsInitialsAndLastname = InitialsAndLastname[:2] + [
        InitialsAndLastname[-1],
    ]

    # First name and patronymic: «Иван Иванович»
    FirstnameAndMiddlename = [
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
    ]

    # A lone capitalized surname: «Иванов»
    Lastname = [
        {
            'labels': [
                gram('Surn'),
                gram_any({
                    'sing',
                    'Stgm',
                }),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # A lone capitalized patronymic: «Иванович»
    Middlename = [
        {
            'labels': [
                gram('Patr'),
                gram_any({
                    'sing',
                    'Stgm',
                }),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
    ]

    # A lone capitalized first name: «Иван»
    Firstname = [
        {
            'labels': [
                gram('Name'),
                gram_any({
                    'sing',
                    'Stgm',
                }),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
    ]

    # First name, nobility particle, surname: «Отто фон Бисмарк»
    # Both the particle and the surname are mapped to the Lastname attribute.
    FirstnameAndLastnameWithNobilityParticle = [
        FullReversed[0],
        {
            'labels': [
                dictionary(NAME_NOBILITY_PARTICLE_DICTIONARY),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(0, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # Position, optional qualifier tokens, then the name:
    # «Премьер-министр РФ Дмитрий Медведев»
    WithPosition = [
        {
            'labels': [
                gram('Person/Position'),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Descriptor,
            },
        },
        {
            # Qualifier between the position and the name: either a token
            # in one of the listed cases (or marked 'Fixd') that is not
            # itself a name part, or an abbreviation, or a Latin token.
            'labels': [
                or_((
                    and_((
                        or_((
                            gram_any({
                                'ablt',
                                'loct',
                                'gent',
                            }),
                            gram('Fixd'),
                        )),
                        gram_not_in({
                            'Name',
                            'Patr',
                            'Surn',
                        }),
                    )),
                    gram('Abbr'),
                    gram('LATN'),
                )),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram('Name'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Patr'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'optional': True,
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
        {
            'labels': [
                gram('Surn'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # Position, quoted organisation name, then the person's name:
    # «Пресс-секретарь «Роснефти» Михаил Леонтьев»
    WithPositionAndQuotedOrganisationName = (WithPosition[:2] + [
        {
            'labels': [
                gram('QUOTE'),
            ],
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram_not('QUOTE'),
                # Passed as a set, like every other gram_not_in call here;
                # a bare string would be walked character by character by
                # any helper that iterates over its argument.
                gram_not_in({'END-OF-LINE'}),
            ],
            'repeatable': True,
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram('QUOTE'),
            ],
            'normalization': NormalizationType.Original,
        },
    ] + WithPosition[2:])

    # Position, nobility particle, surname: «граф де Кристо»
    # Only the surname carries an interpretation.
    PositionAndNobilitySurname = [
        {
            'labels': [
                gram('Person/Position'),
            ],
        },
        {
            'labels': [
                dictionary(NAME_NOBILITY_PARTICLE_DICTIONARY),
            ],
        },
        {
            'labels': [
                gram('Surn'),
                gnc_match(0, solve_disambiguation=True),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # First name followed by an ordinal adjective or a Roman numeral:
    # «Генрих Восьмой» / «Карл XII»
    NameWithNumericPart = [
        {
            'labels': [
                gram_in({
                    'Name',
                    'sing',
                }),
                gram_not('Abbr'),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                or_((
                    and_((
                        gram_in({
                            'ADJF',
                            'Anum',
                        }),
                        gnc_match(-1, solve_disambiguation=True),
                    )),
                    gram('ROMN'),
                )),
            ],
        },
    ]
Example #2
0
}

SHORT_STREET_DESCRIPTOR_RULE = [
    {
        'labels': {
            or_((
                dictionary({
                    'ул',  # улица
                    'пр',  # проспект / проезд?
                    'проспа',  # проспект
                    'пр-том',  # see kmike/github issue #88
                    'площадь',  # площадь,
                    'пр-кт',  # проспект
                    'пр-далее',  # проезд
                    'б-литр',  # бульвар, #88
                    'б-р',  # бульвар
                    'бул',  # бульвар
                    'наб',  # набережная
                    'ш',  # шоссе
                    'тупой',  # тупик, #88
                    'дора',  # дорога, #88
                }),
                in_({
                    'пер',  # переулок, #88,
                    'н',  # набережная
                }),
            )),
        },
        'normalization': NormalizationType.Original,
        'interpretation': {
            'attribute': AddressObject.Attributes.Street_Descriptor,
        },
Example #3
0
class Organisation(Enum):
    """Grammar rules describing how organisation names appear in text.

    Every member's value is a list of dicts, one dict per position in the
    token sequence.  The keys used in this class are ``'labels'``,
    ``'normalization'``, ``'interpretation'`` (the
    ``OrganisationObject.Attributes`` field the token is mapped to),
    ``'optional'`` and ``'repeatable'``.  How the keys are consumed is
    decided by the matching engine, which is not part of this class.

    The ``...WithInitials`` / ``...WithLastname`` members extend a base rule
    with the module-level ``NAMED_ORG_INITIALS_PREFIX_RULE`` followed by
    ``NAMED_ORG_INITIALS_RULE`` or ``LASTNAME_GRAMMAR``.
    """

    # Descriptor, optional abbreviations, then a name enclosed in quotes.
    OfficialQuoted = [
        {
            'labels': [
                or_((
                    gram('Orgn/Commercial'),
                    gram('Orgn/Social'),
                    gram('Orgn/Abbr'),
                )),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                is_abbr(True),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                gram('QUOTE'),
            ],
        },
        {
            'labels': [
                gram_not('QUOTE'),
                # Passed as a set, like every other gram_not_in call here;
                # a bare string would be walked character by character by
                # any helper that iterates over its argument.
                gram_not_in({'END-OF-LINE'}),
                not_eq('.'),
            ],
            'repeatable': True,
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('QUOTE'),
            ],
        },
    ]

    # A single token tagged 'Abbr' and 'Orgn' but not 'Orgn/Abbr'.
    Abbr = [
        {
            'labels': [
                gram('Abbr'),
                gram('Orgn'),
                gram_not('Orgn/Abbr'),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
    ]

    # The literal token 'ИП' followed by the three rules of Person.Full.
    IndividualEntrepreneur = [
        {
            'labels': [
                eq('ИП'),
            ],
            'normalization': NormalizationType.Original,
        },
        Person.Full.value[0],
        Person.Full.value[1],
        Person.Full.value[2],
    ]

    # Commercial descriptor followed by Latin and/or numeric name tokens.
    SimpleLatin = [
        {
            'labels': [
                gram('Orgn/Commercial'),
            ],
            'normalization': NormalizationType.Normalized,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                gram_any({
                    'LATN',
                    'NUMBER',
                }),
            ],
            'repeatable': True,
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
    ]

    # Capitalized adjective, optional further adjectives, educational
    # descriptor, optional trailing tokens:
    # «Санкт-Петербургский Государственный университет»
    Educational = [
        {
            'labels': [
                gram('ADJF'),
                is_capitalized(True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('ADJF'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('Orgn/Educational'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Normalized,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            # Trailing name tokens; 'имя' is excluded by dictionary_not.
            'labels': [
                gram_any({
                    'ablt',
                    'gent',
                }),
                gram_not_in({
                    'PREP',
                }),
                dictionary_not({
                    'имя',
                }),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
            'optional': True,
            'repeatable': True,
        },
    ]

    # Named after a person given with initials:
    # «Публичная библиотека имени М. Е. Салтыкова-Щедрина»
    EducationalWithInitials = (
        Educational + NAMED_ORG_INITIALS_PREFIX_RULE + NAMED_ORG_INITIALS_RULE
    )
    # Named after a person given by surname only:
    # «Публичная библиотека имени Салтыкова-Щедрина»
    EducationalWithLastname = (
        Educational + NAMED_ORG_INITIALS_PREFIX_RULE + LASTNAME_GRAMMAR
    )

    # Same shape as Educational but with a commercial descriptor:
    # «Кировский завод»
    AdjCommercial = Educational[:2] + [
        {
            'labels': [
                gram('Orgn/Commercial'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Normalized,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        Educational[-1],
    ]

    AdjCommercialWithInitials = (
        AdjCommercial
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + NAMED_ORG_INITIALS_RULE
    )
    AdjCommercialWithLastname = (
        AdjCommercial + NAMED_ORG_INITIALS_PREFIX_RULE + LASTNAME_GRAMMAR
    )

    # Social descriptor followed by name tokens:
    # «Общества андрологии и сексуальной медицины»
    Social = [
        {
            'labels': [
                gram('Orgn/Social'),
                gram('sing'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                gram_not_in({
                    'PREP',
                    'CONJ',
                }),
                gram_any({
                    'accs',
                    'datv',
                    'gent',
                }),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram_any({
                    'gent',
                    'accs',
                    'ablt',
                }),
                gram_not_in({
                    'PREP',
                    'Name',
                    'Patr',
                    'Surn',
                }),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
    ]

    SocialWithInitials = (
        Social + NAMED_ORG_INITIALS_PREFIX_RULE + NAMED_ORG_INITIALS_RULE
    )
    SocialWithLastname = (
        Social + NAMED_ORG_INITIALS_PREFIX_RULE + LASTNAME_GRAMMAR
    )

    # Adjective(s), social descriptor, then the trailing rule of Social.
    # NOTE(review): the descriptor rule below has no 'normalization' key,
    # unlike its counterparts in Educational and AdjCommercial -- confirm.
    AdjSocial = [
        {
            'labels': [
                gram('ADJF'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('ADJF'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('Orgn/Social'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        Social[-1],
    ]

    AdjSocialWithInitials = (
        AdjSocial + NAMED_ORG_INITIALS_PREFIX_RULE + NAMED_ORG_INITIALS_RULE
    )
    AdjSocialWithLastname = (
        AdjSocial + NAMED_ORG_INITIALS_PREFIX_RULE + LASTNAME_GRAMMAR
    )