class Person(Enum):
    """Grammar rules for recognising person names.

    Every member value is a list of rule dicts.  Each dict has a
    ``'labels'`` collection of predicates built by helpers such as
    ``gram``, ``gram_not`` and ``gnc_match``, and may additionally carry
    ``'normalization'``, ``'interpretation'``, ``'optional'`` and
    ``'repeatable'`` keys.  The example comments suggest one dict per
    token position; the exact matching semantics live in the parser that
    consumes these rules (not visible here).

    Later members reuse earlier ones by indexing/slicing; inside the class
    body those names are still plain lists, so no ``.value`` is needed.
    """

    # "Ivanov Ivan Ivanovich": surname, first name, patronymic
    Full = [
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
    ]

    # "Ivan Ivanovich Ivanov": first name, patronymic, surname
    FullReversed = [
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # "Felipe Rodriguez Fernandez": first name, one or more middle names
    # tagged as names, then surname
    # https://www.englishelp.ru/business-english/other/284-patronymic-vs-middle-name.html
    FullReversedWithLatinMiddlename = [
        {
            'labels': [
                gram('Name'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Name'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'repeatable': True,
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
        {
            'labels': [
                gram('Surn'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # "L. A. Ranevskaya": initial, dot, initial, dot, surname
    InitialsAndLastname = [
        {
            'labels': [
                gram_in(['Name', 'Abbr']),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram('PUNCT'),
                eq('.'),
            ],
        },
        {
            'labels': [
                gram_in(['Patr', 'Abbr']),
                gnc_match(0, solve_disambiguation=True),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram('PUNCT'),
                eq('.'),
            ],
        },
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(0, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # "Ivan Ivanov": first name, surname
    FirstnameAndLastname = [
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # First name, quoted nickname, surname.
    # 'labels' are lists here, as in every other rule of this class, so the
    # predicates keep a deterministic order.
    FirstnameAndLastnameWithQuotedNickname = [
        FirstnameAndLastname[0],
        {
            'labels': [
                gram('QUOTE'),
                gram_any({
                    'G-QUOTE',
                    'L-QUOTE',
                }),
            ],
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram_not_in({
                    'QUOTE',
                    'PUNCT',
                }),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': PersonObject.Attributes.Nickname,
            },
        },
        {
            'labels': [
                gram('QUOTE'),
            ],
            'normalization': NormalizationType.Original,
        },
        InitialsAndLastname[-1],
    ]

    # "Ivanov Ivan": surname, first name
    LastnameAndFirstname = [
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
    ]

    # "Alexander F. Sklyar": first name, middle initial, dot, surname.
    # The slice starts at index 2 (the middle-initial rule); starting at 3
    # would drop the initial and leave only the dot and the surname.
    FullReversedWithMiddlenameAsInitials = (
        FullReversed[:1] + InitialsAndLastname[2:]
    )

    # "Ranevskaya L. A.": surname, initial, dot, initial, dot
    LastnameAndInitials = [
        LastnameAndFirstname[0],
    ] + InitialsAndLastname[:4]

    # "Ranevskaya L.": surname, initial, dot
    LastnameAndFirstnameAsInitials = [
        LastnameAndFirstname[0],
    ] + InitialsAndLastname[:2]

    # "L. Ranevskaya": initial, dot, surname
    FirstnameAsInitialsAndLastname = InitialsAndLastname[:2] + [
        InitialsAndLastname[-1],
    ]

    # "Ivan Ivanovich": first name, patronymic
    FirstnameAndMiddlename = [
        {
            'labels': [
                gram('Name'),
                gram_not('Abbr'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
    ]

    # "Ivanov": a single capitalized surname
    Lastname = [
        {
            'labels': [
                gram('Surn'),
                gram_any({
                    'sing',
                    'Stgm',
                }),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # "Ivanovich": a single capitalized patronymic
    Middlename = [
        {
            'labels': [
                gram('Patr'),
                gram_any({
                    'sing',
                    'Stgm',
                }),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
    ]

    # "Ivan": a single capitalized first name
    Firstname = [
        {
            'labels': [
                gram('Name'),
                gram_any({
                    'sing',
                    'Stgm',
                }),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
    ]

    # "Otto von Bismarck": first name, nobility particle, surname.
    # The particle is interpreted as part of the last name.
    FirstnameAndLastnameWithNobilityParticle = [
        FullReversed[0],
        {
            'labels': [
                dictionary(NAME_NOBILITY_PARTICLE_DICTIONARY),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
        {
            'labels': [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(0, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # "Prime Minister of the RF Dmitry Medvedev": position, optional
    # qualifying words, first name, optional patronymic, surname
    WithPosition = [
        {
            'labels': [
                gram('Person/Position'),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                or_((
                    and_((
                        or_((
                            gram_any({
                                'ablt',
                                'loct',
                                'gent',
                            }),
                            gram('Fixd'),
                        )),
                        gram_not_in({
                            'Name',
                            'Patr',
                            'Surn',
                        }),
                    )),
                    gram('Abbr'),
                    gram('LATN'),
                )),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Original,
        },
        {
            'labels': [
                gram('Name'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                gram('Patr'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'optional': True,
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Middlename,
            },
        },
        {
            'labels': [
                gram('Surn'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # "Press secretary of <<Rosneft>> Mikhail Leontyev": the position rules,
    # a quoted organisation name, then the name rules of WithPosition
    WithPositionAndQuotedOrganisationName = (
        WithPosition[:2]
        + [
            {
                'labels': [
                    gram('QUOTE'),
                ],
                'normalization': NormalizationType.Original,
            },
            {
                'labels': [
                    gram_not('QUOTE'),
                    # gram_not_in receives a collection everywhere else in
                    # this class; pass a one-element set, not a bare string.
                    gram_not_in({
                        'END-OF-LINE',
                    }),
                ],
                'repeatable': True,
                'normalization': NormalizationType.Original,
            },
            {
                'labels': [
                    gram('QUOTE'),
                ],
                'normalization': NormalizationType.Original,
            },
        ]
        + WithPosition[2:]
    )

    # "Count de Cristo": position, nobility particle, surname
    PositionAndNobilitySurname = [
        {
            'labels': [
                gram('Person/Position'),
            ],
        },
        {
            'labels': [
                dictionary(NAME_NOBILITY_PARTICLE_DICTIONARY),
            ],
        },
        {
            'labels': [
                gram('Surn'),
                gnc_match(0, solve_disambiguation=True),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Lastname,
            },
        },
    ]

    # "Henry the Eighth" / "Charles XII": name followed by an ordinal
    # adjective or a Roman numeral
    NameWithNumericPart = [
        {
            'labels': [
                gram_in({
                    'Name',
                    'sing',
                }),
                gram_not('Abbr'),
            ],
            'interpretation': {
                'attribute': PersonObject.Attributes.Firstname,
            },
        },
        {
            'labels': [
                or_((
                    and_((
                        gram_in({
                            'ADJF',
                            'Anum',
                        }),
                        gnc_match(-1, solve_disambiguation=True),
                    )),
                    gram('ROMN'),
                )),
            ],
        },
    ]
} SHORT_STREET_DESCRIPTOR_RULE = [ { 'labels': { or_(( dictionary({ 'ул', # улица 'пр', # проспект / проезд? 'проспа', # проспект 'пр-том', # see kmike/github issue #88 'площадь', # площадь, 'пр-кт', # проспект 'пр-далее', # проезд 'б-литр', # бульвар, #88 'б-р', # бульвар 'бул', # бульвар 'наб', # набережная 'ш', # шоссе 'тупой', # тупик, #88 'дора', # дорога, #88 }), in_({ 'пер', # переулок, #88, 'н', # набережная }), )), }, 'normalization': NormalizationType.Original, 'interpretation': { 'attribute': AddressObject.Attributes.Street_Descriptor, },
class Organisation(Enum):
    """Grammar rules for recognising organisation names.

    Every member value is a list of rule dicts.  Each dict has a
    ``'labels'`` list of predicates built by helpers such as ``gram``,
    ``gram_any`` and ``gnc_match``, and may additionally carry
    ``'normalization'``, ``'interpretation'``, ``'optional'`` and
    ``'repeatable'`` keys.  The exact matching semantics live in the
    parser that consumes these rules (not visible here).

    Several members are composed from earlier ones and from the
    module-level ``NAMED_ORG_INITIALS_PREFIX_RULE``,
    ``NAMED_ORG_INITIALS_RULE`` and ``LASTNAME_GRAMMAR`` lists.
    """

    # Organisation-type descriptor, optional abbreviations, then a quoted
    # name
    OfficialQuoted = [
        {
            'labels': [
                or_((
                    gram('Orgn/Commercial'),
                    gram('Orgn/Social'),
                    gram('Orgn/Abbr'),
                )),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                is_abbr(True),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                gram('QUOTE'),
            ],
        },
        {
            'labels': [
                gram_not('QUOTE'),
                # gram_not_in receives a set everywhere else in this class;
                # pass a one-element set, not a bare string.
                gram_not_in({
                    'END-OF-LINE',
                }),
                not_eq('.'),
            ],
            'repeatable': True,
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('QUOTE'),
            ],
        },
    ]

    # A single abbreviation tagged 'Orgn' but not 'Orgn/Abbr'
    Abbr = [
        {
            'labels': [
                gram('Abbr'),
                gram('Orgn'),
                gram_not('Orgn/Abbr'),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
    ]

    # The literal token 'ИП' followed by the three rules of Person.Full.
    # Person is a finished Enum here, so its list is reached via .value.
    IndividualEntrepreneur = [
        {
            'labels': [
                eq('ИП'),
            ],
            'normalization': NormalizationType.Original,
        },
        Person.Full.value[0],
        Person.Full.value[1],
        Person.Full.value[2],
    ]

    # Commercial descriptor followed by Latin-script / numeric tokens
    SimpleLatin = [
        {
            'labels': [
                gram('Orgn/Commercial'),
            ],
            'normalization': NormalizationType.Normalized,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                gram_any({
                    'LATN',
                    'NUMBER',
                }),
            ],
            'repeatable': True,
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
    ]

    # "Saint Petersburg State University": capitalized adjective, optional
    # further adjectives, educational descriptor, optional trailing words
    Educational = [
        {
            'labels': [
                gram('ADJF'),
                is_capitalized(True),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('ADJF'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('Orgn/Educational'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Normalized,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                gram_any({
                    'ablt',
                    'gent',
                }),
                gram_not_in({
                    'PREP',
                }),
                dictionary_not({
                    'имя',
                }),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
            'optional': True,
            'repeatable': True,
        },
    ]

    # "Public Library named after M. E. Saltykov-Shchedrin"
    EducationalWithInitials = (
        Educational
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + NAMED_ORG_INITIALS_RULE
    )

    # "Public Library named after Saltykov-Shchedrin"
    EducationalWithLastname = (
        Educational
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + LASTNAME_GRAMMAR
    )

    # "Kirov Plant": the adjective rules of Educational, a commercial
    # descriptor, then Educational's trailing-words rule
    AdjCommercial = Educational[:2] + [
        {
            'labels': [
                gram('Orgn/Commercial'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'normalization': NormalizationType.Normalized,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        Educational[-1],
    ]

    AdjCommercialWithInitials = (
        AdjCommercial
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + NAMED_ORG_INITIALS_RULE
    )

    AdjCommercialWithLastname = (
        AdjCommercial
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + LASTNAME_GRAMMAR
    )

    # "Society of andrology and sexual medicine": social descriptor
    # followed by one or more name words
    Social = [
        {
            'labels': [
                gram('Orgn/Social'),
                gram('sing'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        {
            'labels': [
                gram_not_in({
                    'PREP',
                    'CONJ',
                }),
                gram_any({
                    'accs',
                    'datv',
                    'gent',
                }),
            ],
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram_any({
                    'gent',
                    'accs',
                    'ablt',
                }),
                gram_not_in({
                    'PREP',
                    'Name',
                    'Patr',
                    'Surn',
                }),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Original,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
    ]

    SocialWithInitials = (
        Social
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + NAMED_ORG_INITIALS_RULE
    )

    SocialWithLastname = (
        Social
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + LASTNAME_GRAMMAR
    )

    # Adjective(s), social descriptor, then Social's trailing-words rule.
    # NOTE(review): the descriptor rule has no 'normalization' key, unlike
    # the descriptor rules in Educational and AdjCommercial; confirm
    # whether that is intentional.
    AdjSocial = [
        {
            'labels': [
                gram('ADJF'),
            ],
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('ADJF'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'optional': True,
            'repeatable': True,
            'normalization': NormalizationType.Inflected,
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Name,
            },
        },
        {
            'labels': [
                gram('Orgn/Social'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            'interpretation': {
                'attribute': OrganisationObject.Attributes.Descriptor,
            },
        },
        Social[-1],
    ]

    AdjSocialWithInitials = (
        AdjSocial
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + NAMED_ORG_INITIALS_RULE
    )

    AdjSocialWithLastname = (
        AdjSocial
        + NAMED_ORG_INITIALS_PREFIX_RULE
        + LASTNAME_GRAMMAR
    )