def _address_token(labels, attribute=None, normalization=None,
                   optional=False, repeatable=False):
    """Build one grammar-token dict for the address grammars.

    Only the keys that were explicitly requested are added, so a token
    built here has exactly the same keys as a hand-written dict literal.
    """
    token = {'labels': labels}
    if optional:
        token['optional'] = True
    if repeatable:
        token['repeatable'] = True
    if normalization is not None:
        token['normalization'] = normalization
    if attribute is not None:
        token['interpretation'] = {'attribute': attribute}
    return token


class Address(Enum):
    """Street grammars, optionally followed by a house number and a house letter.

    Every member value is a list of token dicts; most members are assembled
    by slicing and concatenating the members defined above them.
    """

    # Adjective name followed by a full descriptor: "Садовая улица"
    AdjFull = [
        _address_token(
            [gram('ADJF'), gram_not('Abbr')],
            attribute=AddressObject.Attributes.Street_Name,
            normalization=NormalizationType.Inflected,
        ),
        _address_token(
            [
                gram('ADJF'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=AddressObject.Attributes.Street_Name,
            normalization=NormalizationType.Inflected,
            optional=True,
            repeatable=True,
        ),
        _address_token(
            [
                dictionary(STREET_DESCRIPTOR_DICTIONARY),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=AddressObject.Attributes.Street_Descriptor,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Full descriptor followed by an adjective name: "улица Садовая"
    AdjFullReversed = [
        _address_token(
            [dictionary(STREET_DESCRIPTOR_DICTIONARY)],
            attribute=AddressObject.Attributes.Street_Descriptor,
            normalization=NormalizationType.Inflected,
        ),
        _address_token(
            [
                gram('ADJF'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=AddressObject.Attributes.Street_Name,
            normalization=NormalizationType.Inflected,
            repeatable=True,
        ),
    ]

    # Short descriptor followed by an adjective name: "ул. Садовая"
    AdjShort = SHORT_STREET_DESCRIPTOR_RULE + AdjFull[:2]

    # Adjective name followed by a short descriptor: "Садовая ул."
    AdjShortReversed = AdjFull[:2] + SHORT_STREET_DESCRIPTOR_RULE

    # Full descriptor, adjectives, then genitive words:
    # "улица Красных Десантников"
    AdjNounFull = [AdjFullReversed[0]] + AdjFull[:2] + [
        _address_token(
            [
                gram('gent'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=AddressObject.Attributes.Street_Name,
            normalization=NormalizationType.Inflected,
            repeatable=True,
        ),
    ]

    # Short-descriptor variant of the above: "ул. Красных Десантников"
    AdjNounShort = AdjShort + [AdjNounFull[-1]]

    # Full descriptor followed by genitive words: "улица Карла Маркса"
    GentFullReversed = [
        AdjFullReversed[0],
        _address_token(
            [gram('gent'), gram_not('Abbr')],
            attribute=AddressObject.Attributes.Street_Name,
            normalization=NormalizationType.Original,
        ),
        _address_token(
            [
                gram('gent'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=AddressObject.Attributes.Street_Name,
            normalization=NormalizationType.Original,
            optional=True,
            repeatable=True,
        ),
    ]

    # Full descriptor, one abbreviated initial, genitive words:
    # "улица К. Маркса"
    GentFullReversedWithShortcut = [
        GentFullReversed[0],
        _address_token(
            [gram('Abbr')],
            attribute=AddressObject.Attributes.Street_Name,
            normalization=NormalizationType.Original,
        ),
        _address_token(
            [eq('.')],
            normalization=NormalizationType.Original,
        ),
    ] + GentFullReversed[1:]

    # Same with the initial + dot pair repeated: "улица В. В. Ленина"
    GentFullReversedWithExtendedShortcut = (
        GentFullReversedWithShortcut[:3]
        + GentFullReversedWithShortcut[1:3]
        + GentFullReversedWithShortcut[3:]
    )

    # Short descriptor followed by genitive words: "пр. Маршала жукова"
    GentShortReversed = SHORT_STREET_DESCRIPTOR_RULE + GentFullReversed[1:]

    # Short descriptor, one initial, genitive words: "пр. К. Маркса"
    GentShortReversedWithShortcut = (
        SHORT_STREET_DESCRIPTOR_RULE + GentFullReversedWithShortcut[1:]
    )

    # Short descriptor, two initials, genitive words: "пл. В. В. Ленина"
    GentShortReversedWithExtendedShortcut = (
        SHORT_STREET_DESCRIPTOR_RULE + GentFullReversedWithExtendedShortcut[1:]
    )

    # Genitive words followed by a full descriptor: "Николая Ершова улица"
    GentFull = GentFullReversed[1:] + GentFullReversed[:1]

    # Genitive words followed by a short descriptor:
    # "Обуховской Обороны пр-кт"
    GentShort = GentShortReversed[2:] + SHORT_STREET_DESCRIPTOR_RULE

    # Numeric part, adjective name, full descriptor:
    # "1-я новорублевская улица"
    AdjFullWithNumericPart = NUMERIC_STREET_PART_RULE + AdjFull

    # Full descriptor, numeric part, adjective name:
    # "улица 1-я новорублевская"
    AdjFullReversedWithNumericPart = (
        AdjFullReversed[:1] + AdjFullWithNumericPart[:-1]
    )

    # Numeric part, adjective name, short descriptor:
    # "1-я новорублевская ул."
    AdjShortWithNumericPart = (
        AdjFullWithNumericPart[:-1] + SHORT_STREET_DESCRIPTOR_RULE
    )

    # Short descriptor, numeric part, adjective name: "ул. 1-я промышленная"
    AdjShortReversedWithNumericPart = (
        SHORT_STREET_DESCRIPTOR_RULE + AdjFullWithNumericPart[:-1]
    )

    # Full descriptor, bare number, genitive words: "проспект 50 лет октября"
    GentFullReversedWithNumericPrefix = (
        GentFullReversed[:1]
        + NUMERIC_STREET_PART_WITHOUT_SUFFIX_RULE
        + GentFullReversed[1:2]
        + GentFullReversed[1:]
    )

    # Short descriptor, bare number, genitive words:
    # "пр-т. 50 лет советской власти"
    GentShortReversedWithNumericPrefix = (
        GentShortReversed[:2]
        + NUMERIC_STREET_PART_WITHOUT_SUFFIX_RULE
        + GentFullReversed[1:2]
        + GentFullReversed[1:]
    )

    # Numeric part, full descriptor, genitive words:
    # "2-ой проезд Перова Поля"
    GentNumericSplittedByFullDescriptor = (
        NUMERIC_STREET_PART_RULE + GentFullReversed
    )

    # Numeric part, short descriptor, genitive words:
    # "7-я ул. текстильщиков"
    GentNumericSplittedByShortDescriptor = (
        NUMERIC_STREET_PART_RULE + GentShortReversed
    )

    # ------------------------------------------------------------------
    # Street names with house numbers: each street grammar above followed
    # by OPTIONAL_COMMA_GRAMMAR and HOUSE_NUMBER_GRAMMAR.
    # ------------------------------------------------------------------

    # "Зеленая улица, дом 7"
    AdjFullWithHn = AdjFull + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR

    # "улица Зеленая, дом 7"
    AdjFullReversedWithHn = (
        AdjFullReversed + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR
    )

    # "ул. Нижняя Красносельская дом 7"
    AdjShortWithHn = AdjShort + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR

    # "Настасьинский пер., дом 2"
    AdjShortReversedWithHn = (
        AdjShortReversed + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR
    )

    # "улица Красной Гвардии, дом 2"
    AdjNounFullWithHn = (
        AdjNounFull + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR
    )

    # "ул. Брянской пролетарской дивизии дом 2"
    AdjNounShortWithHn = (
        AdjNounShort + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR
    )

    # "Николая Ершова улица дом 1"
    GentFullWithHn = GentFull + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR

    # "улица Карла Маркса дом 1"
    GentFullReversedWithHn = (
        GentFullReversed + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR
    )

    # "улица К. Маркса, дом 1"
    GentFullReversedWithShortcutWithHn = (
        GentFullReversedWithShortcut
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "улица В. И. Ленина, дом 1"
    GentFullReversedWithExtendedShortcutWithHn = (
        GentFullReversedWithExtendedShortcut
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "Обуховской Обороны пр-кт дом 1"
    GentShortWithHn = GentShort + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR

    # "пр-кт Обуховской Обороны дом 1"
    GentShortReversedWithHn = (
        GentShortReversed + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR
    )

    # "ул. К. Маркса, дом 1"
    GentShortReversedWithShortcutWithHn = (
        GentShortReversedWithShortcut
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "ул. В. И. Ленина, дом 1"
    GentShortReversedWithExtendedShortcutWithHn = (
        GentShortReversedWithExtendedShortcut
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "1-я новорублевская улица дом 1"
    AdjFullWithNumericPartWithHn = (
        AdjFullWithNumericPart + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR
    )

    # "улица 1-я новорублевская, дом 1"
    AdjFullReversedWithNumericPartWithHn = (
        AdjFullReversedWithNumericPart
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "1-я новорублевская ул. дом 1"
    AdjShortWithNumericPartWithHn = (
        AdjShortWithNumericPart + OPTIONAL_COMMA_GRAMMAR + HOUSE_NUMBER_GRAMMAR
    )

    # "ул. 1-я промышленная, дом 1"
    AdjShortReversedWithNumericPartWithHn = (
        AdjShortReversedWithNumericPart
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "проспект 50 лет октября, дом 1"
    GentFullReversedWithNumericPrefixWithHn = (
        GentFullReversedWithNumericPrefix
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "пр-т. 50 лет советской власти, дом 1"
    GentShortReversedWithNumericPrefixWithHn = (
        GentShortReversedWithNumericPrefix
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "2-ой проезд Перова Поля, дом 1"
    GentNumericSplittedByFullDescriptorWithHn = (
        GentNumericSplittedByFullDescriptor
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # "7-я ул. текстильщиков, дом 1"
    GentNumericSplittedByShortDescriptorWithHn = (
        GentNumericSplittedByShortDescriptor
        + OPTIONAL_COMMA_GRAMMAR
        + HOUSE_NUMBER_GRAMMAR
    )

    # ------------------------------------------------------------------
    # Street names with house numbers and letters, e.g.
    # "Зеленая улица, дом 7, лит А". Each member is the matching *WithHn
    # grammar with HOUSE_LETTER_GRAMMAR appended, which yields the same
    # token sequence as street + comma + house number + house letter.
    # ------------------------------------------------------------------

    AdjFullWithHnAndLetter = AdjFullWithHn + HOUSE_LETTER_GRAMMAR
    AdjFullReversedWithHnAndLetter = (
        AdjFullReversedWithHn + HOUSE_LETTER_GRAMMAR
    )
    AdjShortWithHnAndLetter = AdjShortWithHn + HOUSE_LETTER_GRAMMAR
    AdjShortReversedWithHnAndLetter = (
        AdjShortReversedWithHn + HOUSE_LETTER_GRAMMAR
    )
    AdjNounFullWithHnAndLetter = AdjNounFullWithHn + HOUSE_LETTER_GRAMMAR
    AdjNounShortWithHnAndLetter = AdjNounShortWithHn + HOUSE_LETTER_GRAMMAR
    GentFullWithHnAndLetter = GentFullWithHn + HOUSE_LETTER_GRAMMAR
    GentFullReversedWithHnAndLetter = (
        GentFullReversedWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentFullReversedWithShortcutWithHnAndLetter = (
        GentFullReversedWithShortcutWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentFullReversedWithExtendedShortcutWithHnAndLetter = (
        GentFullReversedWithExtendedShortcutWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentShortWithHnAndLetter = GentShortWithHn + HOUSE_LETTER_GRAMMAR
    GentShortReversedWithHnAndLetter = (
        GentShortReversedWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentShortReversedWithShortcutWithHnAndLetter = (
        GentShortReversedWithShortcutWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentShortReversedWithExtendedShortcutWithHnAndLetter = (
        GentShortReversedWithExtendedShortcutWithHn + HOUSE_LETTER_GRAMMAR
    )
    AdjFullWithNumericPartWithHnAndLetter = (
        AdjFullWithNumericPartWithHn + HOUSE_LETTER_GRAMMAR
    )
    AdjFullReversedWithNumericPartWithHnAndLetter = (
        AdjFullReversedWithNumericPartWithHn + HOUSE_LETTER_GRAMMAR
    )
    AdjShortWithNumericPartWithHnAndLetter = (
        AdjShortWithNumericPartWithHn + HOUSE_LETTER_GRAMMAR
    )
    AdjShortReversedWithNumericPartWithHnAndLetter = (
        AdjShortReversedWithNumericPartWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentFullReversedWithNumericPrefixWithHnAndLetter = (
        GentFullReversedWithNumericPrefixWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentShortReversedWithNumericPrefixWithHnAndLetter = (
        GentShortReversedWithNumericPrefixWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentNumericSplittedByFullDescriptorWithHnAndLetter = (
        GentNumericSplittedByFullDescriptorWithHn + HOUSE_LETTER_GRAMMAR
    )
    GentNumericSplittedByShortDescriptorWithHnAndLetter = (
        GentNumericSplittedByShortDescriptorWithHn + HOUSE_LETTER_GRAMMAR
    )
def _location_token(labels, attribute=None, normalization=None,
                    optional=False, repeatable=False):
    """Build one grammar-token dict for the location grammars.

    Only the keys that were explicitly requested are added, so a token
    built here has exactly the same keys as a hand-written dict literal.
    """
    token = {'labels': labels}
    if optional:
        token['optional'] = True
    if repeatable:
        token['repeatable'] = True
    if normalization is not None:
        token['normalization'] = normalization
    if attribute is not None:
        token['interpretation'] = {'attribute': attribute}
    return token


class Location(Enum):
    """Grammars for location names; every member value is a list of token dicts."""

    # Adjective from FEDERAL_DISTRICT_DICTIONARY + "федеральный" + "округ"
    FederalDistrict = [
        _location_token(
            [gram('ADJF'), dictionary(FEDERAL_DISTRICT_DICTIONARY)],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [
                gnc_match(-1, solve_disambiguation=True),
                dictionary({'федеральный'}),
            ],
            attribute=LocationObject.Attributes.Descriptor,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [
                gnc_match(-1, solve_disambiguation=True),
                dictionary({'округ'}),
            ],
            attribute=LocationObject.Attributes.Descriptor,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Same adjective followed by the literal abbreviation "ФО"
    FederalDistrictAbbr = [
        _location_token(
            [gram('ADJF'), dictionary(FEDERAL_DISTRICT_DICTIONARY)],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [eq('ФО')],
            attribute=LocationObject.Attributes.Descriptor,
        ),
    ]

    # Any adjective + "автономный" + "округ"
    AutonomousDistrict = [
        _location_token(
            [gram('ADJF')],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [
                gnc_match(-1, solve_disambiguation=True),
                dictionary({'автономный'}),
            ],
            attribute=LocationObject.Attributes.Descriptor,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [
                gnc_match(-1, solve_disambiguation=True),
                dictionary({'округ'}),
            ],
            attribute=LocationObject.Attributes.Descriptor,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Any adjective followed by the literal abbreviation "АО"
    AutonomousDistrictAbbr = [
        _location_token(
            [gram('ADJF')],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [eq('АО')],
            attribute=LocationObject.Attributes.Descriptor,
        ),
    ]

    # Any adjective followed by a word from REGION_TYPE_DICTIONARY
    Region = [
        _location_token(
            [gram('ADJF')],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [
                dictionary(REGION_TYPE_DICTIONARY),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=LocationObject.Attributes.Descriptor,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Adjective prefix from COMPLEX_OBJECT_PREFIX_DICTIONARY + geo noun
    ComplexObject = [
        _location_token(
            [gram('ADJF'), dictionary(COMPLEX_OBJECT_PREFIX_DICTIONARY)],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [
                gram('NOUN'),
                gram('Geox'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Noun prefix from PARTIAL_OBJECT_PREFIX_DICTIONARY + geo noun
    PartialObject = [
        _location_token(
            [gram('NOUN'), dictionary(PARTIAL_OBJECT_PREFIX_DICTIONARY)],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [
                gram('NOUN'),
                gram('Geox'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Capitalized adjective(s) + state-type noun:
    # "Донецкая народная республика" / "Российская Федерация"
    AdjfFederation = [
        _location_token(
            [gram('ADJF'), is_capitalized(True)],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [gram('ADJF'), gnc_match(-1, solve_disambiguation=True)],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
            optional=True,
            repeatable=True,
        ),
        _location_token(
            [
                gnc_match(0, solve_disambiguation=True),
                dictionary({'федерация', 'республика', 'империя'}),
            ],
            attribute=LocationObject.Attributes.Descriptor,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Adjx word(s) + "штат"/"эмират" + optional genitive word:
    # "Соединенные Штаты" / "Соединенные Штаты Америки"
    AdjxFederation = [
        _location_token(
            [gram('Adjx'), is_capitalized(True)],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [gram('Adjx'), gnc_match(-1, solve_disambiguation=True)],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
            optional=True,
            repeatable=True,
        ),
        _location_token(
            [
                gnc_match(0, solve_disambiguation=True),
                dictionary({'штат', 'эмират'}),
            ],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _location_token(
            [gram('gent')],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
            optional=True,
        ),
    ]

    # A single capitalized token carrying the 'Geox' grammeme
    Object = [
        _location_token(
            [is_capitalized(True), gram('Geox')],
            attribute=LocationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
    ]
def _probable_person_token(labels, attribute=None, normalization=None):
    """Build one grammar-token dict, adding only the keys that were requested."""
    token = {'labels': labels}
    if normalization is not None:
        token['normalization'] = normalization
    if attribute is not None:
        token['interpretation'] = {'attribute': attribute}
    return token


class ProbabilisticPerson(Enum):
    '''
    These grammars match words that look like person names but may not be.
    They are not included in natasha DEFAULT_GRAMMARS, but show good
    results on the factRuEval-16 test set.
    '''

    # First name followed by a possible last name
    FirstnameAndLastname = [
        Person.Firstname.value[0],
        POSSIBLE_LASTNAME_GRAMMAR,
    ]

    # Two initials (each with its dot) followed by a possible last name
    InitialsAndLastname = (
        Person.InitialsAndLastname.value[:4] + [POSSIBLE_LASTNAME_GRAMMAR]
    )

    # Possible last name followed by two initials
    LastnameAndInitials = (
        [POSSIBLE_LASTNAME_GRAMMAR] + Person.InitialsAndLastname.value[:4]
    )

    # One initial followed by a possible last name
    FirstnameAsInitialsAndLastname = (
        Person.InitialsAndLastname.value[:2] + [POSSIBLE_LASTNAME_GRAMMAR]
    )

    # Possible last name followed by one initial
    LastnameAndfirstnameAsInitials = (
        [POSSIBLE_LASTNAME_GRAMMAR] + Person.InitialsAndLastname.value[:2]
    )

    # First name, middle initial, possible last name: "Джон Х. Доу"
    FirstnameAndMiddlenameAsInitialsWithLastname = (
        FirstnameAndLastname[:1]
        + Person.InitialsAndLastname.value[2:4]
        + [POSSIBLE_LASTNAME_GRAMMAR]
    )

    # First name, nobility particle, possible last name
    FirstnameAndLastnameWithNobilityParticle = [
        Person.Firstname.value[0],
        _probable_person_token(
            [dictionary(NAME_NOBILITY_PARTICLE_DICTIONARY)],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Original,
        ),
        POSSIBLE_LASTNAME_GRAMMAR,
    ]

    # Person.WithPosition with its final token swapped for a possible last name
    FirstnameAndLastnameWithPosition = (
        Person.WithPosition.value[:-1] + [FirstnameAndLastname[-1]]
    )

    # First name, quoted nickname, possible last name: "Эрнесто «Че» Гевара"
    FirstnameAndLastnameWithQuotedNickname = (
        Person.Firstname.value[:1]
        + Person.FirstnameAndLastnameWithQuotedNickname.value[1:-1]
        + [POSSIBLE_LASTNAME_GRAMMAR]
    )

    # Two initials, nobility particle, possible last name: "С.П. фон Дервиз"
    InitialsAndLastnameWithNobilityParticle = InitialsAndLastname[:4] + [
        FirstnameAndLastnameWithNobilityParticle[1],
        POSSIBLE_LASTNAME_GRAMMAR,
    ]

    # Capitalized Latin-script first name, middle initial, dot, last name:
    # "John S. Doe"
    Latin = [
        _probable_person_token(
            [gram('LATN'), is_capitalized(True)],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Original,
        ),
        _probable_person_token(
            [gram('LATN'), is_capitalized(True)],
            attribute=PersonObject.Attributes.Middlename,
            normalization=NormalizationType.Original,
        ),
        _probable_person_token([gram('PUNCT'), eq('.')]),
        _probable_person_token(
            [gram('LATN'), is_capitalized(True)],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Original,
        ),
    ]
'дора', # дорога, #88 }), in_({ 'пер', # переулок, #88, 'н', # набережная }), )), }, 'normalization': NormalizationType.Original, 'interpretation': { 'attribute': AddressObject.Attributes.Street_Descriptor, }, }, { 'labels': [ eq('.'), ], 'optional': True, 'normalization': NormalizationType.Original, } ] NUMERIC_STREET_PART_RULE = [ # 1-я, 10-й, 100500-ой и т.д. { 'labels': [ gram('INT'), gte(1), ], 'normalization': NormalizationType.Original, 'interpretation': { 'attribute': AddressObject.Attributes.Street_Name,
def _person_token(labels, attribute=None, normalization=None,
                  optional=False, repeatable=False):
    """Build one grammar-token dict for the person grammars.

    Only the keys that were explicitly requested are added, so a token
    built here has exactly the same keys as a hand-written dict literal.
    """
    token = {'labels': labels}
    if optional:
        token['optional'] = True
    if repeatable:
        token['repeatable'] = True
    if normalization is not None:
        token['normalization'] = normalization
    if attribute is not None:
        token['interpretation'] = {'attribute': attribute}
    return token


class Person(Enum):
    """Grammars for person names; every member value is a list of token dicts.

    Several members are assembled by slicing members defined above them, so
    the position of each token inside those lists matters.
    """

    # Last name, first name, patronymic: "Иванов Иван Иванович"
    Full = [
        _person_token(
            [gram('Surn'), gram_not('Abbr')],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [
                gram('Name'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Middlename,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # First name, patronymic, last name: "Иван Иванович Иванов"
    FullReversed = [
        _person_token(
            [gram('Name'), gram_not('Abbr')],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Middlename,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # First name, name-like middle name(s), last name:
    # "Фелипе Родригес Фернандес"
    # https://www.englishelp.ru/business-english/other/284-patronymic-vs-middle-name.html
    FullReversedWithLatinMiddlename = [
        _person_token(
            [gram('Name')],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [gram('Name'), gnc_match(-1, solve_disambiguation=True)],
            attribute=PersonObject.Attributes.Middlename,
            normalization=NormalizationType.Inflected,
            repeatable=True,
        ),
        _person_token(
            [
                gram('Surn'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Two initials and a last name: "Л. А. Раневская"
    # Layout: [0] first initial, [1] dot, [2] middle initial, [3] dot,
    # [4] last name. Members below slice this list by these positions.
    InitialsAndLastname = [
        _person_token(
            [gram_in(['Name', 'Abbr'])],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Original,
        ),
        _person_token([gram('PUNCT'), eq('.')]),
        _person_token(
            [
                gram_in(['Patr', 'Abbr']),
                gnc_match(0, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Middlename,
            normalization=NormalizationType.Original,
        ),
        _person_token([gram('PUNCT'), eq('.')]),
        _person_token(
            [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(0, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # First name and last name: "Иван Иванов"
    FirstnameAndLastname = [
        _person_token(
            [gram('Name'), gram_not('Abbr')],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # First name, nickname wrapped in quote tokens, last name.
    # The labels are lists, as in every other rule of this class; they were
    # previously set literals, which iterate in no defined order.
    FirstnameAndLastnameWithQuotedNickname = [
        FirstnameAndLastname[0],
        _person_token(
            [
                gram('QUOTE'),
                gram_any({'G-QUOTE', 'L-QUOTE'}),
            ],
            normalization=NormalizationType.Original,
        ),
        _person_token(
            [gram_not_in({'QUOTE', 'PUNCT'})],
            attribute=PersonObject.Attributes.Nickname,
            normalization=NormalizationType.Original,
        ),
        _person_token(
            [gram('QUOTE')],
            normalization=NormalizationType.Original,
        ),
        InitialsAndLastname[-1],
    ]

    # Last name and first name: "Иванов Иван"
    LastnameAndFirstname = [
        _person_token(
            [gram('Surn'), gram_not('Abbr')],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [
                gram('Name'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # First name, middle initial, last name: "Александр Ф. Скляр"
    # The slice starts at index 2 (the middle initial). Starting at 3 would
    # leave only the dot and the last name and drop the initial itself.
    FullReversedWithMiddlenameAsInitials = (
        FullReversed[:1] + InitialsAndLastname[2:]
    )

    # Last name followed by two initials: "Раневская Л. А."
    LastnameAndInitials = [LastnameAndFirstname[0]] + InitialsAndLastname[:4]

    # Last name followed by one initial: "Раневская Л."
    LastnameAndFirstnameAsInitials = (
        [LastnameAndFirstname[0]] + InitialsAndLastname[:2]
    )

    # One initial followed by the last name: "Л. Раневская"
    FirstnameAsInitialsAndLastname = (
        InitialsAndLastname[:2] + [InitialsAndLastname[-1]]
    )

    # First name and patronymic: "Иван Иванович"
    FirstnameAndMiddlename = [
        _person_token(
            [gram('Name'), gram_not('Abbr')],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [
                gram('Patr'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Middlename,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # A lone capitalized last name: "Иванов"
    Lastname = [
        _person_token(
            [
                gram('Surn'),
                gram_any({'sing', 'Stgm'}),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # A lone capitalized patronymic: "Иванович"
    Middlename = [
        _person_token(
            [
                gram('Patr'),
                gram_any({'sing', 'Stgm'}),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            attribute=PersonObject.Attributes.Middlename,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # A lone capitalized first name: "Иван"
    Firstname = [
        _person_token(
            [
                gram('Name'),
                gram_any({'sing', 'Stgm'}),
                gram_not('Abbr'),
                is_capitalized(True),
            ],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # First name, nobility particle, last name: "Отто фон Бисмарк"
    FirstnameAndLastnameWithNobilityParticle = [
        FullReversed[0],
        _person_token(
            [dictionary(NAME_NOBILITY_PARTICLE_DICTIONARY)],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Original,
        ),
        _person_token(
            [
                gram('Surn'),
                gram_not('Abbr'),
                gnc_match(0, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Position, optional qualifier words, then the name:
    # "Премьер-министр РФ Дмитрий Медведев"
    WithPosition = [
        _person_token(
            [gram('Person/Position')],
            attribute=PersonObject.Attributes.Descriptor,
        ),
        # Qualifier words between the position and the name; these tokens
        # carry no interpretation.
        _person_token(
            [
                or_((
                    and_((
                        or_((
                            gram_any({'ablt', 'loct', 'gent'}),
                            gram('Fixd'),
                        )),
                        gram_not_in({'Name', 'Patr', 'Surn'}),
                    )),
                    gram('Abbr'),
                    gram('LATN'),
                )),
            ],
            normalization=NormalizationType.Original,
            optional=True,
            repeatable=True,
        ),
        _person_token(
            [
                gram('Name'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Firstname,
            normalization=NormalizationType.Inflected,
        ),
        _person_token(
            [
                gram('Patr'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Middlename,
            normalization=NormalizationType.Inflected,
            optional=True,
        ),
        _person_token(
            [
                gram('Surn'),
                case_match(0, solve_disambiguation=True),
                number_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=PersonObject.Attributes.Lastname,
            normalization=NormalizationType.Inflected,
        ),
    ]

    # Position, quoted organisation name, then the name:
    # "Пресс-секретарь «Роснефти» Михаил Леонтьев"
    WithPositionAndQuotedOrganisationName = (
        WithPosition[:2]
        + [
            _person_token(
                [gram('QUOTE')],
                normalization=NormalizationType.Original,
            ),
            # NOTE(review): gram_not_in receives a plain string here, while
            # other call sites in this class pass a set - confirm this is
            # intended.
            _person_token(
                [
                    gram_not('QUOTE'),
                    gram_not_in('END-OF-LINE'),
                ],
                normalization=NormalizationType.Original,
                repeatable=True,
            ),
            _person_token(
                [gram('QUOTE')],
                normalization=NormalizationType.Original,
            ),
        ]
        + WithPosition[2:]
    )

    # Position, nobility particle, last name: "граф де Кристо"
    PositionAndNobilitySurname = [
        _person_token([gram('Person/Position')]),
        _person_token([dictionary(NAME_NOBILITY_PARTICLE_DICTIONARY)]),
        _person_token(
            [gram('Surn'), gnc_match(0, solve_disambiguation=True)],
            attribute=PersonObject.Attributes.Lastname,
        ),
    ]

    # Name followed by an ordinal or a Roman numeral:
    # "Генрих Восьмой" / "Карл XII"
    NameWithNumericPart = [
        _person_token(
            [gram_in({'Name', 'sing'}), gram_not('Abbr')],
            attribute=PersonObject.Attributes.Firstname,
        ),
        _person_token(
            [
                or_((
                    and_((
                        gram_in({'ADJF', 'Anum'}),
                        gnc_match(-1, solve_disambiguation=True),
                    )),
                    gram('ROMN'),
                )),
            ],
        ),
    ]
def _organisation_token(labels, attribute=None, normalization=None,
                        optional=False, repeatable=False):
    """Build one grammar-token dict for the organisation grammars.

    Only the keys that were explicitly requested are added, so a token
    built here has exactly the same keys as a hand-written dict literal.
    """
    token = {'labels': labels}
    if optional:
        token['optional'] = True
    if repeatable:
        token['repeatable'] = True
    if normalization is not None:
        token['normalization'] = normalization
    if attribute is not None:
        token['interpretation'] = {'attribute': attribute}
    return token


class Organisation(Enum):
    """Grammars for organisation names; every member value is a list of token dicts."""

    # Organisation-type word, optional abbreviations, then a quoted name
    OfficialQuoted = [
        _organisation_token(
            [
                or_((
                    gram('Orgn/Commercial'),
                    gram('Orgn/Social'),
                    gram('Orgn/Abbr'),
                )),
            ],
            attribute=OrganisationObject.Attributes.Descriptor,
            normalization=NormalizationType.Inflected,
        ),
        _organisation_token(
            [is_abbr(True)],
            attribute=OrganisationObject.Attributes.Descriptor,
            normalization=NormalizationType.Original,
            optional=True,
            repeatable=True,
        ),
        _organisation_token([gram('QUOTE')]),
        # NOTE(review): gram_not_in receives a plain string here, while other
        # call sites in this class pass a set - confirm this is intended.
        _organisation_token(
            [
                gram_not('QUOTE'),
                gram_not_in('END-OF-LINE'),
                not_eq('.'),
            ],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Original,
            repeatable=True,
        ),
        _organisation_token([gram('QUOTE')]),
    ]

    # A single token tagged 'Abbr' and 'Orgn' but not 'Orgn/Abbr'
    Abbr = [
        _organisation_token(
            [gram('Abbr'), gram('Orgn'), gram_not('Orgn/Abbr')],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Original,
        ),
    ]

    # The literal "ИП" followed by the three tokens of Person.Full
    IndividualEntrepreneur = [
        _organisation_token(
            [eq('ИП')],
            normalization=NormalizationType.Original,
        ),
        Person.Full.value[0],
        Person.Full.value[1],
        Person.Full.value[2],
    ]

    # Commercial organisation-type word followed by Latin / number tokens
    SimpleLatin = [
        _organisation_token(
            [gram('Orgn/Commercial')],
            attribute=OrganisationObject.Attributes.Descriptor,
            normalization=NormalizationType.Normalized,
        ),
        _organisation_token(
            [gram_any({'LATN', 'NUMBER'})],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Original,
            repeatable=True,
        ),
    ]

    # Adjective(s) + educational organisation word + optional tail:
    # "Санкт-Петербургский Государственный университет"
    Educational = [
        _organisation_token(
            [gram('ADJF'), is_capitalized(True)],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _organisation_token(
            [gram('ADJF'), gnc_match(-1, solve_disambiguation=True)],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
            optional=True,
            repeatable=True,
        ),
        _organisation_token(
            [
                gram('Orgn/Educational'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=OrganisationObject.Attributes.Descriptor,
            normalization=NormalizationType.Normalized,
        ),
        _organisation_token(
            [
                gram_any({'ablt', 'gent'}),
                gram_not_in({'PREP'}),
                dictionary_not({'имя'}),
            ],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Original,
            optional=True,
            repeatable=True,
        ),
    ]

    # Educational + "named after" prefix + initials:
    # "Публичная библиотека имени М. Е. Салтыкова-Щедрина"
    EducationalWithInitials = (
        Educational + NAMED_ORG_INITIALS_PREFIX_RULE + NAMED_ORG_INITIALS_RULE
    )

    # Educational + "named after" prefix + last name:
    # "Публичная библиотека имени Салтыкова-Щедрина"
    EducationalWithLastname = (
        Educational + NAMED_ORG_INITIALS_PREFIX_RULE + LASTNAME_GRAMMAR
    )

    # Adjective(s) + commercial organisation word: "Кировский завод"
    AdjCommercial = Educational[:2] + [
        _organisation_token(
            [
                gram('Orgn/Commercial'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=OrganisationObject.Attributes.Descriptor,
            normalization=NormalizationType.Normalized,
        ),
        Educational[-1],
    ]

    AdjCommercialWithInitials = (
        AdjCommercial + NAMED_ORG_INITIALS_PREFIX_RULE + NAMED_ORG_INITIALS_RULE
    )

    AdjCommercialWithLastname = (
        AdjCommercial + NAMED_ORG_INITIALS_PREFIX_RULE + LASTNAME_GRAMMAR
    )

    # Social organisation word followed by its name words:
    # "Общества андрологии и сексуальной медицины"
    Social = [
        _organisation_token(
            [gram('Orgn/Social'), gram('sing')],
            attribute=OrganisationObject.Attributes.Descriptor,
            normalization=NormalizationType.Inflected,
        ),
        _organisation_token(
            [
                gram_not_in({'PREP', 'CONJ'}),
                gram_any({'accs', 'datv', 'gent'}),
            ],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Original,
        ),
        _organisation_token(
            [
                gram_any({'gent', 'accs', 'ablt'}),
                gram_not_in({'PREP', 'Name', 'Patr', 'Surn'}),
            ],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Original,
            optional=True,
            repeatable=True,
        ),
    ]

    SocialWithInitials = (
        Social + NAMED_ORG_INITIALS_PREFIX_RULE + NAMED_ORG_INITIALS_RULE
    )

    SocialWithLastname = (
        Social + NAMED_ORG_INITIALS_PREFIX_RULE + LASTNAME_GRAMMAR
    )

    # Adjective(s) + social organisation word + the optional tail of Social
    AdjSocial = [
        _organisation_token(
            [gram('ADJF')],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
        ),
        _organisation_token(
            [gram('ADJF'), gnc_match(-1, solve_disambiguation=True)],
            attribute=OrganisationObject.Attributes.Name,
            normalization=NormalizationType.Inflected,
            optional=True,
            repeatable=True,
        ),
        # This descriptor token has no 'normalization' key.
        _organisation_token(
            [
                gram('Orgn/Social'),
                gnc_match(0, solve_disambiguation=True),
                gnc_match(-1, solve_disambiguation=True),
            ],
            attribute=OrganisationObject.Attributes.Descriptor,
        ),
        Social[-1],
    ]

    AdjSocialWithInitials = (
        AdjSocial + NAMED_ORG_INITIALS_PREFIX_RULE + NAMED_ORG_INITIALS_RULE
    )

    AdjSocialWithLastname = (
        AdjSocial + NAMED_ORG_INITIALS_PREFIX_RULE + LASTNAME_GRAMMAR
    )
type_required, string_type, ) from yargy.compat import string_type from yargy.parser import OR from yargy.normalization import NormalizationType from natasha.grammars import Person from natasha.grammars.organisation.interpretation import OrganisationObject NAMED_ORG_INITIALS_PREFIX_RULE = [ OR( [ { 'labels': [ eq('им'), # имени ], 'normalization': NormalizationType.Original, 'interpretation': { 'attribute': OrganisationObject.Attributes.Name, }, }, { 'labels': [ eq('.'), ], 'normalization': NormalizationType.Original, } ], [{ 'labels': [
def _street_token_rule(labels, normalization, attribute=None, **flags):
    """Build one token-rule dict used by the ``Street`` grammars.

    The result holds ``labels`` under ``'labels'``, then every entry of
    ``flags`` (``repeatable=True`` / ``optional=True``) copied as-is, then
    ``'normalization'``. When ``attribute`` is given, ``'interpretation'``
    is set to ``{'attribute': attribute}``; otherwise that key is omitted.
    """
    rule = {'labels': labels}
    rule.update(flags)
    rule['normalization'] = normalization
    if attribute is not None:
        rule['interpretation'] = {'attribute': attribute}
    return rule


class Street(Enum):
    """Street-name grammars; every member value is a list of token rules.

    Each member is preceded by a sample Russian phrase it is meant to
    match. Later members are assembled by slicing and concatenating the
    rule lists of earlier ones, so the very same rule dicts are shared
    between grammars.

    NOTE(review): ``GentShort`` and ``GentShortReversedWithNumericPrefix``
    slice at index 2, which presumably relies on
    ``SHORT_STREET_DESCRIPTOR_RULE`` holding exactly two rules -- confirm.
    """

    # "Садовая улица": adjective name word(s), then the full descriptor
    AdjFull = [
        _street_token_rule(
            [gram('ADJF'), gram_not('Abbr')],
            NormalizationType.Inflected,
            AddressObject.Attributes.Street_Name,
        ),
        _street_token_rule(
            [
                gram('ADJF'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            NormalizationType.Inflected,
            AddressObject.Attributes.Street_Name,
            repeatable=True,
            optional=True,
        ),
        _street_token_rule(
            [
                dictionary(STREET_DESCRIPTOR_DICTIONARY),
                gnc_match(-1, solve_disambiguation=True),
            ],
            NormalizationType.Inflected,
            AddressObject.Attributes.Street_Descriptor,
        ),
    ]

    # "улица Садовая": full descriptor first, then adjective name word(s)
    AdjFullReversed = [
        _street_token_rule(
            [dictionary(STREET_DESCRIPTOR_DICTIONARY)],
            NormalizationType.Inflected,
            AddressObject.Attributes.Street_Descriptor,
        ),
        _street_token_rule(
            [
                gram('ADJF'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            NormalizationType.Inflected,
            AddressObject.Attributes.Street_Name,
            repeatable=True,
        ),
    ]

    # "ул. Садовая": short descriptor, then the adjective part of AdjFull
    # (AdjFull without its trailing descriptor rule)
    AdjShort = SHORT_STREET_DESCRIPTOR_RULE + AdjFull[:-1]

    # "Садовая ул.": adjective part of AdjFull, then the short descriptor
    AdjShortReversed = AdjFull[:-1] + SHORT_STREET_DESCRIPTOR_RULE

    # "улица Красных Десантников": full descriptor, adjective(s), then
    # repeatable genitive word(s)
    AdjNounFull = AdjFullReversed[:1] + AdjFull[:-1] + [
        _street_token_rule(
            [
                gram('gent'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            NormalizationType.Inflected,
            AddressObject.Attributes.Street_Name,
            repeatable=True,
        ),
    ]

    # "ул. Красных Десантников": AdjShort plus the genitive rule of
    # AdjNounFull
    AdjNounShort = AdjShort + AdjNounFull[-1:]

    # "улица Карла Маркса": full descriptor, then genitive word(s) kept in
    # their original form
    GentFullReversed = AdjFullReversed[:1] + [
        _street_token_rule(
            [gram('gent'), gram_not('Abbr')],
            NormalizationType.Original,
            AddressObject.Attributes.Street_Name,
        ),
        _street_token_rule(
            [
                gram('gent'),
                gram_not('Abbr'),
                gnc_match(-1, solve_disambiguation=True),
            ],
            NormalizationType.Original,
            AddressObject.Attributes.Street_Name,
            optional=True,
            repeatable=True,
        ),
    ]

    # "улица К. Маркса": descriptor, an abbreviation and a dot, then the
    # genitive rules of GentFullReversed
    GentFullReversedWithShortcut = GentFullReversed[:1] + [
        _street_token_rule(
            [gram('Abbr')],
            NormalizationType.Original,
            AddressObject.Attributes.Street_Name,
        ),
        # the dot itself carries no interpretation
        _street_token_rule(
            [eq('.')],
            NormalizationType.Original,
        ),
    ] + GentFullReversed[1:]

    # "улица В. В. Ленина": same as above with the abbreviation + dot pair
    # occurring twice
    GentFullReversedWithExtendedShortcut = (
        GentFullReversedWithShortcut[:1]
        + GentFullReversedWithShortcut[1:3] * 2
        + GentFullReversedWithShortcut[3:]
    )

    # "пр. Маршала Жукова": short descriptor instead of the full one
    GentShortReversed = SHORT_STREET_DESCRIPTOR_RULE + GentFullReversed[1:]

    # "пр. К. Маркса"
    GentShortReversedWithShortcut = (
        SHORT_STREET_DESCRIPTOR_RULE + GentFullReversedWithShortcut[1:]
    )

    # "пл. В. В. Ленина"
    GentShortReversedWithExtendedShortcut = (
        SHORT_STREET_DESCRIPTOR_RULE
        + GentFullReversedWithExtendedShortcut[1:]
    )

    # "Николая Ершова улица": genitive word(s) first, full descriptor last
    GentFull = GentFullReversed[1:] + [GentFullReversed[0]]

    # "Обуховской Обороны пр-кт": GentShortReversed from index 2 onward,
    # followed by the short descriptor
    GentShort = GentShortReversed[2:] + SHORT_STREET_DESCRIPTOR_RULE

    # "1-я новорублевская улица": numeric part in front of AdjFull
    AdjFullWithNumericPart = NUMERIC_STREET_PART_RULE + AdjFull

    # "улица 1-я новорублевская": full descriptor, then numeric part and
    # adjective(s)
    AdjFullReversedWithNumericPart = (
        [AdjFullReversed[0]] + AdjFullWithNumericPart[:-1]
    )

    # "1-я новорублевская ул.": numeric part and adjective(s), then the
    # short descriptor
    AdjShortWithNumericPart = (
        AdjFullWithNumericPart[:-1] + SHORT_STREET_DESCRIPTOR_RULE
    )

    # "ул. 1-я промышленная": short descriptor, then numeric part and
    # adjective(s)
    AdjShortReversedWithNumericPart = (
        SHORT_STREET_DESCRIPTOR_RULE + AdjFullWithNumericPart[:-1]
    )

    # "проспект 50 лет октября": full descriptor, suffix-less numeric part,
    # one mandatory genitive word, then the usual genitive rules
    GentFullReversedWithNumericPrefix = (
        [GentFullReversed[0]]
        + NUMERIC_STREET_PART_WITHOUT_SUFFIX_RULE
        + [GentFullReversed[1]]
        + GentFullReversed[1:]
    )

    # "пр-т. 50 лет советской власти": as above, starting with the first two
    # rules of GentShortReversed
    GentShortReversedWithNumericPrefix = (
        GentShortReversed[:2]
        + NUMERIC_STREET_PART_WITHOUT_SUFFIX_RULE
        + [GentFullReversed[1]]
        + GentFullReversed[1:]
    )

    # "2-ой проезд Перова Поля": numeric part, full descriptor, genitive
    # word(s)
    GentNumericSplittedByFullDescriptor = (
        NUMERIC_STREET_PART_RULE + GentFullReversed
    )

    # "7-я ул. текстильщиков": numeric part, short descriptor, genitive
    # word(s)
    GentNumericSplittedByShortDescriptor = (
        NUMERIC_STREET_PART_RULE + GentShortReversed
    )