def test_sfinxbis(self): """Test abydos.phonetic.SfinxBis.""" self.assertEqual(self.pa.encode(''), ('', )) # http://www.swami.se/download/18.248ad5af12aa81365338000106/TestSfinx.txt # cases where the gold standard gave clearly wrong values have been # corrected below (marked with '# wrong' self.assertEqual(self.pa.encode('af Sandeberg'), ('S53162', )) self.assertEqual(self.pa.encode('av Ekenstam'), ('$25835', )) self.assertEqual(self.pa.encode('Da Costa'), ('K83', )) self.assertEqual(self.pa.encode('Das Neves'), ('D8', 'N78')) self.assertEqual(self.pa.encode('de Besche'), ('B8', )) self.assertEqual(self.pa.encode('de la Motte'), ('M3', )) self.assertEqual(self.pa.encode('de Las Heras'), ('H68', )) # wrong self.assertEqual(self.pa.encode('de Los Santos'), ('S538', )) self.assertEqual(self.pa.encode('del Rosario'), ('R862', )) self.assertEqual(self.pa.encode('Den Boer'), ('B6', )) self.assertEqual(self.pa.encode('Der de Kazinczy'), ('D6', 'K8528')) # wrong self.assertEqual(self.pa.encode('des Rieux'), ('R28', )) self.assertEqual(self.pa.encode('Di Luca'), ('L2', )) self.assertEqual(self.pa.encode('Do Rosario'), ('R862', )) self.assertEqual(self.pa.encode('Don Lind'), ('L53', )) self.assertEqual(self.pa.encode('Dos Santos'), ('S538', )) self.assertEqual(self.pa.encode('du Rietz'), ('R38', )) self.assertEqual(self.pa.encode('in de Betou'), ('B3', )) self.assertEqual(self.pa.encode('La Fleur'), ('F46', )) self.assertEqual(self.pa.encode('Le Grand'), ('G653', )) self.assertEqual(self.pa.encode('li Puma'), ('L', 'P5')) self.assertEqual(self.pa.encode('lo Martire'), ('L', 'M636')) self.assertEqual(self.pa.encode('mac Donald'), ('D543', )) self.assertEqual(self.pa.encode('mc Intosh'), ('$538', )) self.assertEqual(self.pa.encode('S:t Cyr'), ('S6', )) self.assertEqual(self.pa.encode('Van Doom'), ('D5', )) self.assertEqual(self.pa.encode('Van de Peppel'), ('P14', )) self.assertEqual(self.pa.encode('Van den Berg'), ('B62', )) self.assertEqual(self.pa.encode('Van Der Kwast'), ('K783', )) self.assertEqual(self.pa.encode('von Ahn'), ('$5', )) self.assertEqual(self.pa.encode('von Dem Knesebeck'), ('K5812', )) self.assertEqual(self.pa.encode('von Der Burg'), ('B62', )) self.assertEqual(self.pa.encode("D'Angelo"), ('D524', )) self.assertEqual(self.pa.encode("O'Conner"), ('$256', )) self.assertEqual(self.pa.encode('Los'), ('L8', )) self.assertEqual(self.pa.encode('Mac'), ('M2', )) self.assertEqual(self.pa.encode('Till'), ('T4', )) self.assertEqual(self.pa.encode('Van'), ('V5', )) self.assertEqual(self.pa.encode('Von'), ('V5', )) self.assertEqual(self.pa.encode('Bernadotte af Wisborg'), ('B6533', 'V8162')) self.assertEqual(self.pa.encode('Hjort af Ornäs'), ('J63', '$658')) self.assertEqual(self.pa.encode('Horn af Åminne'), ('H65', '$55')) self.assertEqual(self.pa.encode('Horn av Åminne'), ('H65', '$55')) self.assertEqual(self.pa.encode('Hård af Segerstad'), ('H63', 'S26833')) self.assertEqual(self.pa.encode('Hård av Segerstad'), ('H63', 'S26833')) self.assertEqual(self.pa.encode('Stael von Holstein'), ('S34', 'H48325')) self.assertEqual(self.pa.encode('de Oliveira e Silva'), ('$4726', 'S47')) self.assertEqual(self.pa.encode('de Alfaro y Gómez'), ('$476', 'G58')) self.assertEqual(self.pa.encode('Arjaliès-de la Lande'), ('$6248', 'L53')) self.assertEqual(self.pa.encode('Dominicus van den Bussche'), ('D5528', 'B8')) self.assertEqual(self.pa.encode('Edebol Eeg-Olofsson'), ('$314', '$2', '$4785')) self.assertEqual(self.pa.encode('Jonsson-Blomqvist'), ('J585', 'B452783')) self.assertEqual(self.pa.encode('Kiviniemi Birgersson'), ('#755', 'B62685')) self.assertEqual(self.pa.encode('Massena Serpa dos Santos'), ('M85', 'S61', 'S538')) self.assertEqual(self.pa.encode('S:t Clair Renard'), ('K426', 'R563')) self.assertEqual(self.pa.encode('Skoog H Andersson'), ('S22', 'H', '$53685')) self.assertEqual(self.pa.encode('von Post-Skagegård'), ('P83', 'S22263')) self.assertEqual(self.pa.encode('von Zur-Mühlen'), ('S6', 'M45')) self.assertEqual(self.pa.encode('Waltå O:son'), ('V43', '$85')) self.assertEqual(self.pa.encode('Zardán Gómez de la Torre'), ('S635', 'G58', 'T6')) self.assertEqual(self.pa.encode('af Jochnick'), ('J252', )) self.assertEqual(self.pa.encode('af Ioscnick'), ('J8252', )) self.assertEqual(self.pa.encode('Aabakken'), ('$125', )) self.assertEqual(self.pa.encode('Åbacken'), ('$125', )) self.assertEqual(self.pa.encode('Ahlen'), ('$45', )) self.assertEqual(self.pa.encode('Aleen'), ('$45', )) self.assertEqual(self.pa.encode('Braunerhielm'), ('B656245', )) self.assertEqual(self.pa.encode('Branneerhielm'), ('B656245', )) self.assertEqual(self.pa.encode('Carlzon'), ('K6485', )) self.assertEqual(self.pa.encode('Karlsson'), ('K6485', )) self.assertEqual(self.pa.encode('Enochsson'), ('$5285', )) self.assertEqual(self.pa.encode('Ericsson'), ('$6285', )) self.assertEqual(self.pa.encode('Ericksson'), ('$6285', )) self.assertEqual(self.pa.encode('Erixson'), ('$6285', )) self.assertEqual(self.pa.encode('Filipsson'), ('F4185', )) self.assertEqual(self.pa.encode('Philipson'), ('F4185', )) self.assertEqual(self.pa.encode('Flycht'), ('F423', )) self.assertEqual(self.pa.encode('Flygt'), ('F423', )) self.assertEqual(self.pa.encode('Flykt'), ('F423', )) self.assertEqual(self.pa.encode('Fröijer'), ('F626', )) self.assertEqual(self.pa.encode('Fröjer'), ('F626', )) self.assertEqual(self.pa.encode('Gertner'), ('J6356', )) self.assertEqual(self.pa.encode('Hiertner'), ('J6356', )) self.assertEqual(self.pa.encode('Hirch'), ('H62', )) self.assertEqual(self.pa.encode('Hirsch'), ('H68', )) self.assertEqual(self.pa.encode('Haegermarck'), ('H26562', )) self.assertEqual(self.pa.encode('Hägermark'), ('H26562', )) self.assertEqual(self.pa.encode('Isaxon'), ('$8285', )) self.assertEqual(self.pa.encode('Isacsson'), ('$8285', )) self.assertEqual(self.pa.encode('Joachimsson'), ('J2585', )) self.assertEqual(self.pa.encode('Joakimson'), ('J2585', )) self.assertEqual(self.pa.encode('Kjell'), ('#4', )) self.assertEqual(self.pa.encode('Käll'), ('#4', )) self.assertEqual(self.pa.encode('Knapp'), ('K51', )) self.assertEqual(self.pa.encode('Krans'), ('K658', )) self.assertEqual(self.pa.encode('Krantz'), ('K6538', )) self.assertEqual(self.pa.encode('Kvist'), ('K783', )) self.assertEqual(self.pa.encode('Quist'), ('K783', )) self.assertEqual(self.pa.encode('Lidbeck'), ('L312', )) self.assertEqual(self.pa.encode('Lidbäck'), ('L312', )) self.assertEqual(self.pa.encode('Linnér'), ('L56', )) self.assertEqual(self.pa.encode('Linner'), ('L56', )) self.assertEqual(self.pa.encode('Lorenzsonn'), ('L6585', )) self.assertEqual(self.pa.encode('Lorentzon'), ('L65385', )) self.assertEqual(self.pa.encode('Lorenßon'), ('L6585', )) self.assertEqual(self.pa.encode('Lyxell'), ('L284', )) self.assertEqual(self.pa.encode('Lycksell'), ('L284', )) self.assertEqual(self.pa.encode('Marcström'), ('M628365', )) self.assertEqual(self.pa.encode('Markström'), ('M628365', )) self.assertEqual(self.pa.encode('Michaelsson'), ('M2485', )) self.assertEqual(self.pa.encode('Mikaelson'), ('M2485', )) self.assertEqual(self.pa.encode('Mörch'), ('M62', )) self.assertEqual(self.pa.encode('Mörck'), ('M62', )) self.assertEqual(self.pa.encode('Mörk'), ('M62', )) self.assertEqual(self.pa.encode('Mørk'), ('M62', )) self.assertEqual(self.pa.encode('Nääs'), ('N8', )) self.assertEqual(self.pa.encode('Naess'), ('N8', )) self.assertEqual(self.pa.encode('Nordstedt'), ('N63833', )) self.assertEqual(self.pa.encode('Oxenstierna'), ('$28583265', )) self.assertEqual(self.pa.encode('Palmçrañtz'), ('P4526538', )) self.assertEqual(self.pa.encode('Palmcrantz'), ('P4526538', )) self.assertEqual(self.pa.encode('Palmkrantz'), ('P4526538', )) self.assertEqual(self.pa.encode('Preuss'), ('P68', )) self.assertEqual(self.pa.encode('Preutz'), ('P638', )) self.assertEqual(self.pa.encode('Richardson'), ('R26385', )) self.assertEqual(self.pa.encode('Rikardson'), ('R26385', )) self.assertEqual(self.pa.encode('Ruuth'), ('R3', )) self.assertEqual(self.pa.encode('Ruth'), ('R3', )) self.assertEqual(self.pa.encode('Sæter'), ('S36', )) self.assertEqual(self.pa.encode('Zäter'), ('S36', )) self.assertEqual(self.pa.encode('Schedin'), ('#35', )) self.assertEqual(self.pa.encode('Sjödin'), ('#35', )) self.assertEqual(self.pa.encode('Siöö'), ('#', )) self.assertEqual(self.pa.encode('Sjöh'), ('#', )) self.assertEqual(self.pa.encode('Svedberg'), ('S73162', )) self.assertEqual(self.pa.encode('Zwedberg'), ('S73162', )) self.assertEqual(self.pa.encode('Tjäder'), ('#36', )) self.assertEqual(self.pa.encode('þornquist'), ('T652783', )) self.assertEqual(self.pa.encode('Thörnqvist'), ('T652783', )) self.assertEqual(self.pa.encode('Törnkvist'), ('T652783', )) self.assertEqual(self.pa.encode('Wichman'), ('V255', )) self.assertEqual(self.pa.encode('Wickman'), ('V255', )) self.assertEqual(self.pa.encode('Wictorin'), ('V2365', )) self.assertEqual(self.pa.encode('Wictorsson'), ('V23685', )) self.assertEqual(self.pa.encode('Viktorson'), ('V23685', )) self.assertEqual(self.pa.encode('Zachrisson'), ('S2685', )) self.assertEqual(self.pa.encode('Zakrison'), ('S2685', )) self.assertEqual(self.pa.encode('Övragård'), ('$76263', )) self.assertEqual(self.pa.encode('Öfvragårdh'), ('$76263', )) self.assertEqual(self.pa.encode('Bogdanovic'), ('B23572', )) self.assertEqual(self.pa.encode('Bogdanovitch'), ('B235732', )) self.assertEqual(self.pa.encode('Dieterich'), ('D362', )) self.assertEqual(self.pa.encode('Eichorn'), ('$265', )) self.assertEqual(self.pa.encode('Friedrich'), ('F6362', )) self.assertEqual(self.pa.encode('Grantcharova'), ('G653267', )) self.assertEqual(self.pa.encode('Ilichev'), ('$427', )) self.assertEqual(self.pa.encode('Ivankovic'), ('$75272', )) self.assertEqual(self.pa.encode('Ivangurich'), ('$75262', )) self.assertEqual(self.pa.encode('Kinch'), ('#52', )) self.assertEqual(self.pa.encode('Kirchmann'), ('#6255', )) self.assertEqual(self.pa.encode('Machado'), ('M23', )) self.assertEqual(self.pa.encode('Reich'), ('R2', )) self.assertEqual(self.pa.encode('Roche'), ('R2', )) self.assertEqual(self.pa.encode('Rubaszkin'), ('R1825', )) self.assertEqual(self.pa.encode('Rubaschkin'), ('R1825', )) self.assertEqual(self.pa.encode('Sanchez'), ('S528', )) self.assertEqual(self.pa.encode('Walukiewicz'), ('V42728', )) self.assertEqual(self.pa.encode('Valukievitch'), ('V42732', )) self.assertEqual(self.pa.encode('K'), ('K', )) self.assertEqual(self.pa.encode('2010'), ('', )) self.assertEqual(self.pa.encode('cese'), ('S8', )) # a few max_length tests self.assertEqual( SfinxBis(3).encode('Kiviniemi Birgersson'), ('#75', 'B62')) self.assertEqual(self.pa4.encode('Eichorn'), ('$265', )) self.assertEqual(self.pa4.encode('Friedrich'), ('F636', )) self.assertEqual(self.pa4.encode('Grantcharova'), ('G653', )) self.assertEqual(self.pa4.encode('Ilichev'), ('$427', )) self.assertEqual(self.pa4.encode('Ivankovic'), ('$752', )) self.assertEqual(self.pa4.encode('Ivangurich'), ('$752', )) self.assertEqual(self.pa4.encode('Kinch'), ('#52', )) self.assertEqual(self.pa4.encode('Kirchmann'), ('#625', )) self.assertEqual(self.pa4.encode('Machado'), ('M23', )) self.assertEqual(self.pa4.encode('Reich'), ('R2', )) self.assertEqual(self.pa4.encode('Roche'), ('R2', )) self.assertEqual(self.pa4.encode('Rubaszkin'), ('R182', )) self.assertEqual(self.pa4.encode('Rubaschkin'), ('R182', )) self.assertEqual(self.pa4.encode('Sanchez'), ('S528', )) self.assertEqual(self.pa4.encode('Walukiewicz'), ('V427', )) self.assertEqual(self.pa4.encode('Valukievitch'), ('V427', )) self.assertEqual(self.pa4.encode('K'), ('K', )) self.assertEqual(self.pa4.encode('2010'), ('', )) self.assertEqual(self.pa4.encode('cese'), ('S8', )) # etc. (for code coverage) self.assertEqual(self.pa.encode('chans'), ('#58', )) self.assertEqual(self.pa.encode('ljud'), ('J3', )) self.assertEqual(self.pa.encode('qi'), ('K', )) self.assertEqual(self.pa.encode('xavier'), ('S76', )) self.assertEqual(self.pa.encode('skjul'), ('#4', )) self.assertEqual(self.pa.encode('schul'), ('#4', )) self.assertEqual(self.pa.encode('skil'), ('#4', )) # max_length bounds tests self.assertEqual(SfinxBis(max_length=-1).encode('Niall'), ('N4', )) self.assertEqual(SfinxBis(max_length=0).encode('Niall'), ('N4', )) # encode_alpha self.assertEqual(self.pa.encode_alpha('Stael von Holstein'), ('STL', 'HLSTKN')) self.assertEqual(self.pa.encode_alpha('de Oliveira e Silva'), ('$LFKR', 'SLF')) self.assertEqual(self.pa.encode_alpha('de Alfaro y Gómez'), ('$LFR', 'GNS')) self.assertEqual(self.pa.encode_alpha('Arjaliès-de la Lande'), ('$RKLS', 'LNT')) # Test wrapper self.assertEqual(sfinxbis('af Sandeberg'), ('S53162', ))
SoundexBR, SpanishMetaphone, StatisticsCanada, Waahlin, ) from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char alpha_sis = AlphaSIS() daitch_mokotoff = DaitchMokotoff() double_metaphone = DoubleMetaphone() haase = Haase() haase_primary = Haase(primary_only=True) koelner = Koelner() russell = RussellIndex() sfinxbis = SfinxBis() sfinxbis_6 = SfinxBis(max_length=6) soundex_census = Soundex(var='Census') spfc = SPFC() algorithms = { 'ainsworth': Ainsworth().encode, 'alpha_sis': lambda _: ', '.join(alpha_sis.encode(_)), 'bmpm': BeiderMorse().encode, 'bmpm_german': BeiderMorse(language_arg='german').encode, 'bmpm_french': BeiderMorse(language_arg='french').encode,
'pshp_soundex_first': PSHPSoundexFirst().encode, 'pshp_soundex_first_german': PSHPSoundexFirst(german=True).encode, 'pshp_soundex_first_ml8': PSHPSoundexFirst(max_length=8).encode, 'pshp_soundex_last': PSHPSoundexLast().encode, 'pshp_soundex_last_german': PSHPSoundexLast(german=True).encode, 'pshp_soundex_last_ml8': PSHPSoundexLast(max_length=8).encode, 'refined_soundex': RefinedSoundex().encode, 'refined_soundex_vowels': RefinedSoundex(retain_vowels=True).encode, 'refined_soundex_0pad_ml6': RefinedSoundex(zero_pad=True, max_length=6).encode, 'reth_schek_phonetik': RethSchek().encode, 'roger_root': RogerRoot().encode, 'roger_root_nopad_ml8': RogerRoot(max_length=8, zero_pad=False).encode, 'russell_index': RussellIndex().encode, 'russell_index_alpha': RussellIndex().encode_alpha, 'sfinxbis': SfinxBis().encode, 'sfinxbis_ml6': SfinxBis(max_length=6).encode, 'sound_d': SoundD().encode, 'sound_d_ml8': SoundD(max_length=8).encode, 'soundex': Soundex().encode, 'soundex_reverse': Soundex(reverse=True).encode, 'soundex_0pad_ml6': Soundex(zero_pad=True, max_length=6).encode, 'soundex_special': Soundex(var='special').encode, 'soundex_census': Soundex(var='Census').encode, 'soundex_br': SoundexBR().encode, 'spanish_metaphone': SpanishMetaphone().encode, 'spanish_metaphone_modified': SpanishMetaphone(modified=True).encode, 'spanish_metaphone_ml4': SpanishMetaphone(max_length=4).encode, 'spfc': lambda _: spfc.encode('{0} {0}'.format(_)), 'statistics_canada': StatisticsCanada().encode, 'statistics_canada_ml8': StatisticsCanada(max_length=8).encode,
nrl = NRL() nysiis = NYSIIS() onca = ONCA() parmar_kumbharana = ParmarKumbharana() phonem = Phonem() phonet = Phonet() phonetic_spanish = PhoneticSpanish() phonex = Phonex() phonix = Phonix() pshp_soundex_first = PSHPSoundexFirst() pshp_soundex_last = PSHPSoundexLast() refined_soundex = RefinedSoundex() reth_schek = RethSchek() roger_root = RogerRoot() russell = RussellIndex() sfinxbis = SfinxBis() sound_d = SoundD() soundex = Soundex() soundex_br = SoundexBR() spanish_metaphone = SpanishMetaphone() spfc = SPFC() statistics_canada = StatisticsCanada() algorithms = { 'russell_index': lambda _: str(russell.encode(_)), 'russell_index_num_to_alpha': lambda _: russell._to_alpha( # noqa: SF01 russell.encode(_)), 'russell_index_alpha': russell.encode_alpha,