class PhoneticTestCases(unittest.TestCase):
    """Exercise the phonetic fingerprint.

    Covers abydos.fingerprint.Phonetic, built without arguments and built
    around explicit Phonet and Soundex encoders.
    """

    fp = Phonetic()
    fp_phonet = Phonetic(Phonet())
    fp_soundex = Phonetic(Soundex())
    soundex = Soundex()

    def test_phonetic_fingerprint(self):
        """Check Phonetic.fingerprint on '' and on the joined NIALL names."""
        # An empty input yields an empty fingerprint.
        self.assertEqual(self.fp.fingerprint(''), '')

        names = ' '.join(NIALL)
        expectations = (
            (self.fp, 'a anl mknl njl nklk nl'),
            (
                self.fp_phonet,
                'knile makneil maknele neil nel nele nial nigeli '
                'nigl nil noigialach oneil ui',
            ),
            (self.fp_soundex, 'k540 m254 n240 n242 n400 o540 u000'),
        )
        # One fingerprinter per row, checked in the listed order.
        for fingerprinter, expected in expectations:
            self.assertEqual(fingerprinter.fingerprint(names), expected)
 def test_soundex_special(self):
     """Check the 'special' (1880-1910) Soundex variant on sample surnames."""
     encoder = Soundex(var='special')
     expected_codes = (
         ('Ashcroft', 'A226'),
         ('Asicroft', 'A226'),
         ('AsWcroft', 'A226'),
         ('Rupert', 'R163'),
         ('Rubin', 'R150'),
     )
     for surname, code in expected_codes:
         self.assertEqual(encoder.encode(surname), code)
Exemple #3
0
    def test_caversham(self):
        """Validate three encoders against the Caversham variant-names file.

        Every line after the first in variantNames.csv is split on commas
        into two names, the expected Soundex, Metaphone, and Caverphone code
        of each name, and three flags. A flag of '1' means the two names must
        encode identically under that algorithm; anything else means they
        must differ. The Caverphone columns are checked with ``self.pa``.
        """
        soundex = Soundex()
        metaphone = Metaphone()

        def check(encode, first, first_code, second, second_code, same):
            """Assert both expected codes, then the match/differ flag."""
            self.assertEqual(encode(first), first_code)
            self.assertEqual(encode(second), second_code)
            compare = self.assertEqual if same == '1' else self.assertNotEqual
            compare(encode(first), encode(second))

        with open(_corpus_file('variantNames.csv')) as cav_testset:
            # Discard the first line (presumably a header row).
            next(cav_testset)
            for row in cav_testset:
                # Unpacking requires exactly eleven comma-separated fields.
                (
                    name1,
                    sdx1,
                    mtph1,
                    cav1,
                    name2,
                    sdx2,
                    mtph2,
                    cav2,
                    sdx_same,
                    mtph_same,
                    cav_same,
                ) = row.strip().split(',')

                check(soundex.encode, name1, sdx1, name2, sdx2, sdx_same)
                check(metaphone.encode, name1, mtph1, name2, mtph2, mtph_same)
                check(self.pa.encode, name1, cav1, name2, cav2, cav_same)
 def test_soundex_census(self):
     """Check the 'Census' Soundex variant, including two-code results."""
     census = Soundex(var='Census')
     cases = (
         ('Vandeusen', ('V532', 'D250')),
         ('van Deusen', ('V532', 'D250')),
         ('McDonald', 'M235'),
         ('la Cruz', ('L262', 'C620')),
         ('vanDamme', ('V535', 'D500')),
     )
     # Some names are expected to yield a pair of codes, others one string.
     for name, expected in cases:
         self.assertEqual(census.encode(name), expected)
class WaahlinTestCases(unittest.TestCase):
    """Exercise the Wåhlin phonetic encoder.

    Covers abydos.phonetic.Waahlin, both built without arguments and built
    around a Soundex encoder.
    """

    pa = Waahlin()
    pa_sdx = Waahlin(Soundex())

    def test_waahlin(self):
        """Check Waahlin.encode and Waahlin.encode_alpha on sample words."""
        encode_cases = (
            ('', ''),
            ('kjol', '+OL'),
            ('stråken', 'STRÅ+EN'),
            ('skytten', '*YTTEN'),
            ('ljuden', 'JUDEN'),
            ('högre', 'HÖGRE'),
            ('först', 'FÖRST'),
            ('hval', 'VAL'),
            ('hrothgar', 'ROTHGAR'),
            ('denna', 'DENNA'),
            ('djur', 'JUR'),
            ('hjärta', 'JERTA'),
            ('STIEN', '*EN'),
            ('SKJERN', '*ERN'),
            ('HIELPA', 'JELPA'),
            ('CEILA', 'SEILA'),
            ('GELD', 'JELD'),
            ('IERN', 'JERN'),
        )
        for word, code in encode_cases:
            self.assertEqual(self.pa.encode(word), code)

        # encode_alpha
        alpha_cases = (
            ('kjol', 'ÇOL'),
            ('stråken', 'STRÅÇEN'),
            ('skytten', 'ŠYTTEN'),
            ('ljuden', 'JUDEN'),
        )
        for word, code in alpha_cases:
            self.assertEqual(self.pa.encode_alpha(word), code)

    def test_waahlin_soundex(self):
        """Check Waahlin.encode when the instance wraps a Soundex encoder."""
        soundex_cases = (
            ('', ''),
            ('kjol', '+O400'),
            ('stråken', 'ST625'),
            ('skytten', '*Y350'),
            ('ljuden', 'JU350'),
            ('högre', 'HO260'),
            ('först', 'FO623'),
            ('hval', 'VA400'),
            ('hrothgar', 'RO326'),
            ('denna', 'DE500'),
            ('djur', 'JU600'),
            ('hjärta', 'JA630'),
        )
        for word, code in soundex_cases:
            self.assertEqual(self.pa_sdx.encode(word), code)
 def test_soundex_special(self):
     """Check the 'special' (1880-1910) Soundex variant on sample surnames."""
     special = Soundex(var='special')
     samples = (
         ('Ashcroft', 'A226'),
         ('Asicroft', 'A226'),
         ('AsWcroft', 'A226'),
         ('Rupert', 'R163'),
         ('Rubin', 'R150'),
     )
     for name, expected in samples:
         self.assertEqual(special.encode(name), expected)
 def test_soundex_census(self):
     """Check the 'Census' Soundex variant, including two-code results."""
     encoder = Soundex(var='Census')
     samples = (
         ('Vandeusen', ('V532', 'D250')),
         ('van Deusen', ('V532', 'D250')),
         ('McDonald', 'M235'),
         ('la Cruz', ('L262', 'C620')),
         ('vanDamme', ('V535', 'D500')),
     )
     # Expected values mix code pairs (tuples) and single code strings.
     for surname, expected in samples:
         self.assertEqual(encoder.encode(surname), expected)
    def test_caversham(self):
        """Run Soundex, Metaphone, and ``self.pa`` over the Caversham pairs.

        Each line of variantNames.csv after the first holds eleven
        comma-separated fields: name 1 with its expected Soundex, Metaphone,
        and Caverphone codes, name 2 with the same three codes, and one
        "same" flag per algorithm. A flag equal to '1' requires the two
        names to share a code; any other value requires the codes to differ.
        """
        soundex = Soundex()
        metaphone = Metaphone()

        def verify(encode, names, codes, same_flag):
            """Check both names against their codes and the same flag."""
            left, right = names
            left_code, right_code = codes
            self.assertEqual(encode(left), left_code)
            self.assertEqual(encode(right), right_code)
            if same_flag == '1':
                self.assertEqual(encode(left), encode(right))
            else:
                self.assertNotEqual(encode(left), encode(right))

        with open(_corpus_file('variantNames.csv')) as cav_testset:
            # The first line is consumed without being checked
            # (presumably it is a header row).
            next(cav_testset)
            for record in cav_testset:
                (
                    name1,
                    soundex1,
                    metaphone1,
                    caverphone1,
                    name2,
                    soundex2,
                    metaphone2,
                    caverphone2,
                    soundex_same,
                    metaphone_same,
                    caverphone_same,
                ) = record.strip().split(',')
                pair = (name1, name2)

                verify(soundex.encode, pair, (soundex1, soundex2), soundex_same)
                verify(
                    metaphone.encode,
                    pair,
                    (metaphone1, metaphone2),
                    metaphone_same,
                )
                verify(
                    self.pa.encode,
                    pair,
                    (caverphone1, caverphone2),
                    caverphone_same,
                )
    def test_soundex(self):
        """Check Soundex.encode, its constructor options, and encode_alpha.

        Plain encodings go through ``self.pa``; the max_length, reverse, and
        zero_pad cases each build a fresh Soundex with the option under test.
        """
        first_codes = (
            ('', '0000'),
            # https://archive.org/stream/accessingindivid00moor#page/14/mode/2up
            ('Euler', 'E460'),
            ('Gauss', 'G200'),
            ('Hilbert', 'H416'),
            ('Knuth', 'K530'),
            ('Lloyd', 'L300'),
            ('Lukasieicz', 'L222'),
            ('Ellery', 'E460'),
            ('Ghosh', 'G200'),
            ('Heilbronn', 'H416'),
            ('Kant', 'K530'),
            ('Ladd', 'L300'),
            ('Lissajous', 'L222'),
            ('Rogers', 'R262'),
            ('Rodgers', 'R326'),
        )
        for name, code in first_codes:
            self.assertEqual(self.pa.encode(name), code)

        # Pairs of names whose codes must differ.
        distinct_pairs = (
            ('Rogers', 'Rodgers'),
            ('Sinclair', 'St. Clair'),
            ('Tchebysheff', 'Chebyshev'),
        )
        for left, right in distinct_pairs:
            self.assertNotEqual(self.pa.encode(left), self.pa.encode(right))

        more_codes = (
            # http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm#Related
            ('Htacky', 'H320'),
            ('Atacky', 'A320'),
            ('Schmit', 'S530'),
            ('Schneider', 'S536'),
            ('Pfister', 'P236'),
            ('Ashcroft', 'A261'),
            ('Asicroft', 'A226'),
            # https://en.wikipedia.org/wiki/Soundex
            ('Robert', 'R163'),
            ('Rupert', 'R163'),
            ('Rubin', 'R150'),
            ('Tymczak', 'T522'),
            # https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
            ('Peters', 'P362'),
            ('Peterson', 'P362'),
            ('Moskowitz', 'M232'),
            ('Moskovitz', 'M213'),
            ('Auerbach', 'A612'),
            ('Uhrbach', 'U612'),
            ('Jackson', 'J250'),
            ('Jackson-Jackson', 'J252'),
        )
        for name, code in more_codes:
            self.assertEqual(self.pa.encode(name), code)

        # max_length tests (the length is passed positionally)
        positional_lengths = (
            (10, 'Lincoln', 'L524500000'),
            (5, 'Lincoln', 'L5245'),
            (6, 'Christopher', 'C62316'),
        )
        for length, name, code in positional_lengths:
            self.assertEqual(Soundex(length).encode(name), code)

        # max_length bounds tests
        length_bounds = (
            (
                -1,
                'N4000000000000000000000000000000000000000000000000'
                '00000000000000',
            ),
            (0, 'N400'),
        )
        for length, code in length_bounds:
            self.assertEqual(Soundex(max_length=length).encode('Niall'), code)

        # reverse tests
        reversed_codes = (
            ('Rubin', 'N160'),
            ('Llyod', 'D400'),
            ('Lincoln', 'N425'),
            ('Knuth', 'H352'),
        )
        for name, code in reversed_codes:
            self.assertEqual(Soundex(reverse=True).encode(name), code)

        # zero_pad tests
        padding_cases = (
            (-1, False, 'Niall', 'N4'),
            (0, False, 'Niall', 'N4'),
            (0, True, 'Niall', 'N400'),
            (4, False, '', '0'),
            (4, True, '', '0000'),
        )
        for length, pad, name, code in padding_cases:
            encoder = Soundex(max_length=length, zero_pad=pad)
            self.assertEqual(encoder.encode(name), code)

        # encode_alpha
        alpha_codes = (
            ('Euler', 'ELR'),
            ('Gauss', 'GK'),
            ('Hilbert', 'HLPR'),
            ('Knuth', 'KNT'),
        )
        for name, code in alpha_codes:
            self.assertEqual(self.pa.encode_alpha(name), code)
Exemple #10
0
    'pshp_soundex_last_german': PSHPSoundexLast(german=True).encode,
    'pshp_soundex_last_ml8': PSHPSoundexLast(max_length=8).encode,
    'refined_soundex': RefinedSoundex().encode,
    'refined_soundex_vowels': RefinedSoundex(retain_vowels=True).encode,
    'refined_soundex_0pad_ml6': RefinedSoundex(zero_pad=True,
                                               max_length=6).encode,
    'reth_schek_phonetik': RethSchek().encode,
    'roger_root': RogerRoot().encode,
    'roger_root_nopad_ml8': RogerRoot(max_length=8, zero_pad=False).encode,
    'russell_index': RussellIndex().encode,
    'russell_index_alpha': RussellIndex().encode_alpha,
    'sfinxbis': SfinxBis().encode,
    'sfinxbis_ml6': SfinxBis(max_length=6).encode,
    'sound_d': SoundD().encode,
    'sound_d_ml8': SoundD(max_length=8).encode,
    'soundex': Soundex().encode,
    'soundex_reverse': Soundex(reverse=True).encode,
    'soundex_0pad_ml6': Soundex(zero_pad=True, max_length=6).encode,
    'soundex_special': Soundex(var='special').encode,
    'soundex_census': Soundex(var='Census').encode,
    'soundex_br': SoundexBR().encode,
    'spanish_metaphone': SpanishMetaphone().encode,
    'spanish_metaphone_modified': SpanishMetaphone(modified=True).encode,
    'spanish_metaphone_ml4': SpanishMetaphone(max_length=4).encode,
    'spfc': lambda _: spfc.encode('{0} {0}'.format(_)),
    'statistics_canada': StatisticsCanada().encode,
    'statistics_canada_ml8': StatisticsCanada(max_length=8).encode,
    'waahlin': Waahlin().encode,
    'waahlin_soundex': Waahlin(encoder=Soundex()).encode,
}
Exemple #11
0
    StatisticsCanada,
    Waahlin,
)

from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

# Encoder instances created once, at import time, as module-level names.
# alpha_sis backs the 'alpha_sis' entry of the algorithms table below; the
# other instances are presumably referenced by entries further down the
# table -- confirm against the full file.
alpha_sis = AlphaSIS()
daitch_mokotoff = DaitchMokotoff()
double_metaphone = DoubleMetaphone()
haase = Haase()
haase_primary = Haase(primary_only=True)
koelner = Koelner()
russell = RussellIndex()
sfinxbis = SfinxBis()
sfinxbis_6 = SfinxBis(max_length=6)
soundex_census = Soundex(var='Census')
spfc = SPFC()

algorithms = {
    'ainsworth':
    Ainsworth().encode,
    'alpha_sis':
    lambda _: ', '.join(alpha_sis.encode(_)),
    'bmpm':
    BeiderMorse().encode,
    'bmpm_german':
    BeiderMorse(language_arg='german').encode,
    'bmpm_french':
    BeiderMorse(language_arg='french').encode,
    'bmpm_gen_exact':
    BeiderMorse(match_mode='exact').encode,
Exemple #12
0
    StatisticsCanada,
)

from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

# Encoder instances created once at import time. russell and koelner are
# each shared by several entries of the algorithms table below (encode,
# encode_alpha, and the _to_alpha lambdas).
# NOTE(review): no use of spfc is visible in this part of the table --
# presumably it is referenced further down; confirm.
russell = RussellIndex()
koelner = Koelner()
spfc = SPFC()

algorithms = {
    'russell_index': russell.encode,
    'russell_index_num_to_alpha': lambda _: russell._to_alpha(  # noqa: SF01
        russell.encode(_)
    ),
    'russell_index_alpha': russell.encode_alpha,
    'soundex': Soundex().encode,
    'reverse_soundex': Soundex(reverse=True).encode,
    'soundex_0pad_ml6': Soundex(zero_pad=True, max_length=6).encode,
    'soundex_special': Soundex(var='special').encode,
    'soundex_census': Soundex(var='Census').encode,
    'refined_soundex': RefinedSoundex().encode,
    'refined_soundex_vowels': RefinedSoundex(retain_vowels=True).encode,
    'refined_soundex_0pad_ml6': RefinedSoundex(
        zero_pad=True, max_length=6
    ).encode,
    'daitch_mokotoff_soundex': DaitchMokotoff().encode,
    'koelner_phonetik': koelner.encode,
    'koelner_phonetik_num_to_alpha': lambda _: koelner._to_alpha(  # noqa: SF01
        koelner.encode(_)
    ),
    'koelner_phonetik_alpha': koelner.encode_alpha,
class UnigramCorpusTestCases(unittest.TestCase):
    """Test abydos.corpus.UnigramCorpus.

    The corpora below are class attributes: they are built once, when the
    class body executes, and are shared by every test method.
    """

    # Corpus filled by a single import of simple-ngrams.txt.
    simple_corpus = UnigramCorpus()
    simple_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # Corpus filled by importing that same file twice.
    double_corpus = UnigramCorpus()
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    sotu2015_sample = "Mr. Speaker, Mr. Vice President, Members of Congress,\
    my fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\
    years that dawned with terror touching our shores; that unfolded with a\
    new generation fighting two long and costly wars; that saw a vicious\
    recession spread across our nation and the world.\n It has been, and still\
    is, a hard time for many.\n\nBut tonight, we turn the page.\n Tonight,\
    after a breakthrough year for America, our economy is growing and creating\
    jobs at the fastest pace since 1999.\n Our unemployment rate is now lower\
    than it was before the financial crisis.\n More of our kids are graduating\
    than ever before.\n More of our people are insured than ever before.\n And\
    we are as free from the grip of foreign oil as we've been in almost 30\
    years.\n\nTonight, for the first time since 9/11, our combat mission in\
    Afghanistan is over.\n Six years ago, nearly 180,000 American troops\
    served in Iraq and Afghanistan.\n Today, fewer than 15,000 remain.\n And\
    we salute the courage and sacrifice of every man and woman in this 9/11\
    Generation who has served to keep us safe.\n We are humbled and grateful\
    for your service.\n\nAmerica, for all that we have endured; for all the\
    grit and hard work required to come back; for all the tasks that lie\
    ahead, know this: The shadow of crisis has passed, and the State of the\
    Union is strong.\n\nAt this moment -- with a growing economy, shrinking\
    deficits, bustling industry, booming energy production -- we have risen\
    from recession freer to write our own future than any other nation on\
    Earth.\n It's now up to us to choose who we want to be over the next 15\
    years and for decades to come.\n\nWill we accept an economy where only a\
    few of us do spectacularly well?\n Or will we commit ourselves to an\
    economy that generates rising incomes and chances for everyone who makes\
    the effort?\n\nWill we approach the world fearful and reactive, dragged\
    into costly conflicts that strain our military and set back our\
    standing?\n Or will we lead wisely, using all elements of our power to\
    defeat new threats and protect our planet?\n\nWill we allow ourselves to\
    be sorted into factions and turned against one another?\n Or will we\
    recapture the sense of common purpose that has always propelled America\
    forward?\n\nIn two weeks, I will send this Congress a budget filled with\
    ideas that are practical, not partisan.\n And in the months ahead, I'll\
    crisscross the country making a case for those ideas.\n So tonight, I want\
    to focus less on a checklist of proposals, and focus more on the values at\
    stake in the choices before us."

    # Corpus built directly from the text sample above.
    sotu2015_corpus = UnigramCorpus(sotu2015_sample)

    # Empty corpus configured with Soundex().encode as its word_transform.
    sdx_corpus = UnigramCorpus(word_transform=Soundex().encode)

    # Empty corpus configured with a QSkipgrams word_tokenizer.
    qsg_corpus = UnigramCorpus(
        word_tokenizer=QSkipgrams(qval=3, start_stop=''))

    # Corpus imported from simple-ngrams-pos.txt (presumably n-grams carrying
    # part-of-speech tags -- confirm against the corpus file).
    pos_corpus = UnigramCorpus()
    pos_corpus.gng_importer(_corpus_file('simple-ngrams-pos.txt'))

    def test_unigram_corpus_init(self):
        """Test abydos.corpus.UnigramCorpus.__init__."""
        self.assertIsInstance(UnigramCorpus(), UnigramCorpus)
        self.assertIsInstance(self.sotu2015_corpus, UnigramCorpus)

    def test_unigram_corpus_gng_importer(self):
        """Test abydos.corpus.UnigramCorpus.gng_importer.

        Imports simple-ngrams.txt into the Soundex-transforming and the
        skipgram-tokenizing corpora and compares the stored items, in
        order, with the expected (key, value) pairs. Finally checks that no
        key of the POS corpus contains an underscore.
        """
        self.assertIsInstance(self.simple_corpus, UnigramCorpus)
        self.assertIsInstance(self.simple_corpus.corpus, defaultdict)

        # skip tests of UnigramCorpus on Python < 3.6 (lack ordered dict)
        if sys.version_info < (3, 6):
            return

        # Resolve the corpus through _corpus_file, like the class attributes
        # above, so the test does not depend on the working directory.
        self.sdx_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))
        self.assertEqual(
            list(self.sdx_corpus.corpus.items()),
            [
                ('T000', (20, 20)),
                ('Q200', (2, 2)),
                ('B650', (3, 3)),
                ('F200', (1, 1)),
                ('J513', (4, 4)),
                ('O160', (6, 6)),
                ('L200', (1, 1)),
                ('D200', (5, 5)),
                ('T220', (2, 2)),
                ('Q216', (1, 1)),
                ('B651', (1, 1)),
                ('F251', (1, 1)),
                ('O163', (3, 3)),
                ('T420', (2, 2)),
                ('L232', (1, 1)),
            ],
        )

        self.qsg_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))
        # Only every second item among the first 30 is compared.
        self.assertEqual(
            list(self.qsg_corpus.corpus.items())[:30:2],
            [
                ('the', (27, 27)),
                ('quc', (5, 5)),
                ('qic', (5, 5)),
                ('qck', (5, 5)),
                ('uik', (5, 5)),
                ('ick', (5, 5)),
                ('brw', (5, 5)),
                ('bow', (5, 5)),
                ('bwn', (5, 5)),
                ('ron', (5, 5)),
                ('own', (5, 5)),
                ('jum', (5, 5)),
                ('jue', (6, 5)),
                ('jmp', (5, 5)),
                ('jmd', (5, 5)),
            ],
        )

        # Only the keys matter here; assertNotIn reports the offending term.
        for term in self.pos_corpus.corpus:
            self.assertNotIn('_', term)

    def test_unigram_corpus_save_load_corpus(self):
        """Test abydos.corpus.UnigramCorpus.save_corpus & .load_corpus.

        Saves the SOTU corpus to a temporary file, loads it back, and checks
        that the file written is not empty.
        """
        handle, path = tempfile.mkstemp('.dat')
        # Only the path is used below; release the descriptor immediately so
        # it cannot leak and is not held open while the file is rewritten.
        os.close(handle)
        try:
            self.sotu2015_corpus.save_corpus(path)
            self.sotu2015_corpus.load_corpus(path)
            self.assertGreater(os.stat(path).st_size, 0)
        finally:
            # mkstemp leaves deletion to the caller; clean up even when one
            # of the steps above raises.
            os.remove(path)

    def test_unigram_corpus_idf(self):
        """Test abydos.corpus.UnigramCorpus.idf."""
        # string-style tests
        self.assertAlmostEqual(self.simple_corpus.idf('the'), 0.69314718056)
        self.assertAlmostEqual(self.simple_corpus.idf('quick'), 2.3978952728)
        # 'trolley' is expected to produce an infinite idf.
        self.assertAlmostEqual(self.simple_corpus.idf('trolley'), float('inf'))
Exemple #14
0
class SoundexTestCases(unittest.TestCase):
    """Exercise the Soundex encoder.

    Covers abydos.phonetic.Soundex.encode, including the options it accepts
    per call, and the module-level soundex wrapper.
    """

    pa = Soundex()

    def test_soundex(self):
        """Check Soundex.encode with and without per-call options."""
        first_codes = (
            ('', '0000'),
            # https://archive.org/stream/accessingindivid00moor#page/14/mode/2up
            ('Euler', 'E460'),
            ('Gauss', 'G200'),
            ('Hilbert', 'H416'),
            ('Knuth', 'K530'),
            ('Lloyd', 'L300'),
            ('Lukasieicz', 'L222'),
            ('Ellery', 'E460'),
            ('Ghosh', 'G200'),
            ('Heilbronn', 'H416'),
            ('Kant', 'K530'),
            ('Ladd', 'L300'),
            ('Lissajous', 'L222'),
            ('Rogers', 'R262'),
            ('Rodgers', 'R326'),
        )
        for name, code in first_codes:
            self.assertEqual(self.pa.encode(name), code)

        # Pairs of names whose codes must differ.
        distinct_pairs = (
            ('Rogers', 'Rodgers'),
            ('Sinclair', 'St. Clair'),
            ('Tchebysheff', 'Chebyshev'),
        )
        for left, right in distinct_pairs:
            self.assertNotEqual(self.pa.encode(left), self.pa.encode(right))

        more_codes = (
            # http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm#Related
            ('Htacky', 'H320'),
            ('Atacky', 'A320'),
            ('Schmit', 'S530'),
            ('Schneider', 'S536'),
            ('Pfister', 'P236'),
            ('Ashcroft', 'A261'),
            ('Asicroft', 'A226'),
            # https://en.wikipedia.org/wiki/Soundex
            ('Robert', 'R163'),
            ('Rupert', 'R163'),
            ('Rubin', 'R150'),
            ('Tymczak', 'T522'),
            # https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
            ('Peters', 'P362'),
            ('Peterson', 'P362'),
            ('Moskowitz', 'M232'),
            ('Moskovitz', 'M213'),
            ('Auerbach', 'A612'),
            ('Uhrbach', 'U612'),
            ('Jackson', 'J250'),
            ('Jackson-Jackson', 'J252'),
        )
        for name, code in more_codes:
            self.assertEqual(self.pa.encode(name), code)

        # max_length tests (the length is passed positionally)
        positional_lengths = (
            ('Lincoln', 10, 'L524500000'),
            ('Lincoln', 5, 'L5245'),
            ('Christopher', 6, 'C62316'),
        )
        for name, length, code in positional_lengths:
            self.assertEqual(self.pa.encode(name, length), code)

        # max_length bounds tests
        length_bounds = (
            (
                -1,
                'N4000000000000000000000000000000000000000000000000'
                '00000000000000',
            ),
            (0, 'N400'),
        )
        for length, code in length_bounds:
            self.assertEqual(self.pa.encode('Niall', max_length=length), code)

        # reverse tests
        reversed_codes = (
            ('Rubin', 'N160'),
            ('Llyod', 'D400'),
            ('Lincoln', 'N425'),
            ('Knuth', 'H352'),
        )
        for name, code in reversed_codes:
            self.assertEqual(self.pa.encode(name, reverse=True), code)

        # zero_pad tests
        padding_cases = (
            ('Niall', -1, False, 'N4'),
            ('Niall', 0, False, 'N4'),
            ('Niall', 0, True, 'N400'),
            ('', 4, False, '0'),
            ('', 4, True, '0000'),
        )
        for name, length, pad, code in padding_cases:
            self.assertEqual(
                self.pa.encode(name, max_length=length, zero_pad=pad), code
            )

        # Test wrapper
        self.assertEqual(soundex('Euler'), 'E460')

    def test_soundex_special(self):
        """Check encode with var='special' (the 1880-1910 variant)."""
        special_codes = (
            ('Ashcroft', 'A226'),
            ('Asicroft', 'A226'),
            ('AsWcroft', 'A226'),
            ('Rupert', 'R163'),
            ('Rubin', 'R150'),
        )
        for name, code in special_codes:
            self.assertEqual(self.pa.encode(name, var='special'), code)

    def test_soundex_census(self):
        """Check encode with var='Census', including two-code results."""
        census_codes = (
            ('Vandeusen', ('V532', 'D250')),
            ('van Deusen', ('V532', 'D250')),
            ('McDonald', 'M235'),
            ('la Cruz', ('L262', 'C620')),
            ('vanDamme', ('V535', 'D500')),
        )
        # Expected values mix code pairs (tuples) and single code strings.
        for name, expected in census_codes:
            self.assertEqual(self.pa.encode(name, var='Census'), expected)
Exemple #15
0
# Encoder instances created once at import time, each built with no
# arguments. russell and soundex are referenced by the algorithms table
# below; the other instances are presumably used by entries further down
# the table -- confirm against the full file.
onca = ONCA()
parmar_kumbharana = ParmarKumbharana()
phonem = Phonem()
phonet = Phonet()
phonetic_spanish = PhoneticSpanish()
phonex = Phonex()
phonix = Phonix()
pshp_soundex_first = PSHPSoundexFirst()
pshp_soundex_last = PSHPSoundexLast()
refined_soundex = RefinedSoundex()
reth_schek = RethSchek()
roger_root = RogerRoot()
russell = RussellIndex()
sfinxbis = SfinxBis()
sound_d = SoundD()
soundex = Soundex()
soundex_br = SoundexBR()
spanish_metaphone = SpanishMetaphone()
spfc = SPFC()
statistics_canada = StatisticsCanada()

algorithms = {
    'russell_index':
    lambda _: str(russell.encode(_)),
    'russell_index_num_to_alpha':
    lambda _: russell._to_alpha(  # noqa: SF01
        russell.encode(_)),
    'russell_index_alpha':
    russell.encode_alpha,
    'soundex':
    soundex.encode,
Exemple #16
0
class PhoneticDistanceTestCases(unittest.TestCase):
    """Test phonetic distance functions.

    abydos.distance.PhoneticDistance
    """

    sdx = PhoneticDistance(transforms=Soundex)
    sdx_lev = PhoneticDistance(transforms=Soundex(), metric=Levenshtein())
    # Having mixed instantiated & uninstantiated classes is... weird... but
    # this covers another line of code.
    three_jaro = PhoneticDistance(
        transforms=[Porter2, Metaphone, OmissionKey()],
        metric=JaroWinkler,
        encode_alpha=True,
    )

    def test_phonetic_distance_dist(self):
        """Check PhoneticDistance.dist for the three configured measures.

        The same string pairs are fed to ``self.sdx``, ``self.sdx_lev``, and
        ``self.three_jaro`` in turn; the expected values are listed per
        measure in the order of the pairs.
        """
        # Base cases
        base_pairs = (
            ('', ''),
            ('a', ''),
            ('', 'a'),
            ('abc', ''),
            ('', 'abc'),
            ('abc', 'abc'),
            ('abcd', 'efgh'),
        )
        name_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        )

        # (measure, exact results for base_pairs, approx results for names)
        suites = (
            (
                self.sdx,
                (0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0),
                (1.0, 1.0, 0.0, 0.0, 1.0),
            ),
            (
                self.sdx_lev,
                (0.0, 0.25, 0.25, 0.75, 0.75, 0.0, 0.5),
                (0.5, 0.5, 0.0, 0.0, 0.5),
            ),
        )
        for measure, exact, approximate in suites:
            for (src, tar), expected in zip(base_pairs, exact):
                self.assertEqual(measure.dist(src, tar), expected)
            for (src, tar), expected in zip(name_pairs, approximate):
                self.assertAlmostEqual(measure.dist(src, tar), expected)

        # three_jaro: the last base pair is compared approximately, not
        # exactly, so it is handled apart from the other six.
        jaro_exact = (0.0, 1.0, 1.0, 1.0, 1.0, 0.0)
        for (src, tar), expected in zip(base_pairs[:-1], jaro_exact):
            self.assertEqual(self.three_jaro.dist(src, tar), expected)
        last_src, last_tar = base_pairs[-1]
        self.assertAlmostEqual(
            self.three_jaro.dist(last_src, last_tar), 0.4722222
        )

        jaro_approximate = (1.0, 1.0, 0.0, 0.0, 0.0)
        for (src, tar), expected in zip(name_pairs, jaro_approximate):
            self.assertAlmostEqual(self.three_jaro.dist(src, tar), expected)

        # More tests to complete coverage
        self.assertEqual(PhoneticDistance().dist('a', 'ab'), 1.0)
        # Unsupported positional arguments must be rejected with TypeError.
        for bad_argument in (['hello!'], 3.14):
            with self.assertRaises(TypeError):
                PhoneticDistance(bad_argument)
        with self.assertRaises(TypeError):
            PhoneticDistance(metric=3.14)
        # A plain callable is accepted as the transform.
        lowercasing = PhoneticDistance(lambda s: s.lower())
        self.assertEqual(lowercasing.dist('ONE', 'one'), 0.0)

    def test_phonetic_distance_dist_abs(self):
        """Test abydos.distance.PhoneticDistance.dist_abs."""
        # Base cases
        self.assertEqual(self.sdx.dist_abs('', ''), 0)
        self.assertEqual(self.sdx.dist_abs('a', ''), 1)
        self.assertEqual(self.sdx.dist_abs('', 'a'), 1)
        self.assertEqual(self.sdx.dist_abs('abc', ''), 1)
        self.assertEqual(self.sdx.dist_abs('', 'abc'), 1)
        self.assertEqual(self.sdx.dist_abs('abc', 'abc'), 0)
        self.assertEqual(self.sdx.dist_abs('abcd', 'efgh'), 1)

        self.assertAlmostEqual(self.sdx.dist_abs('Nigel', 'Niall'), 1)
        self.assertAlmostEqual(self.sdx.dist_abs('Niall', 'Nigel'), 1)
        self.assertAlmostEqual(self.sdx.dist_abs('Colin', 'Coiln'), 0)
        self.assertAlmostEqual(self.sdx.dist_abs('Coiln', 'Colin'), 0)
        self.assertAlmostEqual(self.sdx.dist_abs('ATCAACGAGT', 'AACGATTAG'), 1)

        self.assertEqual(self.sdx_lev.dist_abs('', ''), 0)
        self.assertEqual(self.sdx_lev.dist_abs('a', ''), 1)
        self.assertEqual(self.sdx_lev.dist_abs('', 'a'), 1)
        self.assertEqual(self.sdx_lev.dist_abs('abc', ''), 3)
        self.assertEqual(self.sdx_lev.dist_abs('', 'abc'), 3)
        self.assertEqual(self.sdx_lev.dist_abs('abc', 'abc'), 0)
        self.assertEqual(self.sdx_lev.dist_abs('abcd', 'efgh'), 2)

        self.assertAlmostEqual(self.sdx_lev.dist_abs('Nigel', 'Niall'), 2)
        self.assertAlmostEqual(self.sdx_lev.dist_abs('Niall', 'Nigel'), 2)
        self.assertAlmostEqual(self.sdx_lev.dist_abs('Colin', 'Coiln'), 0)
        self.assertAlmostEqual(self.sdx_lev.dist_abs('Coiln', 'Colin'), 0)
        self.assertAlmostEqual(
            self.sdx_lev.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2)