Beispiel #1
0
    def test_caversham(self):
        """Check SoundEx, Metaphone, & Caverphone against the Caversham set.

        Every row read from ``variantNames.csv`` after the first line is
        split on commas into two names, the expected code of each name
        under the three algorithms, and one flag per algorithm that is
        ``'1'`` when both names are expected to share a code.
        """
        soundex = Soundex()
        metaphone = Metaphone()

        def verify(algorithm, first, second, code1, code2, same_flag):
            """Assert both expected codes, then their (in)equality."""
            self.assertEqual(algorithm.encode(first), code1)
            self.assertEqual(algorithm.encode(second), code2)
            if same_flag == '1':
                self.assertEqual(
                    algorithm.encode(first), algorithm.encode(second)
                )
            else:
                self.assertNotEqual(
                    algorithm.encode(first), algorithm.encode(second)
                )

        with open(_corpus_file('variantNames.csv')) as cav_testset:
            # The first line is not a data row (presumably the CSV header).
            next(cav_testset)
            for row in cav_testset:
                # Exactly eleven comma-separated fields are required;
                # anything else raises ValueError on unpacking.
                (
                    name1, sdx1, mph1, cav1,
                    name2, sdx2, mph2, cav2,
                    sdx_same, mph_same, cav_same,
                ) = row.strip().split(',')

                verify(soundex, name1, name2, sdx1, sdx2, sdx_same)
                verify(metaphone, name1, name2, mph1, mph2, mph_same)
                # self.pa is the encoder checked against the Caverphone
                # columns of the file.
                verify(self.pa, name1, name2, cav1, cav2, cav_same)
    def test_caversham(self):
        """Run the Caversham variant-name corpus through three encoders.

        For each data row of ``variantNames.csv`` the SoundEx, Metaphone and
        ``self.pa`` encodings of both names are compared with the codes
        stored in the row, and the two encodings are required to be equal
        or unequal according to the row's per-algorithm ``'1'`` flag.
        """
        soundex = Soundex()
        metaphone = Metaphone()

        with open(_corpus_file('variantNames.csv')) as cav_testset:
            next(cav_testset)  # drop the first line before reading rows
            for cav_line in cav_testset:
                fields = cav_line.strip().split(',')
                # Unpacking enforces the eleven-column layout of a row.
                (
                    name1, soundex1, metaphone1, caverphone1,
                    name2, soundex2, metaphone2, caverphone2,
                    soundex_same, metaphone_same, caverphone_same,
                ) = fields

                # --- SoundEx ---
                self.assertEqual(soundex.encode(name1), soundex1)
                self.assertEqual(soundex.encode(name2), soundex2)
                compare = (
                    self.assertEqual
                    if soundex_same == '1'
                    else self.assertNotEqual
                )
                compare(soundex.encode(name1), soundex.encode(name2))

                # --- Metaphone ---
                self.assertEqual(metaphone.encode(name1), metaphone1)
                self.assertEqual(metaphone.encode(name2), metaphone2)
                compare = (
                    self.assertEqual
                    if metaphone_same == '1'
                    else self.assertNotEqual
                )
                compare(metaphone.encode(name1), metaphone.encode(name2))

                # --- encoder under test (Caverphone columns) ---
                self.assertEqual(self.pa.encode(name1), caverphone1)
                self.assertEqual(self.pa.encode(name2), caverphone2)
                compare = (
                    self.assertEqual
                    if caverphone_same == '1'
                    else self.assertNotEqual
                )
                compare(self.pa.encode(name1), self.pa.encode(name2))
Beispiel #3
0
    def test_metaphone(self):
        """Test abydos.phonetic.Metaphone."""
        # Empty input and input without letters both encode to ''.
        for blank in ('', '...'):
            self.assertEqual(self.pa.encode(blank), '')

        # http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
        # Names are grouped by the code they share; all of these go through
        # self.pa4 (presumably a length-limited encoder -- see the fixture).
        shared_codes = (
            ('FXPL', ('Fishpool', 'Fishpoole')),
            ('JLTL', ('Gellately', 'Gelletly')),
            ('LWRS', ('Lowers', 'Lowerson')),
            (
                'MLBR',
                (
                    'Mallabar',
                    'Melbert',
                    'Melbourn',
                    'Melbourne',
                    'Melburg',
                    'Melbury',
                    'Milberry',
                    'Milborn',
                    'Milbourn',
                    'Milbourne',
                    'Milburn',
                    'Milburne',
                    'Millberg',
                    'Mulberry',
                    'Mulbery',
                    'Mulbry',
                ),
            ),
            (
                'SP',
                (
                    'Saipy',
                    'Sapey',
                    'Sapp',
                    'Sappy',
                    'Sepey',
                    'Seppey',
                    'Sopp',
                    'Zoppie',
                    'Zoppo',
                    'Zupa',
                    'Zupo',
                    'Zuppa',
                ),
            ),
        )
        for code, names in shared_codes:
            for name in names:
                self.assertEqual(self.pa4.encode(name), code)

        # assorted tests to complete code coverage
        coverage_cases = (
            ('Xavier', 'SFR'),
            ('Acacia', 'AKX'),
            ('Schuler', 'SKLR'),
            ('Sign', 'SN'),
            ('Signed', 'SNT'),
            ('Horatio', 'HRX'),
            ('Ignatio', 'IKNX'),
            ('Lucretia', 'LKRX'),
            ('Wright', 'RKT'),
            ('White', 'WT'),
            ('Black', 'BLK'),
            ('Chance', 'XNS'),
            ('Dgengo', 'JJNK'),
            ('Ghost', 'ST'),
            ('Qing', 'KNK'),
            ('Asia', 'AX'),
            ('Ax', 'AKS'),
            ('Thegn', '0N'),
            ('acknowledged', 'AKNLJT'),
            ('awkward', 'AKWRT'),
            ('admitted', 'ATMTT'),
            ('dahl', 'TL'),
            ('autobiography', 'ATBKRF'),
            ('exaggerate', 'EKSKRT'),
            ('pitch', 'PX'),
            ('chracter', 'KRKTR'),
        )
        # assorted tests to complete branch coverage
        branch_cases = (
            ('Lamb', 'LM'),
            ('science', 'SNS'),
        )
        for word, code in coverage_cases + branch_cases:
            self.assertEqual(self.pa.encode(word), code)

        # max_length bounds tests: -1 and 0 must both leave 'NL' intact
        for bound in (-1, 0):
            self.assertEqual(
                Metaphone(max_length=bound).encode('Niall'), 'NL'
            )

        # Test wrapper
        self.assertEqual(metaphone('Xavier'), 'SFR')
Beispiel #4
0
 'double_metaphone': DoubleMetaphone().encode,
 'eudex': Eudex().encode,
 'fonem': FONEM().encode,
 'fuzzy_soundex': FuzzySoundex().encode,
 'fuzzy_soundex_0pad_ml8': FuzzySoundex(max_length=8, zero_pad=True).encode,
 'haase_phonetik': Haase().encode,
 'haase_phonetik_primary': Haase(primary_only=True).encode,
 'henry_early': HenryEarly().encode,
 'henry_early_ml8': HenryEarly(max_length=8).encode,
 'koelner_phonetik': Koelner().encode,
 'koelner_phonetik_alpha': Koelner().encode_alpha,
 'lein': LEIN().encode,
 'lein_nopad_ml8': LEIN(max_length=8, zero_pad=False).encode,
 'metasoundex': MetaSoundex().encode,
 'metasoundex_es': MetaSoundex(lang='es').encode,
 'metaphone': Metaphone().encode,
 'mra': MRA().encode,
 'norphone': Norphone().encode,
 'nrl': NRL().encode,
 'nysiis': NYSIIS().encode,
 'nysiis_modified': NYSIIS(modified=True).encode,
 'nysiis_ml_inf': NYSIIS(max_length=-1).encode,
 'onca': ONCA().encode,
 'onca_nopad_ml8': ONCA(max_length=8, zero_pad=False).encode,
 'parmar_kumbharana': ParmarKumbharana().encode,
 'phonem': Phonem().encode,
 'phonet_1': Phonet().encode,
 'phonet_2': Phonet(mode=2).encode,
 'phonet_1_none': Phonet(lang='none').encode,
 'phonet_2_none': Phonet(mode=2, lang='none').encode,
 'phonetic_spanish': PhoneticSpanish().encode,
Beispiel #5
0
 koelner.encode,
 'koelner_phonetik_num_to_alpha': (
     lambda _: koelner._to_alpha(koelner.encode(_))  # noqa: SF01
 ),
 'koelner_phonetik_alpha':
 koelner.encode_alpha,
 'lein':
 LEIN().encode,
 'lein_nopad_ml8':
 LEIN(max_length=8, zero_pad=False).encode,
 'metasoundex':
 MetaSoundex().encode,
 'metasoundex_es':
 MetaSoundex(lang='es').encode,
 'metaphone':
 Metaphone().encode,
 'mra':
 MRA().encode,
 'norphone':
 Norphone().encode,
 'nrl':
 NRL().encode,
 'nysiis':
 NYSIIS().encode,
 'nysiis_modified':
 NYSIIS(modified=True).encode,
 'nysiis_ml_inf':
 NYSIIS(max_length=-1).encode,
 'onca':
 ONCA().encode,
 'onca_nopad_ml8':
Beispiel #6
0
# One instance of each phonetic-algorithm class, constructed with its default
# arguments and bound to a module-level name.
# NOTE(review): presumably these are shared by wrapper/lookup code further
# down the file -- confirm against the callers.
alpha_sis = AlphaSIS()
bm = BeiderMorse()
caverphone = Caverphone()
davidson = Davidson()
dm = DaitchMokotoff()
dolby = Dolby()
double_metaphone = DoubleMetaphone()
eudex = Eudex()
fonem = FONEM()
fuzzy_soundex = FuzzySoundex()
haase = Haase()
henry_early = HenryEarly()
koelner = Koelner()
lein = Lein()
metaphone = Metaphone()
metasoundex = MetaSoundex()
mra = MRA()
norphone = Norphone()
nrl = NRL()
nysiis = NYSIIS()
onca = ONCA()
parmar_kumbharana = ParmarKumbharana()
phonem = Phonem()
phonet = Phonet()
phonetic_spanish = PhoneticSpanish()
phonex = Phonex()
phonix = Phonix()
pshp_soundex_first = PSHPSoundexFirst()
pshp_soundex_last = PSHPSoundexLast()
refined_soundex = RefinedSoundex()
Beispiel #7
0
class MetaphoneTestCases(unittest.TestCase):
    """Test Metaphone functions.

    test cases for abydos.phonetic.Metaphone
    """

    # Single default-constructed encoder shared by every assertion below.
    pa = Metaphone()

    def test_metaphone(self):
        """Test abydos.phonetic.Metaphone."""
        # Empty input and input without letters both encode to ''.
        for blank in ('', '...'):
            self.assertEqual(self.pa.encode(blank), '')

        # http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
        # Names are grouped by the code they share; 4 is passed as the
        # second positional argument of encode() for every one of them.
        shared_codes = (
            ('FXPL', ('Fishpool', 'Fishpoole')),
            ('JLTL', ('Gellately', 'Gelletly')),
            ('LWRS', ('Lowers', 'Lowerson')),
            (
                'MLBR',
                (
                    'Mallabar',
                    'Melbert',
                    'Melbourn',
                    'Melbourne',
                    'Melburg',
                    'Melbury',
                    'Milberry',
                    'Milborn',
                    'Milbourn',
                    'Milbourne',
                    'Milburn',
                    'Milburne',
                    'Millberg',
                    'Mulberry',
                    'Mulbery',
                    'Mulbry',
                ),
            ),
            (
                'SP',
                (
                    'Saipy',
                    'Sapey',
                    'Sapp',
                    'Sappy',
                    'Sepey',
                    'Seppey',
                    'Sopp',
                    'Zoppie',
                    'Zoppo',
                    'Zupa',
                    'Zupo',
                    'Zuppa',
                ),
            ),
        )
        for code, names in shared_codes:
            for name in names:
                self.assertEqual(self.pa.encode(name, 4), code)

        # assorted tests to complete code coverage
        coverage_cases = (
            ('Xavier', 'SFR'),
            ('Acacia', 'AKX'),
            ('Schuler', 'SKLR'),
            ('Sign', 'SN'),
            ('Signed', 'SNT'),
            ('Horatio', 'HRX'),
            ('Ignatio', 'IKNX'),
            ('Lucretia', 'LKRX'),
        )
        # assorted tests to complete branch coverage
        branch_cases = (
            ('Lamb', 'LM'),
            ('science', 'SNS'),
        )
        for word, code in coverage_cases + branch_cases:
            self.assertEqual(self.pa.encode(word), code)

        # max_length bounds tests: -1 and 0 must both leave 'NL' intact
        for bound in (-1, 0):
            self.assertEqual(self.pa.encode('Niall', max_length=bound), 'NL')

        # Test wrapper
        self.assertEqual(metaphone('Xavier'), 'SFR')
Beispiel #8
0
    def map_answers(self):
        """Map the cleaned survey answers to DrugBank ids in three passes.

        1. Exact match: the whole answer is a key of ``self.drug_dictionary``
           with a non-empty value.
        2. First-word match: the text before the first run of non-word
           characters is such a key; the answer is then added to the
           dictionary under the same ids.
        3. Phonetic match: answers left over from passes 1 and 2 are matched
           by Metaphone encoding against every dictionary key. An encoding
           shared by entries whose ids differ is treated as ambiguous and
           ignored.

        Side effects:
            * (re)creates ``self.mapped_survey_answers``,
              ``self.first_name_mapped_survey_answers``,
              ``self.unmapped_survey_answers``, ``self.mapped_by_encoding``
              and ``self.unmapped_by_encoding``;
            * increments ``self.drug_frequencies[db_id]`` for every id found
              in passes 1 and 2 (pass 3 does not touch the frequencies);
            * adds first-word and phonetic matches to
              ``self.drug_dictionary``.
        """
        # lists for mapped answers in different categories
        self.mapped_survey_answers = []
        self.first_name_mapped_survey_answers = []
        self.unmapped_survey_answers = []

        # loop through answers and map them
        for answer in self.meds_cleaned:

            # try to get the drugbank ids for the whole answer
            db_ids = self.drug_dictionary.get(answer)

            # Isolate the first word: drop everything from the first run of
            # non-word characters onwards. The pattern is a raw string so
            # that ``\w`` is not parsed as an (invalid) string escape.
            first_word = re.sub(r'[^\w]+.*$', '', answer)
            first_word_db_ids = self.drug_dictionary.get(first_word)

            if db_ids:
                # the full answer is already in the drug dictionary
                self.mapped_survey_answers.append(answer)
                mapped_db_ids = db_ids
            elif first_word_db_ids:
                # only the first word is known: register the full answer
                # under the first word's ids so later lookups hit directly
                self.drug_dictionary[answer] = first_word_db_ids
                self.first_name_mapped_survey_answers.append(answer)
                mapped_db_ids = first_word_db_ids
            else:
                self.unmapped_survey_answers.append(answer)
                mapped_db_ids = set()

            # for each of the drugbank ids, update the frequency dictionary
            for db_id in mapped_db_ids:
                self.drug_frequencies[db_id] += 1

        # -- use metaphone to map phonetic encodings to drugbank ids --

        mp = Metaphone()
        # encoding -> drugbank ids of the first dictionary entry seen with it
        encoded_drug_dict = {}
        # encodings shared by phonetically identical but distinct drugs;
        # a set keeps the membership tests below constant-time
        ambiguous_encodings = set()

        for drug, drug_db_ids in self.drug_dictionary.items():
            encoding = mp.encode(drug)
            if encoding not in encoded_drug_dict:
                encoded_drug_dict[encoding] = drug_db_ids
            elif drug_db_ids != encoded_drug_dict[encoding]:
                ambiguous_encodings.add(encoding)

        # keep only encodings that identify a single set of drugbank ids
        encoded_drug_dict = {
            key: val
            for key, val in encoded_drug_dict.items()
            if key not in ambiguous_encodings
        }

        # Split the still-unmapped answers by whether their encoding is a
        # known, unambiguous one. Each answer is encoded once, and matches
        # are added to the drug dictionary under the encoding's ids.
        self.mapped_by_encoding = []
        self.unmapped_by_encoding = []
        for answer in self.unmapped_survey_answers:
            encoding = mp.encode(answer)
            if encoding in encoded_drug_dict:
                self.mapped_by_encoding.append(answer)
                self.drug_dictionary[answer] = encoded_drug_dict[encoding]
            else:
                self.unmapped_by_encoding.append(answer)