def main():
    """
    Demonstration of the Soundex module.

    Prints a banner, then encodes two parallel lists of similar-sounding
    surnames with ``soundex.soundex`` and prints each pair of names next to
    its Soundex codes.
    """
    print("-----------------")
    print("| codedrome.com |")
    print("| Soundex |")
    print("-----------------\n")

    names1 = [
        "Johnson", "Adams", "Davis", "Simons", "Richards", "Taylor",
        "Carter", "Stevenson", "Taylor", "Smith", "McDonald", "Harris",
        "Sim", "Williams", "Baker", "Wells", "Fraser", "Jones",
        "Wilks", "Hunt", "Sanders", "Parsons", "Robson", "Harker",
    ]
    names2 = [
        "Jonson", "Addams", "Davies", "Simmons", "Richardson", "Tailor",
        "Chater", "Stephenson", "Naylor", "Smythe", "MacDonald", "Harrys",
        "Sym", "Wilson", "Barker", "Wills", "Frazer", "Johns",
        "Wilkinson", "Hunter", "Saunders", "Pearson", "Robertson", "Parker",
    ]

    # Both lists are literals of the same length, so zip() visits every pair.
    for name1, name2 in zip(names1, names2):
        s1 = soundex.soundex(name1)
        s2 = soundex.soundex(name2)
        print("{:20s}{:4s} {:20s}{:4s}".format(name1, s1, name2, s2))
def similar(word="hallo", filename="/usr/share/dict/words"):
    """Find the ASCII-only words of a word list that share a Soundex code with *word*.

    The file *filename* is read in full and split on whitespace; words that
    contain any character with a code point of 128 or above are ignored.
    The matching words are printed and returned as a list, in file order.
    """
    def is_ascii(candidate):
        # all() also copes with an empty string, where reduce() without an
        # initial value would raise TypeError.
        return all(ord(ch) < 128 for ch in candidate)

    # Context manager so the file handle is released even if reading fails.
    with open(filename) as wordfile:
        ascii_words = [w for w in wordfile.read().split() if is_ascii(w)]

    wanted = soundex(word)
    result = [w for w in ascii_words if soundex(w) == wanted]
    print(result)
    return result
def soundex_distance(ovv_snd, cand):
    """Return the Levenshtein distance between *ovv_snd* and the Soundex code of *cand*.

    The first attempt decodes *cand* as UTF-8 (ignoring errors). If that
    raises a Unicode error, the problem is reported on stdout and the
    computation is retried with an ASCII conversion of *cand*. A TypeError is
    reported as well and yields the fixed distance ``10.``.
    """
    try:
        left = unicode(ovv_snd)
        right = soundex.soundex(cand.decode("utf-8", "ignore"))
        distance = Levenshtein.distance(left, right)
    except UnicodeEncodeError:
        print('UnicodeEncodeError[ovv_snd]: %s %s' % (ovv_snd, cand))
        ascii_code = soundex.soundex(cand.encode("ascii", "ignore"))
        distance = Levenshtein.distance(ovv_snd, ascii_code)
    except UnicodeDecodeError:
        print('UnicodeDecodeError[ovv_snd]: %s %s' % (ovv_snd, cand))
        ascii_code = soundex.soundex(cand.decode("ascii", "ignore"))
        distance = Levenshtein.distance(ovv_snd, ascii_code)
    except TypeError:
        print('TypeError[ovv_snd]: %s %s' % (ovv_snd, cand))
        distance = 10.
    return distance
def test_allFeatures(self):
    """Check the complete Soundex code of several full surnames."""
    expectations = (
        ('T522', 'Tymczak'),
        ('A261', 'Ashcraft'),
        ('A261', 'Ashcroft'),
        ('P236', 'Pfister'),
        ('R150', 'Rubin'),
        ('R163', 'Robert'),
        ('R163', 'Rupert'),
        ('H555', 'Honeyman'),
    )
    for code, surname in expectations:
        self.assertEqual(code, soundex(surname))
def main(args):
    """Print every word of a file that sounds like a reference word.

    ``args[0]`` is the reference word and ``args[1]`` the path of a text file
    that is read line by line. Each line is reduced to its ASCII letters and
    printed when its Soundex code equals that of the reference word.

    When fewer than two arguments are given, "RTFM" is printed and the
    function returns without doing anything else.
    """
    if len(args) < 2:
        print("RTFM")
        # The original bare ``exit`` expression did nothing, so execution
        # fell through to ``args[1]`` and crashed; leave explicitly.
        return

    # The reference code does not change per word, so compute it once.
    wanted = soundex(args[0])

    # Context manager closes the file even if soundex() raises.
    with open(args[1], "r") as handle:
        for word in handle:
            cleanWord = "".join(
                letter for letter in word if letter in string.ascii_letters)
            if soundex(cleanWord) == wanted:
                print(cleanWord)
def gen_key(self, val):
    """Return the key for *val*.

    With ``self.use_soundex`` unset the value itself is the key. Otherwise
    the key is the Soundex code of *val*; if encoding raises
    UnicodeEncodeError, *val* is returned unchanged.
    """
    if not self.use_soundex:
        return val
    try:
        key = soundex.soundex(val)
    except UnicodeEncodeError:
        key = val
    return key
def main():
    """Print two parallel lists of similar-sounding surnames with their Soundex codes.

    Each output row holds a name from the first list, its code, the name at
    the same position in the second list, and that name's code.
    """
    names1 = [
        "Johnson", "Adams", "Davis", "Simons", "Richards", "Taylor",
        "Carter", "Stevenson", "Taylor", "Smith", "McDonald", "Harris",
        "Sim", "Williams", "Baker", "Wells", "Fraser", "Jones",
        "Wilks", "Hunt", "Sanders", "Parsons", "Robson", "Harker",
    ]
    names2 = [
        "Jonson", "Addams", "Davies", "Simmons", "Richardson", "Tailor",
        "Chater", "Stephenson", "Naylor", "Smythe", "MacDonald", "Harrys",
        "Sym", "Wilson", "Barker", "Wills", "Frazer", "Johns",
        "Wilkinson", "Hunter", "Saunders", "Pearson", "Robertson", "Parker",
    ]

    # Both lists are literals of the same length, so zip() visits every pair.
    for name1, name2 in zip(names1, names2):
        s1 = soundex.soundex(name1)
        s2 = soundex.soundex(name2)
        print("{:20s}{:4s} {:20s}{:4s}".format(name1, s1, name2, s2))
def __init__(self, dbstate, uistate, options_class, name, callback=None):
    """Build the SoundEx generator window and pre-fill it from the database.

    Initialises the tool and managed-window bases, wires the Glade signals,
    fills the name combo with the sorted distinct surnames found via
    ``self.db.iter_people()``, and shows the Soundex code of the surname of
    the last person iterated (or an empty name field when there is nobody).
    ``callback`` is accepted but not used here.
    """
    self.label = _('SoundEx code generator')
    tool.Tool.__init__(self, dbstate, options_class, name)
    ManagedWindow.ManagedWindow.__init__(self, uistate, [], self.__class__)

    self.glade = Glade()
    self.glade.connect_signals({
        "destroy_passed_object": self.close,
        "on_help_clicked": self.on_help_clicked,
        "on_delete_event": self.close,
    })

    window = self.glade.toplevel
    self.set_window(window, self.glade.get_object('title'), self.label)

    self.value = self.glade.get_object("value")
    self.autocomp = self.glade.get_object("name_list")
    self.name = self.autocomp.child

    self.name.connect('changed', self.on_apply_clicked)

    # A set gives constant-time de-duplication; the previous list membership
    # test made this loop quadratic in the number of distinct surnames.
    surnames = set()
    person = None
    for person in self.db.iter_people():
        surnames.add(person.get_primary_name().get_surname())
    names = sorted(surnames)

    AutoComp.fill_combo(self.autocomp, names)

    # ``person`` still refers to the last person iterated, if any.
    if person:
        n = person.get_primary_name().get_surname()
        self.name.set_text(n)
        try:
            se_text = soundex.soundex(n)
        except UnicodeEncodeError:
            # Fall back to the code of the empty string.
            se_text = soundex.soundex('')
        self.value.set_text(se_text)
    else:
        self.name.set_text("")

    self.show()
def lookup(name):
    """Return the records that match *name*.

    The module-level ``data`` mapping is loaded on first use. Records are
    first selected by the Soundex code of the lowercased name; among those,
    only records whose key, 'E-mail' or 'Username' field contains the
    lowercased name are kept. When none of them contains it, every record
    with that Soundex code is returned. The result is always a list.
    """
    if not data:
        load_data()

    name = name.lower()

    # Start with SOUNDEX match.
    idx = soundex.soundex(name)
    candidates = data.get(idx, {})

    # Narrow to best match. items() works on Python 2 and 3 alike, unlike
    # iteritems().
    matches = []
    for full_name, record in candidates.items():
        fields = (full_name, record.get('E-mail', ''), record.get('Username', ''))
        if any(name in field.lower() for field in fields):
            matches.append(record)

    # If no best match, return all (as a real list, not a dict view).
    if not matches:
        matches = list(candidates.values())

    return matches
def checksimilar(filename="/usr/share/dict/words"):
    """Count how many ASCII-only words of a word list share each Soundex code.

    The file *filename* is read in full and split on whitespace; words
    containing a character with a code point of 128 or above are ignored.
    The (up to) ten most frequent codes are printed, and the complete list of
    ``(code, count)`` pairs is returned, ordered by descending count.
    """
    from collections import Counter

    # Context manager so the file handle is always released.
    with open(filename) as wordfile:
        ascii_words = [
            w for w in wordfile.read().split()
            if all(ord(ch) < 128 for ch in w)
        ]

    counts = Counter(soundex(w) for w in ascii_words)
    A_sorted = counts.most_common()

    print("Top 10:")
    # Slicing instead of range(0, 10): short word lists with fewer than ten
    # distinct codes no longer raise IndexError.
    for code, count in A_sorted[:10]:
        print(code, "klingt:", count, "mal !")
    return A_sorted
def main(args):
    """Print the largest group of words in a file that share one Soundex code.

    ``args[0]`` is the path of a text file that is read line by line. Each
    line is reduced to its ASCII letters and grouped by Soundex code; the
    ``(code, words)`` pair with the most words is printed.

    When no argument is given, "RTFM" is printed and the function returns.
    Nothing is printed for a file without any lines.
    """
    if len(args) < 1:
        print("RTFM")
        # The original bare ``exit`` expression did nothing, so execution
        # fell through to ``args[0]`` and crashed; leave explicitly.
        return

    soundexDict = {}
    # Context manager closes the file even if soundex() raises.
    with open(args[0], "r") as handle:
        for word in handle:
            cleanWord = "".join(
                letter for letter in word if letter in string.ascii_letters)
            # Append in place rather than rebuilding the list for every word.
            soundexDict.setdefault(soundex(cleanWord), []).append(cleanWord)

    if not soundexDict:
        # Empty file: there is no group to report.
        return

    # max() returns the first largest group, the same element the previous
    # stable descending sort placed at index 0.
    wordList = max(soundexDict.items(), key=lambda item: len(item[1]))
    print(wordList)
def test_upperchars(self):
    "mixed-case input is encoded too"
    cases = (
        ("s53200", "Soundex"),
        ("s53200", "soUNDeggs"),
        ("f46140", "fLuRbEl"),
    )
    for expected, word in cases:
        self.assertEqual(expected, soundex(word))
# For every candidate pair, derive the string-similarity features of the two
# project names and URLs.
for (projectPair) in projectPairs:
    RFname = projectPair[0]
    RGname = projectPair[1]
    RFurl = projectPair[2]
    RGurl = projectPair[3]

    # lowercase everything
    RFnameLC = RFname.lower()
    RGnameLC = RGname.lower()
    RFurlLC = RFurl.lower()
    RGurlLC = RGurl.lower()

    # calculate string metrics
    levNames = edit_distance(RFnameLC, RGnameLC)
    levURLs = edit_distance(RFurlLC, RGurlLC)
    soundexRFname = soundex(RFnameLC)
    soundexRGname = soundex(RGnameLC)

    # is the RF project name inside the RG project name?
    rf_in_rg = 1 if RFnameLC in RGnameLC else 0

    # is the RF project name inside the RG project URL?
    # Compare against the lowercased URL: the name is already lowercased, so
    # testing it against the mixed-case RGurl missed URLs with capitals.
    rf_in_rgurl = 1 if RFnameLC in RGurlLC else 0

    # is any dev on the RF candidate in the dev list for the RG candidate?
def on_apply_clicked(self, obj):
    """Show the Soundex code of the text currently held by *obj*.

    The code is written to ``self.value``. If encoding raises
    UnicodeEncodeError, the code of the empty string is shown instead.
    """
    try:
        entered = unicode(obj.get_text())
        code = soundex.soundex(entered)
    except UnicodeEncodeError:
        code = soundex.soundex('')
    self.value.set_text(code)
def main(args):
    """Refresh the SQLite database of regions, departements and communes.

    Downloads (only when ``-download`` is among *args*) and scans the region,
    departement, commune and population data sets through the ``insee``
    module, then inserts or updates one row per entry in the ``regions``,
    ``departements`` and ``communes`` tables of the database file named by
    ``sqlDBFileName``. The tables are created when the file does not exist
    yet. Progress and timings are printed along the way.

    Every print uses the single-argument parenthesised form, which behaves
    the same under Python 2 and Python 3.
    """
    # By default, do not re-download the local files (if they exist).
    override = any(arg == "-download" for arg in args)

    print("--------------------------------------------------------------")
    print("%s %s" % (botName, botVersion))

    print("Analyse COG Régions")
    clk = time.time()
    regions = insee.insee_region(regionUrl, config.osm_temp_folder)
    regions.download(override)
    regions.scan()
    print("> %d region(s), t=%.2f" % (len(regions.data_list), time.time() - clk))

    print("Analyse COG Départements")
    clk = time.time()
    departements = insee.insee_departement(deptUrl, config.osm_temp_folder)
    departements.download(override)
    departements.scan()
    print("> %d département(s), t=%.2f" % (len(departements.data_list), time.time() - clk))

    print("Analyse COG Communes")
    clk = time.time()
    communes = insee.insee_commune(commUrl, config.osm_temp_folder)
    communes.download(override)
    communes.scan()
    print("> %d communes, t=%.2f" % (len(communes.data_list), time.time() - clk))

    print("Analyse Recensement %d (populations)" % insee_year)
    clk = time.time()
    populations = insee.insee_population(popUrl, config.osm_temp_folder)
    populations.download(override)
    if override:
        print("> Download %.2f" % (time.time() - clk))
    clk = time.time()
    populations.scan(regions, departements, communes)
    print("> Scan %.2f" % (time.time() - clk))

    clk = time.time()
    dbName = sqlDBFileName
    # Must be checked before connecting: connect() creates a missing file.
    is_new_database = not os.path.isfile(dbName)
    sql = sqlite3.connect(dbName)
    try:
        if is_new_database:
            sql.execute('''CREATE TABLE regions (id INTEGER PRIMARY KEY NOT NULL,name TEXT,sname TEXT,center TEXT,population INTEGER,year INTEGER);''')
            sql.execute('''CREATE TABLE departements (id VARCHAR(5) PRIMARY KEY NOT NULL,region INTEGER,name TEXT,sname TEXT,center TEXT,population INTEGER,year INTEGER);''')
            sql.execute('''CREATE TABLE communes (id VARCHAR(10) PRIMARY KEY NOT NULL,name TEXT,sname TEXT,departement VARCHAR(5),region INTEGER,population INTEGER,year INTEGER,osm_id INTEGER,osm_type VARCHAR(15),latitude FLOAT,longitude FLOAT);''')
            sql.commit()
            print("create new database")
        else:
            print("open existing database")

        c = sql.cursor()
        nc = 0  # rows inserted
        nu = 0  # rows updated

        # All statements below bind their values with "?" placeholders, so
        # quotes or other special characters in the data cannot break (or
        # alter) the SQL text.
        print("update regions data (%d)" % len(regions.data_list))
        for r in regions.data_list:
            sname = soundex.soundex(r.name)
            c.execute('''SELECT * FROM regions WHERE id=?;''', (r.region,))
            if c.fetchone() is None:
                t = (r.region, r.name, sname, r.cheflieu, r.population, insee_year)
                c.execute('''INSERT INTO regions (id,name,sname,center,population,year) VALUES (?,?,?,?,?,?);''', t)
                nc += 1
            else:
                t = (r.name, sname, r.cheflieu, r.population, insee_year, r.region)
                c.execute('''UPDATE regions SET name=?,sname=?,center=?,population=?,year=? WHERE id=?;''', t)
                nu += 1
        sql.commit()

        print("update departements data (%d)" % len(departements.data_list))
        for d in departements.data_list:
            sname = soundex.soundex(d.name)
            c.execute('''SELECT * FROM departements WHERE id=?;''', (d.dep,))
            if c.fetchone() is None:
                t = (d.dep, d.region, d.name, sname, d.cheflieu, d.population, insee_year)
                c.execute('''INSERT INTO departements (id,region,name,sname,center,population,year) VALUES (?,?,?,?,?,?,?);''', t)
                nc += 1
            else:
                t = (d.region, d.name, sname, d.cheflieu, d.population, insee_year, d.dep)
                c.execute('''UPDATE departements SET region=?,name=?,sname=?,center=?,population=?,year=? WHERE id=?;''', t)
                nu += 1
        sql.commit()

        print("update communes data (%d)" % len(communes.data_list))
        for cc in communes.data_list:
            sname = soundex.soundex(cc.name)
            c.execute('''SELECT * FROM communes WHERE id=?;''', (cc.insee,))
            if c.fetchone() is None:
                t = (cc.insee, cc.name, sname, cc.dep, cc.reg, cc.population, insee_year)
                try:
                    c.execute('''INSERT INTO communes (id,name,sname,departement,region,population,year) VALUES (?,?,?,?,?,?,?);''', t)
                except sqlite3.Error:
                    # Best effort: report the offending commune and carry on.
                    print("\terror with %s %s" % (cc.insee, cc.nccenr))
                    print(sys.exc_info())
                else:
                    # Count only inserts that actually succeeded.
                    nc += 1
            else:
                t = (cc.name, sname, cc.dep, cc.reg, cc.population, insee_year, cc.insee)
                c.execute('''UPDATE communes SET name=?,sname=?,departement=?,region=?,population=?,year=? WHERE id=?;''', t)
                nu += 1
        sql.commit()

        print("> Database update %.2f" % (time.time() - clk))
        print("database, %d ajout(s) et %d mise à jour" % (nc, nu))
        print("")

        c.close()
    finally:
        # Release the connection even when one of the steps above fails.
        sql.close()
    print("--------------------------------------------------------------")
def __add_record(name, d):
    """File record *d* under the Soundex code of *name*.

    The module-level ``data`` mapping holds one dictionary per Soundex code;
    inside it the record is stored under its 'Full Name' value.
    """
    idx = soundex.soundex(name)
    full_name = d['Full Name']
    data.setdefault(idx, {})[full_name] = d
def testKnownValues(self):
    """every name in knownValues must encode to its recorded result"""
    for word, expected in self.knownValues:
        actual = soundex.soundex(word)
        self.assertEqual(actual, expected)
def test_retainsCaseOfFirstChar(self):
    """The leading character of the code keeps the case of the input."""
    for expected, word in (('a000', 'a'), ('l000', 'l'), ('H000', 'H')):
        self.assertEqual(expected, soundex(word))
def test_removesVowels(self):
    """A word of only vowels encodes to its first letter padded with zeros."""
    code = soundex('Aaaa')
    self.assertEqual('A000', code)
def test_contractsConsonantsSeparatedByHorW(self):
    """Two equal consonants separated by an 'h' yield a single digit."""
    code = soundex('Alhl')
    self.assertEqual('A400', code)
def test_removesHandW(self):
    """'h' and 'w' after the first letter add no digits to the code."""
    code = soundex('Ahw')
    self.assertEqual('A000', code)
def test_contractConsecutiveConsonants(self):
    """A doubled consonant yields one digit, whatever the letter case."""
    for word in ('All', 'ALl'):
        self.assertEqual('A400', soundex(word))
def test_replacesConsonantsWithDigit(self):
    """A consonant after the first letter is replaced by its digit."""
    code = soundex('Al')
    self.assertEqual('A400', code)
def test_zeroPadsSingleCharWord(self):
    """A one-letter word is padded with zeros to four characters."""
    code = soundex('A')
    self.assertEqual('A000', code)
def test_singles(self):
    "one-letter words"
    for letter in ("a", "x", "o"):
        self.assertEqual(letter + "00000", soundex(letter))
def test_short(self):
    "long words made of ignored letters collapse to a padded code"
    filler = "AEIOUWYHaeiouwyh"
    self.assertEqual("s00000", soundex("s" + filler))
    self.assertEqual("x00000", soundex("x" + filler * 17))
    self.assertEqual("a00000", soundex("a" + filler * 42))
def test_examples(self):
    "sample words"
    samples = (
        ("s53200", "soundex"),
        ("s53200", "soundeggs"),
        ("f46140", "flurbel"),
    )
    for expected, word in samples:
        self.assertEqual(expected, soundex(word))
def test_doesNotReplaceConsonantWithDigitIfFirstLetter(self):
    """A leading consonant stays a letter instead of becoming a digit."""
    code = soundex('L')
    self.assertEqual('L000', code)
def test_alwaysRetunsCodeWithThreeDigits(self):
    """A long name is still encoded as one letter followed by three digits."""
    code = soundex('Ashcraft')
    self.assertEqual('A261', code)
def test_tooooolong(self):
    "inputs that would produce more than 6 characters are cut off"
    words = (
        "soundexdex",
        "soundexdexdex",
        "soundexdexdexdex",
        "soundexdexdexdexflurbel",
    )
    for word in words:
        self.assertEqual("s53232", soundex(word))
def test_retainsSoleCharOfWord(self):
    """The code of a one-letter word starts with that letter."""
    for letter in ('A', 'B'):
        code = soundex(letter)
        self.assertEqual(letter, first_char(code))
def test_handelsUpperCaseLikeLowerCase(self):
    """Upper-case letters after the first are encoded like lower-case ones."""
    code = soundex('aLH')
    self.assertEqual('a400', code)
def test_all(self):
    "every encoded letter, separated by runs of ignored letters"
    separator = "AEIOUWYHaeiouwyh"

    first_tail = separator.join("bfpvCGJKQSXZBFPVcgjkqsxzdt")
    self.assertEqual("s12123", soundex("s" + first_tail))

    second_tail = separator.join("lmnrmnrmnrmnrmnrmnr")
    self.assertEqual("x45656", soundex("x" + second_tail))