def extract_keyword_from_text(self, text):
    '''
    Tokenize webpage text and extract keywords.

    Input:  text (str): text to extract keywords from
    Output: keywords (list): skills from overall_dict matched either
            exactly (case-insensitive) or phonetically (double metaphone)
    '''
    # Keep letters plus '+' and '3' -- presumably for tokens like "c++";
    # TODO(review): confirm the intent of the '+3' character class.
    text = re.sub("[^a-zA-Z+3]", " ", text)
    text = text.lower().split()
    cleared_text = []
    for word in text:
        # BUG FIX: the original called str.replace() and discarded the
        # result (strings are immutable), so punctuation was never
        # actually stripped; it also re-appended every word once per
        # punctuation character.
        for punctuation in string.punctuation:
            word = word.replace(punctuation, ' ')
        cleared_text += word.split()
    # Drop English stopwords and de-duplicate.
    stops = set(stopwords.words("english"))
    text = [w for w in set(cleared_text) if w not in stops]
    # BUG FIX: the original built a keywords list here and immediately
    # overwrote it with [] -- that dead comprehension is removed.
    keywords = []
    for word in text:
        for skill in overall_dict:
            if word.lower() == skill.lower() or metaphone.doublemetaphone(
                    word) == metaphone.doublemetaphone(skill):
                keywords.append(skill)
    return keywords
def getEpisodesMetadata(self, itemCount, offset, quant, search):
    """Fetch metadata for up to `quant` episodes of `search`, starting at
    `offset`, by searching for zero-padded episode numbers and keeping
    results whose title matches phonetically.

    Returns a list of {'title', 'image', 'link'} dicts.
    """
    searchQuery = search.replace("-", " ")
    meta = doublemetaphone(searchQuery)
    quantDigits = len(str(itemCount))
    # The last character of the primary code is stripped from every code
    # before comparing -- presumably to loosen the match; TODO confirm.
    replacer = meta[0][-1:]
    metaOne = meta[0].replace(replacer, "")
    listLinks = []
    for i in range(quant):
        target = i + offset
        if target < itemCount:
            # BUG FIX: the original hand-rolled the zero padding and never
            # padded counts >= 100, so with itemCount=1000 episode 100 was
            # queried as '100' instead of '0100'. str.zfill handles all
            # widths uniformly.
            count = str(target + 1).zfill(quantDigits)
            results = anime.searchAnimesMetadata(searchQuery + ' ' + count,
                                                 quant=3)
            for ep in results:
                if ep != '':
                    returned = doublemetaphone(ep.title.replace("-", ""))
                    # (Secondary codes were computed but never used.)
                    returnedOne = returned[0].replace(replacer, "")
                    if returnedOne == metaOne:
                        listLinks.append({'title': ep.title,
                                          'image': ep.image,
                                          'link': ep.link})
                        break
    return listLinks
def getEpisodesMetadata(self, itemCount, offset, quant, search):
    """Fetch metadata for up to `quant` episodes of `search`, starting at
    `offset`; keeps results whose title matches the query phonetically.

    Returns a list of {'title', 'image', 'link'} dicts.
    """
    searchQuery = search.replace("-", " ")
    meta = doublemetaphone(searchQuery)
    quantDigits = len(str(itemCount))
    # Strip the primary code's last character from all codes before
    # comparing -- presumably loosens the match; TODO confirm intent.
    replacer = meta[0][-1:]
    metaOne = meta[0].replace(replacer, "")
    listLinks = []
    for i in range(quant):
        target = i + offset
        if target < itemCount:
            # BUG FIX: the original zero-padding logic only handled
            # counts < 100, so e.g. itemCount=1000 yielded '100' instead
            # of '0100'; str.zfill pads correctly for any width.
            count = str(target + 1).zfill(quantDigits)
            results = anime.searchAnimesMetadata(searchQuery + ' ' + count,
                                                 quant=3)
            for ep in results:
                if ep != '':
                    returned = doublemetaphone(ep.title.replace("-", ""))
                    # (Secondary codes were computed but never used.)
                    returnedOne = returned[0].replace(replacer, "")
                    if returnedOne == metaOne:
                        listLinks.append({
                            'title': ep.title,
                            'image': ep.image,
                            'link': ep.link
                        })
                        break
    return listLinks
def adapted_doublemetaphone(data, language="en"):
    '''
    Adapted doublemetaphone handling topics the original ignores; accepts
    either a single string or a list of strings.
    '''
    using_string = isinstance(data, str)
    list_data = [data] if using_string else data
    # Work over a list and unwrap at the end when a bare string came in.
    result = []
    for data2met in list_data:
        if language == "es":
            # Silent 'h' (except in "ch") and 'ph' -> 'f' before encoding.
            if not re.match(r"[Cc]h", data2met):
                data2met = re.sub(r"h", "", data2met.lower().replace("ph", "f"))
            # In Spanish, b and v are pronounced the same; normalise a few
            # digraphs as well and append that variant's codes too.
            normalised = (data2met.lower()
                          .replace("v", "b")
                          .replace("gi", "ji")
                          .replace("ge", "je")
                          .replace("ph", "f"))
            result.append(doublemetaphone(normalised))
        result.append(doublemetaphone(data2met))
    solution = result[0] if using_string else result
    # A completely empty second code pair has been observed; trim it.
    if len(solution) == 4 and solution[-2:] == [('', ''), ('', '')]:
        solution = solution[:2]
    return solution
def check_curse(msg):
    """Return True (and bump boto['curse_count']) when msg phonetically
    contains one of the trigger words; False otherwise."""
    curse_triggers = ['bitch', 'nigger', 'f**k', 'shit', 'c**k']
    msg_code = doublemetaphone(msg)[0]
    for curse in curse_triggers:
        if doublemetaphone(curse)[0] in msg_code:
            boto['curse_count'] += 1
            return True
    return False
def __init__(self, ID, name, altNames, latitude, longitutde, country):
    """Build a city record with preprocessed names and their phonetic codes."""
    # Unique, hashable identifier for this city.
    self.ID = ID
    # Preprocessed UTF-8 name (preprocess upper-cases it).
    self.name = self.preprocess(name)
    # Non-empty alternate names, preprocessed the same way.
    self.altNames = [self.preprocess(alt) for alt in altNames if len(alt) > 0]
    # Keep the name exactly as written in the file so we can still show
    # the real city name after preprocessing strips punctuation etc.
    self.origName = name
    self.latitude = latitude
    self.longitude = longitutde
    # ISO-3166 2-letter country code
    self.country = country
    # Map of original (unpreprocessed) name -> double-metaphone tuple.
    # NOTE(review): alt names are hashed in raw form here, unlike
    # self.altNames above -- confirm that asymmetry is intended.
    self.phonetics = {self.origName: doublemetaphone(self.origName)}
    for alt in altNames:
        self.phonetics[alt] = doublemetaphone(alt)
def _addplayer(opteid, optrid, optplayer): """<eid> <rid> <player name> Adds a new player into the database. Needs a unique EID, RID, and playername (sanitized and parsed). DM will be calculated upon insertion. Ex: 2330 <RID> tom brady """ # everything looks good so lets prep to add. # 2330|1163|tom brady|tom|brady|TM||PRT| optplayer = _sanitizeName(optplayer) # sanitize. namesplit = optplayer.split( ) # now we have to split the optplayer into first, last. (name needs to be parsed before) fndm = doublemetaphone(namesplit[0]) # dm first. lndm = doublemetaphone(namesplit[1]) # dm last. # connect to the db and finally add. with sqlite3.connect(DB) as db: try: cursor = db.cursor() cursor.execute( "INSERT INTO players VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (opteid, optrid, optplayer, namesplit[0], namesplit[1], fndm[0], fndm[1], lndm[0], lndm[1])) db.commit() #return("I have successfully added player {0}({1}).".format(optplayer, opteid)) return True except sqlite3.Error, e: print("ERROR: I cannot add {0}. Error: '{1}'".format(optplayer, e)) return None
def _rehashdm(eid):
    """<eid>

    Recalculate the doublemetaphone for a player (eid)
    Ex: 2330

    Returns True on success, None on a sqlite error.
    NOTE(review): Python 2 syntax (`except sqlite3.Error, e`); assumes the
    EID exists -- `row` is not checked before indexing.
    """
    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        cursor.execute("SELECT firstname, lastname FROM players WHERE eid=?",
                       (eid, ))
        row = cursor.fetchone()
        # calculate the dm on first, last
        fndm = doublemetaphone(row[0])
        lndm = doublemetaphone(row[1])
        # firstname and lastname are tuples.
    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        try:
            cursor.execute(
                "UPDATE players SET fndm1=?, fndm2=?, lndm1=?, lndm2=? WHERE eid=?",
                (
                    fndm[0],
                    fndm[1],
                    lndm[0],
                    lndm[1],
                    eid,
                ))
            db.commit()
            #return("I have successfully updated EID {0}'s doublemetaphone ({1}, {2})".format(eid, fndm, lndm))
            return True
        except sqlite3.Error, e:
            print(
                "ERROR: _rehashdm: I cannot update EID {0}'s doublemetaphone: '{1}'"
                .format(eid, e))
            return None
def adapted_doublemetaphone(data, language="en"):
    '''
    Adapted doublemetaphone covering cases the original misses.  Accepts a
    single string or a list of strings and returns codes in the same shape.
    '''
    if isinstance(data, str):
        items, single = [data], True
    else:
        items, single = data, False
    codes = []
    for text in items:
        if language == "es":
            # Drop silent 'h' (but not in "ch") and fold 'ph' -> 'f'.
            if not re.match(r"[Cc]h", text):
                text = re.sub(r"h", "", text.lower().replace("ph", "f"))
            # b/v sound identical in Spanish; also normalise gi/ge digraphs.
            spanish = (text.lower()
                       .replace("v", "b")
                       .replace("gi", "ji")
                       .replace("ge", "je")
                       .replace("ph", "f"))
            codes.append(doublemetaphone(spanish))
        codes.append(doublemetaphone(text))
    return codes[0] if single else codes
def check_is_metaphone(str_1: str, str_2: str) -> int:
    """Returns integer depending on if the two words are homophones.

    Returns 100 if the words share a non-empty double-metaphone code,
    otherwise 0.

    Args:
        str_1 (str): First text to compare.
        str_2 (str): Second text to compare.

    Returns:
        int: 100 if str_1 and str_2 are homophones. Else 0.

    Notes:
        - The doublemetaphone algorithm will return 100 for words with
          incorrect spacing, e.g. California == Cali fornia.
    """
    codes_1 = doublemetaphone(str_1)
    codes_2 = doublemetaphone(str_2)
    return 100 if any(code and code in codes_2 for code in codes_1) else 0
def __create_phonetics__(self, word, misspelling_list):
    """Filter misspelling_list down to entries whose primary double-metaphone
    code matches that of `word`.

    Returns (filtered_list, last_candidate_code).
    NOTE(review): the second return value is the code of the *last*
    candidate inspected (matching the original behaviour), not of the
    last match -- confirm callers expect this.
    """
    word_phon = doublemetaphone(word)[0]
    new_error_list = []
    # BUG FIX: error_phon was only bound inside the loop, so an empty
    # misspelling_list raised NameError at the return statement.
    error_phon = ''
    for error in misspelling_list:
        error_phon = doublemetaphone(error)[0]
        if (len(error_phon) != 0 and (error_phon == word_phon)):
            new_error_list.append(error)
    return new_error_list, error_phon
def homo_check(pair):
    """Classify the relation between the two words in pair[1]:
    "homophone", "inflection", "number", "quasi-homophone" or "None"."""
    word1, word2 = pair[1][0], pair[1][1]
    if doublemetaphone(word1) == doublemetaphone(word2):
        return "homophone"
    if stemlemma_check(word1, word2):
        return "inflection"
    if simi_sound(word1, word2):
        # Digit-bearing near-homophones are treated as number confusions.
        contains_digit = (any(ch.isdigit() for ch in word1)
                          or any(ch.isdigit() for ch in word2))
        return "number" if contains_digit else "quasi-homophone"
    return "None"
def is_mispelling(token_left, token_right):
    """True when the tokens share a non-empty metaphone code or their
    similarity ratio is at least 90."""
    shared = set(doublemetaphone(token_left)) & set(doublemetaphone(token_right))
    if shared - {''}:
        return True
    return ratio(token_left, token_right) >= 90
def compare_rhyme(word1, word2):
    """Return True when word1 and word2 plausibly rhyme: any pair of their
    double-metaphone codes is equal or shares a trailing half."""
    w1 = list(doublemetaphone(word1))
    w2 = list(doublemetaphone(word2))
    for i in w1:
        # BUG FIX: secondary metaphone codes are frequently '', and
        # '' == '' made *every* word pair compare as rhyming; skip empties.
        if not i:
            continue
        for j in w2:
            if not j:
                continue
            if i == j:
                return True
            # Rhyme heuristic: identical trailing halves.
            # NOTE(review): j is sliced with len(i)//2 as in the original
            # -- confirm whether len(j)//2 was intended.
            if i[len(i) // 2:] == j[len(i) // 2:]:
                return True
    return False
def __hashing_names__(self):
    """Replace the First/Last Name columns with their primary
    double-metaphone code ('' when the cell is missing)."""
    for idx, row in self.df.iterrows():
        for col in ('First Name', 'Last Name'):
            value = row[col]
            self.df.at[idx, col] = '' if pd.isna(value) else doublemetaphone(value)[0]
def phonetic_score(query, cand):
    """Score how phonetically close `cand` is to `query`.

    1000000: primary codes match; 1000: primary/secondary cross-match;
    10: secondary codes match; 1: no match.
    """
    phn_query = doublemetaphone(query)
    phn_cand = doublemetaphone(cand)
    # BUG FIX: removed the dead `score = 1` assignment and the unreachable
    # trailing `return score` -- every branch already returns.
    if phn_query[0] == phn_cand[0]:
        return 1000000
    if phn_query[1] == phn_cand[0] or phn_query[0] == phn_cand[1]:
        return 1000
    if phn_query[1] == phn_cand[1]:
        return 10
    return 1
def dlbMetaphone():
    """Demo: print the double-metaphone codes for a few sample strings,
    each suffixed with its index."""
    print('Running Double Metaphone:...')
    strings = [
        'Ball Bearing', 'bll brng', 'Centrifugal', 'centrifigal', 'PUmp', 'pmp'
    ]
    # BUG FIX: the original set i = 0 and never incremented it, so every
    # string was suffixed with '0' instead of its position.
    for i, item in enumerate(strings):
        rStr = item + str(i)
        # Compute once instead of twice per item.
        result = doublemetaphone(rStr)
        print(item, '->', result)
        print('Double Metaphone [0]: ', result[0])
        print('Double Metaphone [1]: ', result[1])
def __hashing_names__(self):
    # Creates a metaphone hash for First Name and Last Name columns
    # (soundex via jellyfish was considered as an alternative).
    for i, row in self.df.iterrows():
        first = row['First Name']
        self.df.at[i, 'First Name'] = doublemetaphone(first)[0] if not pd.isna(first) else ''
        last = row['Last Name']
        self.df.at[i, 'Last Name'] = doublemetaphone(last)[0] if not pd.isna(last) else ''
def get_double_metaphone(df, var):
    """Return a comma-separated string of double-metaphone codes for each
    whitespace-separated token of df[var]; '' when the value is unusable
    (e.g. NaN)."""
    placeholder = []
    try:
        name_entity = [x.strip() for x in df[var].split(' ')]
        for token in name_entity:
            # BUG FIX: the original called doublemetaphone() up to three
            # times per token; compute the pair once.
            primary, secondary = doublemetaphone(token)
            placeholder.append(primary)
            if len(secondary) > 0:
                placeholder.append(secondary)
        return ', '.join(map(str, placeholder))
    except Exception:
        # Best-effort fallback for non-string values.
        # NOTE(review): narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed; the split() is now
        # also guarded so NaN cells return '' instead of raising.
        return ''
def process_definition(message, term):
    """Look up `term` in the definitions collection (exact index, then
    keywords, then metaphone codes) and reply with the result."""
    term_db = mongo_client.hamfurs.definitions
    term = term.lower()
    definition = None
    # Try progressively fuzzier lookups until one hits.
    # NOTE(review): doublemetaphone() returns a tuple -- assumes the stored
    # "metaphone" field has the same shape; confirm against the schema.
    for query in ({"index": term},
                  {"keywords": term},
                  {"metaphone": doublemetaphone(term)}):
        definition = term_db.find_one(query)
        if definition is not None:
            break
    if definition is None:
        send_editable_message(
            message,
            "No definition for the given term found.\n(use /add\_definition to contribute one)",
        )
        return
    txt = "*{term}*: {definition}\n(Contributed by {contributor} _{last_edit}_)".format(
        **definition
    )
    send_editable_message(
        message, txt, parse_mode="Markdown", disable_web_page_preview=True
    )
    return
def fuzz(text):
    """
    Apply Porter stemming then the double-metaphone algorithm to every word
    of `text`, returning the flattened list of non-empty phonetic tokens
    (normalized and misspelling tolerant).
    """
    tokens = []
    for word in text.split():
        for code in doublemetaphone(stemmer.stem(word)):
            if code:
                tokens.append(code)
    return tokens
def __init__(self):
    """Load the word-frequency dictionary and build the n-gram and
    double-metaphone inverted indexes (all stored on the class)."""
    SpellChecker.dictCountMap = self.readDitionary('data/count_1w100k.txt')
    # Accumulate the corpus total onto the class attribute, as before.
    for count in SpellChecker.dictCountMap.values():
        SpellChecker.totalCount += count
    for word in SpellChecker.dictCountMap:
        # gram -> list of words containing that gram.
        for tgram in self.getGrams(word, SpellChecker.invertMapGram):
            SpellChecker.invertTriMap.setdefault(tgram, []).append(word)
        # primary metaphone code -> list of words sharing that sound.
        # (soundex / single metaphone via jellyfish were earlier options.)
        metaHash = doublemetaphone(word)[0]
        SpellChecker.invertMetaMap.setdefault(metaHash, []).append(word)
def _rehashdm(eid): """.""" with sqlite3.connect(DB) as db: cursor = db.cursor() cursor.execute("SELECT firstname, lastname FROM players WHERE eid=?", (eid,)) row = cursor.fetchone() if not row: print "I did not find any player in the db with EID '{0}'".format(eid) return None else: firstname = doublemetaphone(row[0]) lastname = doublemetaphone(row[1]) print "DM :: FIRSTNAME {0} LASTNAME {1}".format(firstname, lastname) return dm
def metaphoneToken(field):
    """Set of the non-empty double-metaphone codes of every unique word
    in `field`."""
    codes = set()
    for token in set(field.split()):
        codes.update(code for code in doublemetaphone(token) if code)
    return codes
async def search_word(word):
    """Find the stored word phonetically matching `word` that is closest by
    Jaro-Winkler distance; returns its document, or None on no match/error."""
    logger = logging.getLogger(__name__)
    metaname = doublemetaphone(word)
    # Stored search key is the concatenated primary+secondary codes.
    queryname = metaname[0] + metaname[1]
    logger.debug('SEARCH WORD')
    logger.debug([word, queryname])
    try:
        collection = mongo_connect_words()
        query = {"_s_word": {"$eq": queryname}}
        logger.debug(query)
        distances = []
        async for document in collection.find(query):
            distance = lev.jaro_winkler(word, document.get('word'))
            distances.append({
                "word": document.get('word'),
                "distance": distance,
                "data": document
            })
        if not distances:
            return None
        # Highest similarity first.
        distances.sort(key=lambda x: x.get('distance'), reverse=True)
        logger.debug('SEARCH DIST LIST')
        logger.debug(distances)
        return distances[0].get('data')
    except Exception:
        # BUG FIX: narrowed from a bare `except:` (which also swallowed
        # SystemExit and task cancellation) and log the full traceback.
        logger.exception('SEARCH WORD ERR')
        return None
def build_similar_phonetic_words_db(lang, wordslist_path):
    """Populate PhoneticWord and LangWord rows for every word in the given
    word-list file (one word per line, lower-cased)."""
    with open(wordslist_path, 'rb') as wordslist_file:
        wordslist = [line.strip().lower()
                     for line in wordslist_file.read().splitlines()]
    for word in tqdm(wordslist):
        primary, secondary = doublemetaphone(word)
        phonetic_word_primary = (
            get_or_create(db.session, PhoneticWord, phonetic=primary)
            if primary else None)
        phonetic_word_secondary = (
            get_or_create(db.session, PhoneticWord, phonetic=secondary)
            if secondary else None)
        # Only words with at least one phonetic code are stored.
        if phonetic_word_primary is not None or phonetic_word_secondary is not None:
            get_or_create(db.session, LangWord,
                          lang=lang,
                          word=word,
                          phonetic_word_primary=phonetic_word_primary,
                          phonetic_word_secondary=phonetic_word_secondary)
            db.session.commit()
def __init__(self, actorType, name, locationID=None, articleIDs=None,
             connections=None, id=None, db=None, **kwds):
    '''
    Initializes an Actor object.

    BUG FIX: articleIDs/connections previously defaulted to a shared
    mutable [] / {} -- every Actor created without them mutated the same
    objects.  None sentinels preserve the old call interface.
    '''
    self.actorType = actorType
    self.name = name
    # Concatenated primary+secondary metaphone, used for phonetic lookup.
    metaphone_name = doublemetaphone(name)
    self._a_name = metaphone_name[0] + metaphone_name[1]
    self.locationID = locationID
    self.articleIDs = articleIDs if articleIDs is not None else []
    # Dictionary to ActorConnections
    self.connections = connections if connections is not None else {}
    if db:
        self._db = db
        self._collection = db[Actor._collectionKey]
        if not id:
            # No persisted id yet: insert ourselves.
            self.storeDB(db)
        else:
            self._mongoID = id
            self._id = str(id)
def get_dmetaphone_tokens(token):
    """Non-empty double-metaphone codes for `token`; an empty list for
    tokens that are too short or contain digits."""
    if len(token) <= 2 or any(ch.isdigit() for ch in token):
        return []
    return [code for code in doublemetaphone(token) if code != ""]
def result():
    """Flask view: compare two submitted strings phonetically (metaphone)
    and literally (fuzz ratio) and render the better score."""
    if request.method == 'POST':
        x_1 = request.form['text1'].lower()
        x_2 = request.form['text2'].lower()
        meta_1 = doublemetaphone(x_1)[0]
        meta_2 = doublemetaphone(x_2)[0]
        r_1 = fuzz.ratio(meta_1, meta_2)
        r_2 = fuzz.ratio(x_1, x_2)
        if r_1 > 90:
            return render_template("result.html", value=r_1)
        elif r_2 > 85:
            return render_template("result.html", value=r_2)
        # BUG FIX: the original had a syntax error here (`value=`) and
        # compared the metaphone strings with `is` (identity) instead of
        # `==`.  Identical primary codes are reported as a full match.
        elif meta_1 == meta_2:
            return render_template("result.html", value=100)
        else:
            return render_template("result.html", value=0)
def insertName(name: str = "Mr. Sámuel Falledo"):
    """Normalize a person's name and return its double-metaphone tuple,
    e.g. 'Kevin Meyers' -> ('KFNMRS', ...).

    Steps: lowercase, strip accents, drop honorific-like prefixes and
    trailing suffixes, sort the name parts alphabetically, then encode.
    """
    # Lowercase, then transliterate accented letters to plain ASCII.
    name = unidecode(name.lower())
    # Drop "Xyz. " prefixes and trailing ", ..." suffixes.  (This removes
    # anything before a period, not strictly honorifics.)
    name = re.sub(r"\w+\. *(?=\w+)|,[\s\w]*$", "", name, 0)
    # TODO: expand common English given-name abbreviations, see
    # https://en.wiktionary.org/wiki/Appendix:Abbreviations_for_English_given_names
    # Alphabetical part order makes "John Smith" and "Smith John" equal.
    name = ' '.join(sorted(name.split()))
    # Double metaphone conversion.
    return doublemetaphone(name)
def phonetic_rhyme(word: str, all_phonetics, thresh=10, alliteration=False): """This function returns a list of the closest phonetic matches from a given word based on the "phonetic_dist function""" # get phonetic and metaphone of word to be compared info = helper.get_by_id(word, word_relation_table) word_info = [info['id'], info['phonetic'], met.doublemetaphone(info['id'])[0]] print(word_info[1]) matches = [] # Compare distance between input word and all other viable words for i in range(len(all_phonetics)): current_word = all_phonetics[i] # only compares words that differ and long enough words if word_info[0] != current_word[0] and len(current_word[0]) > 3: phon_dist = phonetic_dist(word_info[1], current_word[1], alliteration) # while matches is not full, populate list if len(matches) < thresh: matches.append({"word": current_word[0], "d": phon_dist, "Phon": current_word[1]}) else: if matches[thresh - 1]["d"] > phon_dist: matches[thresh - 1] = {"word": current_word[0], "d": phon_dist, "Phon": current_word[1]} matches = sorted(matches, key=lambda k: k['d']) return matches
def build_roman_dmeta(corpus):
    """To build and dump double metaphone dictionary to Redis DB.

    Groups the corpus words by their primary double-metaphone code and
    stores the mapping in the Redis hash 'dmeta' (code -> comma-joined
    words, first-seen order, no duplicates).
    NOTE(review): Python 2 (`dict.iteritems`); `hmset` is deprecated in
    modern redis clients.
    """
    from metaphone import doublemetaphone
    # import redis
    dmeta = {}
    for word in corpus:
        word = word.lower()
        if word:
            code = doublemetaphone(word)
            # Bucket by the primary code only; keep each word once.
            if code[0] not in dmeta:
                dmeta[code[0]] = [word]
            else:
                if word not in dmeta[code[0]]:
                    dmeta[code[0]].append(word)
    # r = redis.Redis(host='localhost', port=6379, db=1)
    r = load_redis(1)
    for dummy in dmeta.iteritems():
        # Store each bucket as a field of the 'dmeta' hash.
        temp2 = {dummy[0]: ",".join(dummy[1])}
        r.hmset('dmeta', temp2)
    return
def fingerprint(name):
    """
    A fingerprinting function that mimics some of the disambiguation
    features of OpenRefine; now used primarily to compare possible author
    names.  Returns a (primary, secondary) pair of phonetic codes.
    """
    testname = name
    lowered = testname.lower()
    # Any reference to Jesus Christ collapses to one fixed key.
    if (re.search(r"\bjesu", lowered) is not None
            or re.search(r"\bchrist\b", lowered) is not None):
        return ("christ", "")
    testname = re.sub(r"\band\b|\bor\b", "", testname)
    # Expand known abbreviations (matched with an optional trailing dot).
    for k, v in abbrev.items():
        patt = f"\\b{k.lower()}\.?\\b"
        testname = re.sub(patt, v.lower(), testname.lower())
    # Strip date markers, stopwords, punctuation, possessives and digits.
    testname = re.sub(r"\bfl\b|\bof\b|\bd\.|\bb\.", "", testname)
    testname = re.sub(r"[\.\?,!;:\(\)\-\[\]]|'s", "", testname)
    testname = re.sub(r"\d+", "", testname)
    # Word order should not matter: sort and fuse multi-word names.
    if " " in testname:
        testname = "".join(sorted(testname.split()))
    dm = doublemetaphone(testname)
    # When no secondary code exists, synthesise one that preserves a
    # distinguishing trailing vowel or leading 'b'.
    if dm[1] == '' and re.search(r"[aiouy]$", testname.lower()) is not None:
        return (dm[0], f"{dm[0]}{testname[-1].upper()}")
    if dm[1] == '' and re.search(r"^b", testname.lower()) is not None:
        return (dm[0], f"{testname[0].upper()}{dm[0][1:]}")
    return dm
def match(data1, data2, fields1, fields2):
    """Phonetically match records of data1 against data2.

    Two records match when, for every (field1, field2) pair, at least one
    of their double-metaphone codes agrees.  Returns (key1, key2) pairs.
    """
    data1phonetic = {key: {field: metaphone.doublemetaphone(data1[key][field])
                           for field in data1[key]} for key in data1}
    data2phonetic = {key: {field: metaphone.doublemetaphone(data2[key][field])
                           for field in data2[key]} for key in data2}
    matches = []
    for data1key, data1values in data1phonetic.items():
        for data2key, data2values in data2phonetic.items():
            match = True
            for field1, field2 in zip(fields1, fields2):
                possibilities = [
                    data1values.get(field1)[0] == data2values.get(field2)[0],
                    # BUG FIX: secondary metaphone codes are frequently '',
                    # so any comparison involving a secondary code must
                    # exclude the empty string; the original only guarded
                    # the secondary-vs-secondary case.
                    data1values.get(field1)[0] == data2values.get(field2)[1] != '',
                    data1values.get(field1)[1] == data2values.get(field2)[0] != '',
                    data1values.get(field1)[1] == data2values.get(field2)[1] != ''
                ]
                if True not in possibilities:
                    match = False
            if match:
                matches.append((data1key, data2key))
    return matches
def _addplayer(opteid, optrid, optplayer): """<eid> <rid> <player name> adds a new player into the database.""" # everything looks good so lets prep to add. # 2330|1163|tom brady|tom|brady|TM||PRT| optplayer = _sanitizeName(optplayer) # sanitize. namesplit = optplayer.split() # now we have to split the optplayer into first, last. fndm = doublemetaphone(namesplit[0]) # dm first. lndm = doublemetaphone(namesplit[1]) # dm last. # connect to the db and finally add. with sqlite3.connect(DB) as db: try: cursor = db.cursor() cursor.execute("INSERT INTO players VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (opteid, optrid, optplayer, namesplit[0], namesplit[1], fndm[0], fndm[1], lndm[0], lndm[1])) db.commit() return("I have successfully added player {0}({1}).".format(optplayer, opteid)) except sqlite3.Error, e: return("ERROR: I cannot add {0}. Error: '{1}'".format(optplayer, e))
def getMetaphones(wordList):
    """Return the sorted, de-duplicated, non-empty double-metaphone codes
    for the given words.  A whitespace-separated string is split first;
    each word is Unicode-normalized before hashing.

    NOTE(review): Python 2 (`basestring`).
    """
    metaphones = []
    if isinstance(wordList, basestring):
        wordList = wordList.split()
    for word in wordList:
        metaphones.extend(list(metaphone.doublemetaphone(
            normalizeUnicode(word))))
    # Drop empty codes, de-duplicate, and sort for a stable result.
    return sorted([x for x in set(metaphones) if x])
def create_metaphone_dict(fp):
    """Build {word: doublemetaphone(purified_word)} for every whitespace-
    separated word of file object fp (lower-cased; first occurrence wins).

    NOTE(review): Python 2 (`itertools.imap`).
    """
    words = (word for line in fp for word in line.split())
    words = imap(lambda w: w.lower(), words)
    metaphoneDict = {}
    for word in words:
        # Hash the deep-purified form, but key by the raw lower-cased word.
        word2 = purifyWord_deep(word)
        if word not in metaphoneDict:
            metaphoneDict[word] = doublemetaphone(word2)
    return metaphoneDict
def create_soundalikes_wordlist(self):
    """Group the words of self.input_file by primary double-metaphone code
    and export the resulting sound-alike word list.

    NOTE(review): `string.rstrip(line)` is the Python 2 string-module
    helper, removed in Python 3 (equivalent to line.rstrip()).
    """
    words = {}
    wordlist = open(self.input_file, 'r')
    for line in wordlist:
        word = string.rstrip(line)
        phonemes = doublemetaphone(word)[0]
        # All words sharing a code share one info record.
        word_info = words.setdefault(phonemes, self.__default_word_info())
        word_info['graphemes'].append(word)
        self.__mark_inappropriate(word_info, word)
    wordlist.close()
    self.__export_wordlist(words)
def reduce_word(cls, s):
    """Reduce `s` to a unicode double-metaphone code ('' when none exists)."""
    import metaphone as fuzzy
    # The older fuzzy.DMetaphone failed on unicode strings, see
    # https://bitbucket.org/yougov/fuzzy/issue/2/fuzzy-support-for-unicode-strings-with
    # -- the pure-python metaphone package handles str directly.
    primary, secondary = fuzzy.doublemetaphone(s)
    dms = primary or secondary
    if dms is None:
        return ''
    if isinstance(dms, six.binary_type):
        dms = dms.decode('utf8')
    return dms
def metaphone(a):
    """Determine the (double) metaphone of `a`,
    e.g. "architect" -> ("ARKTKT", ""), "bajador" -> ("PJTR", "PHTR")."""
    return doublemetaphone(a)
def _addplayer(opteid, optrid, optplayer): """<eid> <rid> <player name> Adds a new player into the database. Needs a unique EID, RID, and playername (sanitized and parsed). DM will be calculated upon insertion. Ex: 2330 <RID> tom brady """ # everything looks good so lets prep to add. # 2330|1163|tom brady|tom|brady|TM||PRT| optplayer = _sanitizeName(optplayer) # sanitize. namesplit = optplayer.split() # now we have to split the optplayer into first, last. (name needs to be parsed before) fndm = doublemetaphone(namesplit[0]) # dm first. lndm = doublemetaphone(namesplit[1]) # dm last. # connect to the db and finally add. with sqlite3.connect(DB) as db: try: cursor = db.cursor() cursor.execute("INSERT INTO players VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (opteid, optrid, optplayer, namesplit[0], namesplit[1], fndm[0], fndm[1], lndm[0], lndm[1])) db.commit() #return("I have successfully added player {0}({1}).".format(optplayer, opteid)) return True except sqlite3.Error, e: print("ERROR: I cannot add {0}. Error: '{1}'".format(optplayer, e)) return None
def get_all_similar_phonetic_words(dst_lang, src_word):
    """Words of dst_lang phonetically similar to src_word: the primary
    metaphone code is tried first, falling back to the secondary code."""
    src_word = preprocess_es(src_word)
    primary, secondary = doublemetaphone(src_word)
    matches = []
    if primary:
        matches = get_similar_phonetic_words(dst_lang, primary)
    if not matches and secondary:
        matches = get_similar_phonetic_words(dst_lang, secondary)
    return [lang_word.word for lang_word in matches]
def tokenFeatures(token) :
    """Build the feature dict for one name token (CRF-style features:
    cleaned forms, punctuation flags, phonetic codes, prefixes/suffixes
    and character n-grams)."""
    if token in (u'&') :
        # '&' passes through untouched.
        token_clean = token_abbrev = token
    else :
        # Strip leading punctuation and trailing non-word/non-period chars.
        token_clean = re.sub(r'(^[\W]*)|([^.\w]*$)', u'', token.lower())
        # Fully stripped alphanumeric form.
        token_abbrev = re.sub(r'\W', u'', token_clean)
    metaphone = doublemetaphone(token_abbrev)
    features = {'nopunc' : token_abbrev,
                'abbrev' : token_clean.endswith('.'),
                'comma'  : token.endswith(','),
                'hyphenated' : '-' in token_clean,
                'contracted' : "'" in token_clean,
                # Bracketed on exactly one side...
                'bracketed' : bool(re.match(r'(["(\']\w+)|(\w+[")\'])', token) and
                                   not re.match(r'["(\']\w+[")\']', token)),
                # ...or on both sides.
                'fullbracketed' : bool(re.match(r'["(\']\w+[")\']', token)),
                'length' : len(token_abbrev),
                'initial' : len(token_abbrev) == 1 and token_abbrev.isalpha(),
                'has.vowels' : bool(set(token_abbrev[1:]) & set(VOWELS_Y)),
                'just.letters' : token_abbrev.isalpha(),
                'roman' : set('xvi').issuperset(token_abbrev),
                'endswith.vowel' : token_abbrev.endswith(VOWELS_Y),
                'digits' : digits(token_abbrev),
                'metaphone1' : metaphone[0],
                # Fall back to the primary code when no secondary exists.
                'metaphone2' : (metaphone[1] if metaphone[1] else metaphone[0]),
                'more.vowels' : vowelRatio(token_abbrev),
                'in.names' : token_abbrev.upper() in ratios,
                'prepositions' : token_abbrev in PREPOSITIONS,
                'first.name' : ratios.get(token_abbrev.upper(), 0),
                'gender_ratio' : gender_names.get(token_abbrev, False),
                'possessive' : token_clean.endswith("'s")
                }
    # Prefix/suffix features up to length 5.
    reversed_token = token_abbrev[::-1]
    for i in range(1, len(token_abbrev)) :
        features['prefix_%s' % i] = token_abbrev[:i]
        features['suffix_%s' % i] = reversed_token[:i][::-1]
        if i > 4 :
            break
    # Character 3- and 4-gram indicator features.
    for tri_gram in ngrams(token_abbrev, 3) :
        features[tri_gram] = True
    for four_gram in ngrams(token_abbrev, 4) :
        features[four_gram] = True
    return features
def _rehashdm(eid):
    """<eid>

    Recalculate the doublemetaphone for a player (eid)
    Ex: 2330

    Returns True on success, None on a sqlite error.
    NOTE(review): Python 2 syntax (`except sqlite3.Error, e`); assumes the
    EID exists -- `row` is not checked before indexing.
    """
    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        cursor.execute("SELECT firstname, lastname FROM players WHERE eid=?", (eid,))
        row = cursor.fetchone()
        # calculate the dm on first, last
        fndm = doublemetaphone(row[0])
        lndm = doublemetaphone(row[1])
        # firstname and lastname are tuples.
    with sqlite3.connect(DB) as db:
        cursor = db.cursor()
        try:
            cursor.execute("UPDATE players SET fndm1=?, fndm2=?, lndm1=?, lndm2=? WHERE eid=?",
                           (fndm[0], fndm[1], lndm[0], lndm[1], eid,))
            db.commit()
            #return("I have successfully updated EID {0}'s doublemetaphone ({1}, {2})".format(eid, fndm, lndm))
            return True
        except sqlite3.Error, e:
            print("ERROR: _rehashdm: I cannot update EID {0}'s doublemetaphone: '{1}'".format(eid, e))
            return None
def suggest_name(name):
    """
    Suggest names based on a name
    :param name: The name to generate matches on
    :return: (candidate, distance) pairs sorted by Levenshtein distance,
             or [()] when no stored name shares the metaphone code
    """
    name = name.upper()
    data = names
    dmeta = doublemetaphone(name)[0]
    if dmeta not in data:
        return [()]
    # Rank all candidates sharing the phonetic code by edit distance.
    ranked = {candidate: levenshtein_distance(name, candidate)
              for candidate in data[dmeta]}
    return sorted(ranked.items(), key=lambda x: x[1])
def match_indices( self, search, indices ):
    """Intersect the candidate sets from every index named in `indices`
    for the given `search` record.

    Returns a FindResult; `soundalike` is True when any metaphone
    ('by_mp_*') index participated.  An empty/missing attribute or an
    empty running intersection aborts early.
    """
    # Look for a set intersection of one element between all source criteria.
    if self.debug: print ( 'match_indices: searchKeys=', indices )
    soundalike = False
    setCur = None
    for idx_name in indices:
        if self.debug: print ( "match_indices: matching on key:", idx_name )
        idx = getattr( self, idx_name )
        # The search value for this index's underlying field.
        v = getattr( search, self.field_from_index(idx_name), None )
        if not v or not idx:
            setCur = None
            if self.debug: print ( 'match_indices: missing attribute' )
            break
        # Best-effort name normalisation; non-name values pass unchanged.
        try:
            v = normalize_name_lookup( v )
        except:
            pass
        if self.debug: print ( 'match_indices: value=', v )
        found = set()
        if idx_name.startswith( 'by_mp_' ):
            # Soundalike index: union the buckets of every metaphone code.
            soundalike = True
            for mp in doublemetaphone(v.replace('-','').encode('utf8')):
                if mp and mp in idx:
                    found |= set(idx[mp])
        elif v in idx:
            found = set(idx[v])
        # Intersect with the running candidate set.
        if setCur is None:
            setCur = set(found)
        else:
            setCur &= set(found)
        if not setCur:
            if self.debug: print ( "match_indices: match failed. found=", found )
            break
    if self.debug: print ( "matched:", setCur )
    return FindResult( search, setCur, self, soundalike )
def pickle_data():
    """Read the states table from data.db, annotate each row with its
    primary metaphone name and a boolean is_territory flag, and pickle the
    list to us/states.pkl."""
    dbpath = os.path.abspath(os.path.join(PWD, 'data.db'))
    conn = sqlite3.connect(dbpath)
    conn.row_factory = dict_factory
    cursor = conn.cursor()
    cursor.execute("""SELECT * FROM states ORDER BY name""")
    states = []
    for record in cursor:
        # Phonetic key enables fuzzy name lookups later.
        record['name_metaphone'] = doublemetaphone(record['name'])[0]
        record['is_territory'] = record['is_territory'] == 1
        states.append(record)
    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(states, pkl_file)
def lookup(val, field=None, use_cache=True):
    """
    Semi-fuzzy state lookup: a best-effort attempt at resolving `val`.

    * two digits  -> FIPS code
    * two letters -> state abbreviation
    * otherwise   -> metaphone of the state name (tolerates misspellings
      that are phonetically accurate)

    Passing `field` skips the fuzzy matching and does an exact,
    case-sensitive comparison against that attribute of State objects.

    Non-None results are cached; bypass with use_cache=False.
    """
    from metaphone import doublemetaphone

    if field is None:
        # Infer the search field from the shape of the value.
        if FIPS_RE.match(val):
            field = 'fips'
        elif ABBR_RE.match(val):
            field = 'abbr'
            val = val.upper()
        else:
            field = 'name_metaphone'
            val = doublemetaphone(val)[0]

    cache_key = "%s:%s" % (field, val)
    if use_cache and cache_key in _lookup_cache:
        return _lookup_cache[cache_key]

    for state in STATES_AND_TERRITORIES:
        if val == getattr(state, field):
            _lookup_cache[cache_key] = state
            return state
def tokenFeatures(token) :
    """Build the feature dict for one name token: cleaned forms,
    punctuation flags, phonetic codes and prefix/suffix features."""
    if token in (u'&') :
        # '&' passes through untouched.
        token_clean = token_abbrev = token
    else :
        # Strip leading punctuation and trailing non-word/non-period chars.
        token_clean = re.sub(r'(^[\W]*)|([^.\w]*$)', u'', token)
        # Fully stripped, lower-cased alphanumeric form.
        token_abbrev = re.sub(r'\W', u'', token_clean.lower())
    metaphone = doublemetaphone(token_abbrev)
    features = {'nopunc' : token_abbrev,
                'abbrev' : token_clean.endswith('.'),
                'comma'  : token.endswith(','),
                'hyphenated' : '-' in token_clean,
                'contracted' : "'" in token_clean,
                'bracketed' : bool(re.match(r'["(\']\w+[")\']', token)),
                'length' : len(token_abbrev),
                'initial' : len(token_abbrev) == 1 and token_abbrev.isalpha(),
                'has.vowels' : bool(set(token_abbrev[1:]) & set(VOWELS_Y)),
                'roman' : set('xvi').issuperset(token_abbrev),
                'endswith.vowel' : token_abbrev.endswith(VOWELS_Y),
                'metaphone1' : metaphone[0],
                # Fall back to the primary code when no secondary exists.
                'metaphone2' : (metaphone[1] if metaphone[1] else metaphone[0]),
                'more.vowels' : vowelRatio(token_abbrev),
                'in.names' : float(token_abbrev.upper() in ratios),
                'first.name' : float(ratios.get(token_abbrev.upper(), 0)),
                'possessive' : token_clean.endswith("'s")
                }
    # Prefix/suffix features up to length 5.
    reversed_token = token_abbrev[::-1]
    for i in range(1, len(token_abbrev)) :
        features['prefix_%s' % i] = token_abbrev[:i]
        features['suffix_%s' % i] = reversed_token[:i][::-1]
        if i > 4 :
            break
    return features
def add( self, result ):
    """Add a Result and register it in every configured index.

    ``by_mp_*`` indices key on the non-empty double-metaphone codes of
    the field value (soundalike matching); all other indices key on the
    normalized field value.  Also records which Result fields have been
    seen with a truthy value.
    """
    self.results.append( result )

    # Track which fields are populated anywhere in this collection.
    for field in Result.Fields:
        if getattr( result, field, None ):
            self.hasField.add( field )

    for idx_name in self.Indices:
        field = self.field_from_index(idx_name)
        v = getattr( result, field, None )
        if not v:
            continue
        idx = getattr( self, idx_name )
        if idx_name.startswith( 'by_mp_' ):
            # Soundalike index: one bucket per non-empty metaphone code.
            # Hyphens are stripped first so hyphenated names compare equal.
            for mp in doublemetaphone(v.replace('-','').encode('utf8')):
                if mp:
                    idx.setdefault( mp, [] ).append( result )
        else:
            # Regular field index.
            assert idx_name != 'by_license' or v not in idx, 'Duplicate license: {}'.format(v)
            try:
                key = normalize_name_lookup(v)
            except Exception:
                # Fall back to the raw value if normalization fails.
                # (Was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.)
                key = v
            idx.setdefault( key, [] ).append( result )
from metaphone import doublemetaphone

# Read a whitespace-separated word list and write each word's two
# double-metaphone codes, one per line, to the output file.
filename = 'dict.txt'
file_out = "dict1"

# Use context managers so both files are closed even on error
# (the original left them open).
with open(filename, 'r') as dict_file:
    file_content = dict_file.read().split()

with open(file_out, 'w') as file_output:
    for word in file_content:
        word_list = doublemetaphone(word)
        output = str(word_list[0] + "\n" + word_list[1] + "\n")
        file_output.write(output)
def soundslike(word):
    """Return the primary and secondary double-metaphone codes of *word*
    concatenated into one string, or None for None input."""
    if word is None:
        return None
    primary, secondary = doublemetaphone(word)
    return primary + secondary
def get_metaphone_from_word(word):
    """Return the primary double-metaphone code for *word* when it is
    longer than one character, otherwise the secondary code.

    The original computed ``doublemetaphone(word)`` three times for a
    single result; the call is hoisted so it runs once.
    """
    primary, secondary = doublemetaphone(word)
    return primary if len(primary) > 1 else secondary
def add_source( self, source ):
    # Register another input Source on this collection.
    self.sources.append( source )

if __name__ == '__main__':
    # Ad-hoc debug driver: load one spreadsheet source and dump its
    # soundalike (double-metaphone) name indices.
    s = Source( 'CallupTest.xlsx', '2014 Result' )
    # errors = s.read( GetExcelReader(self.fname) )
    print ( s.by_mp_last_name )
    # NOTE(review): everything after this exit is unreachable debug code,
    # apparently left in place deliberately.
    sys.exit()
    #for r in s.results:
    #    print ( r )
    # Dump both metaphone indices sorted by code.
    for k, v in sorted( ((k, v) for k, v in s.by_mp_last_name.items()), key=operator.itemgetter(0) ):
        print ( '{}: {}'.format(k, ', '.join( Utils.removeDiacritic(r.full_name) for r in v )) )
    for k, v in sorted( ((k, v) for k, v in s.by_mp_first_name.items()), key=operator.itemgetter(0) ):
        print ( '{}: {}'.format(k, ', '.join( Utils.removeDiacritic(r.full_name) for r in v )) )
    # Report groups of results whose first AND last names share a
    # metaphone code (potential duplicates / soundalikes).
    for r in s.results:
        for p_last in doublemetaphone(r.last_name.replace('-','').encode('utf8')):
            if not p_last:
                continue
            p_last_set = s.by_mp_last_name[p_last]
            for p_first in doublemetaphone(r.first_name.replace('-','').encode('utf8')):
                p_first_set = s.by_mp_first_name[p_first]
                # Uses `&` — presumably these index values are sets here,
                # not lists; confirm against the index construction.
                p_last_first_set = p_last_set & p_first_set
                if len(p_last_first_set) > 1:
                    print ( ', '.join( u'({}, {}, {})'.format(
                        Utils.removeDiacritic(rr.full_name),
                        Utils.removeDiacritic(rr.nation_code),
                        rr.age,
                    ) for rr in p_last_first_set ) )
def soundslike(word):
    """Return the primary and secondary double-metaphone codes of *word*
    concatenated into one string.

    Returns None for None input instead of raising, consistent with the
    None-guarded soundslike() variant elsewhere in this file.
    """
    if word is None:
        return None
    t = doublemetaphone(word)
    return t[0] + t[1]
def soundalike_match( s1, s2 ):
    """Return True when the two strings share at least one
    double-metaphone code (hyphens are stripped before encoding)."""
    codes1 = doublemetaphone( s1.replace('-', '').encode('utf8') )
    codes2 = doublemetaphone( s2.replace('-', '').encode('utf8') )
    for code in codes2:
        if code in codes1:
            return True
    return False
from metaphone import doublemetaphone

# NOTE(review): Python 2 script (uses raw_input).  `string` shadows the
# stdlib module name and `input` shadows the builtin — worth renaming.
string = 'data/prior.txt'
with open(string, 'rb') as f:
    # Presumably a pickled word-frequency prior; confirm the file format.
    prior = pickle.load(f)

# Interactive loop: read a word, find candidates within edit distance 4
# from the trie, then score them phonetically.
while True:
    # print "Please give the word:"
    input = raw_input()
    input = input.upper()
    start = time.time()
    cands = trie.search(input, 4)
    # print "Got results from trie" + str(len(cands))
    edit = {}
    prob = {}
    for i in cands:
        # Levenshtein distance between each pairing of the query's and
        # candidate's metaphone codes; maxphval keeps the largest seen.
        # NOTE(review): the [1]x[1] pairing is missing and maxphval is
        # never consumed in this visible span — the snippet appears
        # truncated here.
        input_ph = doublemetaphone(input)
        word_ph = doublemetaphone(i[0])
        maxphval = -1
        if input_ph[0]!='' and word_ph[0]!='':
            phonetic_val = lev.distance(input_ph[0],word_ph[0])
            if maxphval<phonetic_val:
                maxphval = phonetic_val
        if input_ph[0]!='' and word_ph[1]!= '':
            phonetic_val = lev.distance(input_ph[0],word_ph[1])
            if maxphval<phonetic_val:
                maxphval = phonetic_val
        if input_ph[1]!= '' and word_ph[0]!='':
            phonetic_val = lev.distance(input_ph[1],word_ph[0])
def doubleMetaphone(field) :
    """Return the non-empty double-metaphone codes of *field* as a list."""
    codes = []
    for code in doublemetaphone(field):
        if code:
            codes.append(code)
    return codes
def metaphoneToken(field) :
    """Return the set of non-empty double-metaphone codes over the unique
    whitespace-separated tokens of *field*."""
    result = set()
    for token in set(field.split()):
        for code in doublemetaphone(token):
            if code:
                result.add(code)
    return result