def test_phonetic_tokenize_name_python2():
    """Test checking if custom phonetic algorithms from fuzzy packages work."""
    import fuzzy
    soundex = fuzzy.Soundex(5)
    expected_nysiis = ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),))
    assert phonetic_tokenize_name("Dupont, René", "nysiis") == expected_nysiis
    # no direct support for unicode in soundex, thus "Rene"
    expected_soundex = ((soundex(u"Dupont"),), (soundex(u"Rene"),))
    assert phonetic_tokenize_name("Dupont, René", "soundex") == expected_soundex
def calc_sim(candidate, poem, sim_range):
    """Score the phonetic similarity of *candidate* tokens against the
    last *sim_range* lines of *poem*.

    :param candidate: iterable of candidate tokens (str).
    :param poem: list of lines, each a list of tokens (str).
    :param sim_range: number of trailing poem lines to compare against.
    :return: int -- summed NYSIIS Levenshtein distances (lower = more similar).
    """
    # Restrict comparison to the last `sim_range` lines of the poem.
    if len(poem) <= sim_range:
        inrange = poem
    else:
        inrange = poem[-sim_range:]
    score = 0
    # BUG FIX: the original looped over the whole poem, ignoring the
    # `inrange` window it had just computed -- sim_range had no effect.
    for line in inrange:
        for tok in line:
            for can_tok in candidate:
                score += pylev.distance(
                    fuzzy.nysiis(tok), fuzzy.nysiis(can_tok))
    return score
def phx(test):
    """Print the Soundex, Double Metaphone and NYSIIS codes for *test*.

    Debug/demo helper (Python 2 print statements); output goes to stdout.
    """
    print '--> phx'
    print test
    # 1: Soundex code, padded/truncated to width 5
    soundex = fuzzy.Soundex(5)
    print soundex(test)
    # 2: Double Metaphone -- returns a (primary, alternate) pair
    dmeta = fuzzy.DMetaphone()
    print dmeta(test)
    # 3: NYSIIS code
    print fuzzy.nysiis(test)
def filter(self, tokens, replace):
    """
    Filters the tokens.

    *tokens* (``list``) the tokens (``str``).

    *replace* (``bool``) is whether tokens created by this filter should
    replace the encountered tokens (``True``), or if they should be
    combined (i.e., appended after) the encountered tokens. Default is
    ``False``.

    Returns the filtered (``list``) tokens (``str``).
    """
    limit = self.truncate
    # When combining, pre-seed `seen` with the raw token texts so that a
    # phonetic code identical to an original token is not emitted twice.
    seen = set() if replace else set(tok[0] for tok in tokens)
    out = []
    for tok in tokens:
        if not replace:
            # Keep the original token; its phonetic code is appended after.
            out.append(tok)
        # NYSIIS code, truncated to the configured length.
        code = fuzzy.nysiis(tok[0])[:limit]
        if code and code not in seen:
            seen.add(code)
            out.append((code, tok[1], tok[2], tok[3]))
    return out
def phonetic_similarity(poem):
    """Return a normalized phonetic-distance score for consecutive
    line endings of *poem*.

    :param poem: list of lines, each a non-empty list of words (str).
    :return: float -- lower values mean more rhyme-like endings;
        0.0 for an empty poem.
    """
    # Phonetic (NYSIIS) representation of each line.
    phonetic_lines = [[fuzzy.nysiis(word) for word in line] for line in poem]
    # FIX: an empty poem would divide by zero in the normalization below.
    if not phonetic_lines:
        return 0.0
    # Calculate phonetic value; small non-zero baseline.
    phon_value = 0.1
    for i in range(0, len(phonetic_lines) - 1):
        last_word = str(phonetic_lines[i][-1])
        next_last_word = str(phonetic_lines[i + 1][-1])
        # Throw out identical sounding words (TODO: Check their original
        # words, not phonetic representations)
        if last_word != next_last_word and len(last_word) > 0 and len(next_last_word) > 0:
            lev_dist = pylev.distance(last_word, next_last_word)
            # Divide distance by word length since e.g. morning - mourning
            # is a way better similarity than hat - had
            lev_dist = lev_dist / float((len(last_word) + len(next_last_word)))
            phon_value += lev_dist
        else:
            phon_value += 1  # very unsimilar for two words that are too short
    # Normalize by poem length (in case variable length poems are allowed)
    phon_value /= len(phonetic_lines)
    return phon_value
def score_nysiis(words):
    """Score words using the nysiis algorithm.

    :param words (list): the list of words.
    :rtype scores (list): the scored words
    """
    scored = []
    for word in words:
        scored.append('{}: {}'.format(word.lower(), fuzzy.nysiis(word)))
    return scored
def Realm2Slug(region, realm):
    """Resolve a realm name to its slug, caching per-region lookup tables.

    Falls back to the realm's NYSIIS phonetic alias when the exact name
    is not present (LookupRegionSlugs also indexes realms by NYSIIS code).
    """
    global realm2slug
    if region not in realm2slug:
        realm2slug[region] = LookupRegionSlugs(region)
    slugs = realm2slug[region]
    if realm not in slugs:
        # Unknown spelling -- try the phonetic alias instead.
        realm = fuzzy.nysiis(realm)
    return slugs.get(realm)
def initialize():
    """Populate the module-level `wordlist` cache from the system dictionary.

    Each entry is a (raw_line, nysiis_code) pair; raw lines keep their
    trailing newline. Capitalized entries (proper nouns) are skipped.
    """
    global wordlist
    with open('/usr/share/dict/words') as f:
        lines = f.readlines()
    for line in lines:
        if not line[0].isupper():
            wordlist.append((line, fuzzy.nysiis(line)))
def find_similar(word):
    """Find the dictionary word phonetically closest to *word*.

    :param word: the query word (str).
    :return: (raw_dictionary_line, distance) tuple, or None if the
        wordlist is empty. The raw line keeps its trailing newline.
    """
    word_sound = fuzzy.nysiis(word)
    best = None
    best_dist = 99999
    for w in wordlist:
        # PERF FIX: the original called pylev.distance twice per entry;
        # compute it once and reuse.
        dist = pylev.distance(word_sound, w[1])
        # w[0] retains its trailing newline, hence the [:-1] strip; skip
        # the query word itself so a *different* word is always returned.
        if dist < best_dist and word != w[0][:-1]:
            best_dist = dist
            best = (w[0], best_dist)
    return best
def name_nysiis(word):
    """Return the NYSIIS code for *word*, falling back to the input.

    Empty/None input, an encoding rejection by `fuzzy.nysiis`, or an
    empty code all yield the original *word* unchanged.
    """
    if not word:
        return word
    try:
        code = fuzzy.nysiis(word)
    except ValueError:
        # fuzzy rejects some inputs; keep the original spelling.
        return word
    return code if code else word
def end_rhymes_with(self, line):
    """
    Returns a boolean to indicate whether two lines form an end rhyme.

    When both endings have dictionary pronunciations, compares the
    pronunciations from the last stressed vowel onward; otherwise falls
    back to comparing NYSIIS codes from the first vowel onward.
    """
    if not self.has_same_ending_as(line):
        if (self.ending[0].pronunciation != None) and (line.ending[0].pronunciation != None):
            for e1 in self.ending:
                for e2 in line.ending:
                    # Tail of the pronunciation starting at the LAST
                    # stressed vowel (negative lookahead forbids a later one).
                    m1 = re.search('(?:%(VOWELS)s)(?:%(STRESSED)s)(?!.*(?:%(VOWELS)s)(?:%(STRESSED)s)).*$' % SYMBOLS_REGEX_OR, e1.pronunciation)
                    if m1 is not None:
                        r1 = m1.group()
                    else:
                        r1 = None
                    m2 = re.search('(?:%(VOWELS)s)(?:%(STRESSED)s)(?!.*(?:%(VOWELS)s)(?:%(STRESSED)s)).*$' % SYMBOLS_REGEX_OR, e2.pronunciation)
                    if m2 is not None:
                        r2 = m2.group()
                    else:
                        r2 = None
                    # If the two candidate endings rhyme, then the lines rhyme
                    # NOTE(review): r1 == r2 is also True when BOTH are None
                    # (no stressed vowel found) -- confirm that is intended.
                    if r1 == r2:
                        return True
        else:
            # Rhyme Fallback... If at least one word was not in the dictionary,
            # we have to use the fallback.
            # TODO: Improve the accuracy of this. Currently, we just get the
            # largest substring starting with a vowel. We should iterate over
            # the substrings and check for matches instead so that we capture
            # more multi-syllabic rhymes.
            word1_nysiis = fuzzy.nysiis(self.ending[0].word)
            word2_nysiis = fuzzy.nysiis(line.ending[0].word)
            if len(word1_nysiis) <= len(word2_nysiis):
                m = re.search('[aeiou].*', word1_nysiis)
                if m is not None:
                    ending = m.group()
                    if word2_nysiis.endswith(ending):
                        return True
            else:
                # BUG FIX: pattern was '[aeiou]' (a single vowel character),
                # so nearly any word1 ending in that vowel "rhymed"; use the
                # same vowel-to-end pattern as the symmetric branch above.
                m = re.search('[aeiou].*', word2_nysiis)
                if m is not None:
                    ending = m.group()
                    if word1_nysiis.endswith(ending):
                        return True
    return False
def keyword_matching(arguments, entities):
    """Select the entity whose name sounds most like the given keywords.

    Stores the index of the best phonetic (NYSIIS) match in
    ``arguments['idx']``; returns early on a perfect match. With no
    usable entities, ``arguments['idx']`` is left unset.
    """
    words = arguments['keywords']
    phonics = set([])
    overlap = []
    for w in words:
        phonics.add(fuzzy.nysiis(w))
    for i in xrange(0, len(entities)):
        entity_name = nltk.word_tokenize(entities[i].name)
        entity_phonics = set([])
        for word in entity_name:
            entity_phonics.add(fuzzy.nysiis(word))
        # FIX: an entity whose name tokenizes to nothing previously raised
        # ZeroDivisionError; treat it as zero overlap instead.
        if not entity_phonics:
            overlap.append(0.0)
            continue
        # FIX: force float division -- under Python 2 the original integer
        # division collapsed every partial overlap to 0, making the
        # max(overlap) ranking below meaningless.
        common = len(phonics & entity_phonics) / float(len(entity_phonics))
        if common == 1:
            # Perfect phonetic coverage: pick this entity immediately.
            arguments['idx'] = i
            return
        overlap.append(common)
    if overlap:
        arguments['idx'] = overlap.index(max(overlap))
def generateNysiisHash(self, dictionary, table=None):
    """Build (or extend) a NYSIIS-code -> gender-count table from a
    name -> gender mapping.

    When *table* is given it is extended in place and returned;
    otherwise a fresh dict is created.
    """
    result = table if table is not None else {}
    for name, gender in dictionary.iteritems():
        cleaned = self._sanitizeName(name)
        # Single-character names are too ambiguous to hash usefully.
        if len(cleaned) > 1:
            self._appendToDict(fuzzy.nysiis(cleaned), gender, result)
    return result
def determineFromMetaphone(self, firstName):
    """Guess a gender for *firstName* via the phonetic hash tables,
    returning the configured 'unknown' value when no table entry wins."""
    hashTable = {}
    self.generateMetaphoneHash(self.firstDict, hashTable)
    self.generateMetaphoneHash(self.secondDict, hashTable)
    firstName = self._sanitizeName(firstName)
    # NOTE(review): the table is built by generateMetaphoneHash but the
    # lookup key is computed with fuzzy.nysiis -- if that helper keys the
    # table with metaphone codes, this lookup can never hit. Confirm which
    # algorithm generateMetaphoneHash actually uses.
    nameHash = fuzzy.nysiis(firstName)
    if nameHash in hashTable:
        results = hashTable[nameHash]
        # Pick the gender with the highest observed count.
        gender = max(results, key=results.get)
        if results[gender] > 0:
            return gender
    return self.options['unknown']
def soundify(word):
    """Segment the NYSIIS code of *word* into dictionary-word sounds via
    dynamic programming, minimizing the total phonetic distance.

    Returns the last table entry: a (phrase, total_distance) pair.
    """
    # table[n] holds the best (phrase, cost) covering word[:n].
    table = []
    table.append( ('', 0) )
    word = fuzzy.nysiis(word)
    # NOTE(review): range(1, len(word)) never covers the final character,
    # so table[-1] describes word[:len(word)-1]. This looks like an
    # off-by-one -- confirm whether range(1, len(word) + 1) was intended.
    for n in range(1,len(word)):
        opt_n_score = 99999
        opt_n_obj = None
        for m in range(0, n):
            opt_m = table[m]
            # Best dictionary match for the word[m:n] chunk:
            # (raw_word, distance) from find_similar.
            remaining = find_similar(word[m:n])
            if opt_m[1] + remaining[1] < opt_n_score:
                opt_n_score = opt_m[1] + remaining[1]
                opt_n_obj = (opt_m[0]+' '+remaining[0], opt_m[1] + remaining[1])
        # Progress trace (Python 2 print statement).
        print n
        table.append(opt_n_obj)
    return table[-1]
def LookupRegionSlugs(region):
    """Fetch the name -> slug mapping for every realm in *region*.

    Each realm is also indexed under its NYSIIS phonetic code so callers
    can resolve misspelled realm names.
    """
    global database
    if not database:
        ConnectDatabase(True)
    cursor = database.cursor()
    # Parameterized query -- region is never interpolated into the SQL.
    cursor.execute("""SELECT `name`, `slug` FROM `realmStatus` WHERE `region` = %s;""" , (region,))
    slugs = {}
    row = cursor.fetchone()
    while row is not None:
        name, slug = row[0], row[1]
        slugs[name] = slug
        slugs[fuzzy.nysiis(name)] = slug
        row = cursor.fetchone()
    cursor.close()
    return slugs
def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create phonetic tokens from the string.

    Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:

        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the
        second is a tuple with the tokens for first names. The tuple
        always contains exactly two elements. Only the first results of
        the double metaphone algorithm are included in tuples.
    """
    # FIX: compare the version tuple rather than the first character of
    # sys.version -- the idiomatic, robust interpreter-version check.
    if sys.version_info[0] == 2:
        import fuzzy
        dm = fuzzy.DMetaphone()
        soundex = fuzzy.Soundex(5)
        phonetic_algorithms = {
            "double_metaphone": lambda y: dm(y)[0] or '',
            "nysiis": lambda y: fuzzy.nysiis(y),
            "soundex": lambda y: soundex(y)
        }
    else:
        from ..ext.metaphone import dm
        phonetic_algorithms = {
            # FIX: mirror the Python 2 branch's `or ''` so tokens are never
            # None when double metaphone yields no primary code.
            "double_metaphone": lambda y: dm(y)[0] or ''
        }

    tokens = tokenize_name(name)
    # Encode every token of both name parts with the chosen algorithm.
    tokens = tuple(map(lambda x: tuple(map(lambda y: phonetic_algorithms[
        phonetic_algorithm](y), x)), tokens))
    return tokens
def aarnel_validation(data):
    """
    AARNEL COEFFICIENT, GENERALIZED VERSION FOR TWO STRINGS.

    The Aarnel coefficient is a blend of Levenshtein distance (LV) and
    phonetic encoding. First the distance between the two strings is
    computed with LV; in parallel the phonetic codes of each string are
    extracted individually, yielding two new strings which are also
    compared with LV for phonetic proximity. The two ratios are then
    averaged to obtain the mean of literal and phonetic proximity.

    Parameters
    ----------
    data : dict
        Expects keys 'address_1' (str), 'address_2' (str) and
        'comparation_values' (dict with 'percent_strength', 'special'
        and 'word' entries).

    Returns
    -------
    True: when the obtained percentage is greater than or equal to the
        administrator-configured threshold.
    False: when the obtained percentage is below the threshold, or the
        input is incomplete.

    :Authors:
        - Aarón Dominguez
        - Nel Perez

    Last Modification: 13.11.2017
    """
    pruebanadres4 = 0  # NOTE(review): unused leftover variable
    address_1 = data.get('address_1')
    address_2 = data.get('address_2')
    address_comparation = data.get('comparation_values')
    if address_1 is None or address_2 is None:
        return False
    # Truncate each address just past its last digit (drop trailing text
    # after the street number) and upper-case it. Best-effort: any failure
    # (e.g. no digit present) leaves the address as-is.
    try:
        digit = re.match('.+([0-9])[^0-9]*$', address_1)
        digit = digit.start(1)
        address_1 = address_1[:digit + 1]
        address_1 = address_1.upper()
    except:  # NOTE(review): bare except is deliberate best-effort here
        pass
    try:
        digit = re.match('.+([0-9])[^0-9]*$', address_2)
        digit = digit.start(1)
        address_2 = address_2[:digit + 1]
        address_2 = address_2.upper()
    except:  # NOTE(review): bare except is deliberate best-effort here
        pass
    cleaner = address_comparation
    if not cleaner:
        return False
    # Acceptance threshold expressed as a fraction (percent_strength / 100).
    valid = truediv(address_comparation.get('percent_strength'), 100)
    # Strip the configured special characters from both addresses.
    cleaner_especials = address_comparation.get('special')
    cleaner_especials = cleaner_especials.split('|')
    for item in cleaner_especials:
        address_1 = address_1.replace(item, "")
        address_2 = address_2.replace(item, "")
    # Strip the configured stop-words (each followed by a space).
    cleaner_words = address_comparation.get('word').upper()
    cleaner_words = cleaner_words.split('|')
    for item in cleaner_words:
        address_1 = address_1.replace(item + ' ', "")
        address_2 = address_2.replace(item + ' ', "")
    # Remove all remaining spaces before comparing.
    address_1 = address_1.replace(' ', "")
    address_2 = address_2.replace(' ', "")
    # NOTE(review): duplicated no-op -- spaces were already removed above.
    address_1 = address_1.replace(' ', "")
    address_2 = address_2.replace(' ', "")
    # Literal similarity ratio.
    ratio_lev = Levenshtein.ratio(address_1, address_2)
    try:
        sound_first_dir = fuzzy.nysiis(address_1.upper())
        sound_second_dir = fuzzy.nysiis(address_2.upper())
        ratio_met = Levenshtein.ratio(sound_first_dir, sound_second_dir)
        # Mean of literal and phonetic similarity.
        media = (ratio_lev + ratio_met) / 2
    except:  # NOTE(review): bare except silently zeroes phonetic failures
        media = 0
    if media >= valid:
        return True
    else:
        return False
#!/usr/bin/env python # coding: utf-8 # ## 1. Sound it out! # <p>Grey and Gray. Colour and Color. Words like these have been the cause of many heated arguments between Brits and Americans. Accents (and jokes) aside, there are many words that are pronounced the same way but have different spellings. While it is easy for us to realize their equivalence, basic programming commands will fail to equate such two strings. </p> # <p>More extreme than word spellings are names because people have more flexibility in choosing to spell a name in a certain way. To some extent, tradition sometimes governs the way a name is spelled, which limits the number of variations of any given English name. But if we consider global names and their associated English spellings, you can only imagine how many ways they can be spelled out. </p> # <p>One way to tackle this challenge is to write a program that checks if two strings sound the same, instead of checking for equivalence in spellings. We'll do that here using fuzzy name matching.</p> # In[104]: # Importing the fuzzy package import fuzzy # Exploring the output of fuzzy.nysiis fuzzy.nysiis('tufool') # Testing equivalence of similar sounding words fuzzy.nysiis('tomorrow') == fuzzy.nysiis('tommorow') # In[105]: get_ipython().run_cell_magic( 'nose', '', "import sys\n\ndef test_fuzzy_is_loaded():\n assert 'fuzzy' in sys.modules, \\\n 'The fuzzy module should be loaded'" ) # ## 2. Authoring the authors # <p>The New York Times puts out a weekly list of best-selling books from different genres, and which has been published since the 1930’s. We’ll focus on Children’s Picture Books, and analyze the gender distribution of authors to see if there have been changes over time. We'll begin by reading in the data on the best selling authors from 2008 to 2017.</p> # In[106]:
def generate_nysiis(word):
    """Return the NYSIIS code of *word* as a one-element list.

    See https://pypi.python.org/pypi/Fuzzy
    """
    # fuzzy.nysiis requires a byte string under Python 2.
    if isinstance(word, unicode):
        word = word.encode('UTF-8')
    return [fuzzy.nysiis(word)]
def makeFuzzy(self, pattern, variants):
    """Add the lower-cased NYSIIS code of *pattern* to *variants*.

    Returns self so calls can be chained (fluent interface).
    """
    phonetic = fuzzy.nysiis(pattern).lower()
    variants.add(phonetic)
    return self
# Importing the fuzzy package import fuzzy # Exploring the output of fuzzy.nysiis a=fuzzy.nysiis('color') # Testing equivalence of similar sounding words fuzzy.nysiis('color')==fuzzy.nysiis(a) # Reading in datasets/nytkids_yearly.csv, which is semicolon delimited. author_df=pd.read_csv('datasets/nytkids_yearly.csv',';') # Looping through author_df['Author'] to extract the authors first names first_name = [] for name in author_df['Author']: first_name.append(name.split()[0]) # Adding first_name as a column to author_df author_df['first_name'] = first_name # Checking out the first few rows of author_df author_df.head() import numpy # Looping through author's first names to create the nysiis (fuzzy) equivalent nysiis_name = [] for name in author_df['first_name']: nysiis_name.append(fuzzy.nysiis(name)) # Adding nysiis_name as a column to author_df author_df['nysiis_name']=nysiis_name
def score_nysiis(words):
    """Pair each word (lower-cased) with its NYSIIS code.

    Returns a list of '<word>: <code>' strings, one per input word.
    """
    return ['%s: %s' % (word.lower(), fuzzy.nysiis(word)) for word in words]