Example #1
def test_phonetic_tokenize_name_python2():
    """Test checking if custom phonetic algorithms from fuzzy packages work."""
    import fuzzy
    soundex = fuzzy.Soundex(5)
    assert phonetic_tokenize_name("Dupont, René", "nysiis") == (
        ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),)))
    assert phonetic_tokenize_name("Dupont, René", "soundex") == (
        # no direct support for unicode in soundex, thus "Rene"
        ((soundex(u"Dupont"),), (soundex(u"Rene"),)))
Example #2
def calc_sim(candidate, poem, sim_range):
    inrange = [] #lines within sim_range
    if len(poem) <= sim_range:
        inrange = poem
    else:
        inrange = poem[-sim_range:]
    score = 0
    for line in inrange:  # score only the lines within sim_range
        for tok in line:
            for can_tok in candidate:
                score += pylev.distance( fuzzy.nysiis(tok), fuzzy.nysiis(can_tok))
    return score
Example #3
def phx(test):
    print '--> phx'
    print test
    # 1: Soundex
    soundex = fuzzy.Soundex(5)
    print soundex(test)

    # 2: Double Metaphone
    dmeta = fuzzy.DMetaphone()
    print dmeta(test)

    # 3: NYSIIS
    print fuzzy.nysiis(test)
Example #4
	def filter(self, tokens, replace):
		"""
		Filters the tokens.
		
		*tokens* (``list``) the tokens (``str``).
		
		*replace* (``bool``) is whether tokens created by this filter should
		replace the encountered tokens (``True``) or be appended after them
		(``False``). Default is ``False``.
		
		Returns the filtered (``list``) tokens (``str``).
		"""
		truncate = self.truncate
		found = set(t[0] for t in tokens) if not replace else set()
		final = []
		for token in tokens:
			if not replace:
				# Since we are not supposed to replace tokens, append each token
				# before it is consumed.
				final.append(token)
			# Run token through nysiis.
			result = fuzzy.nysiis(token[0])[:truncate]
			if result and result not in found:
				found.add(result)
				final.append((result, token[1], token[2], token[3]))
		return final
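For context, here is a standalone sketch of the same append-vs-replace behaviour (assumptions: tokens are 4-tuples of text, start, end and an extra field, as the indexing above implies, and the fuzzy package is installed; the class wrapper is dropped for brevity):

import fuzzy

def nysiis_filter(tokens, replace=False, truncate=6):
    """Standalone sketch mirroring the filter method above."""
    found = set(t[0] for t in tokens) if not replace else set()
    final = []
    for token in tokens:
        if not replace:
            final.append(token)  # keep the original token
        code = fuzzy.nysiis(token[0])[:truncate]
        if code and code not in found:  # skip empty or duplicate codes
            found.add(code)
            final.append((code, token[1], token[2], token[3]))
    return final

# With replace=False the originals are kept and new NYSIIS tokens are appended.
print(nysiis_filter([("Katherine", 0, 9, None), ("Catherine", 10, 19, None)]))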
Example #5
def phonetic_similarity(poem):
    # Phonetic representation of each line
    phonetic_lines = []
    for line in poem:
        phon_line = []
        for word in line:
            phon_line.append(fuzzy.nysiis(word))
        phonetic_lines.append(phon_line)

    # Calculate phonetic value
    phon_value = 0.1
    for i in range(0, len(phonetic_lines)-1):
        last_word = str(phonetic_lines[i][-1])
        next_last_word = str(phonetic_lines[i+1][-1])

        # Throw out identical sounding words (TODO: Check their original words, not phonetic representations)
        if last_word != next_last_word and len(last_word) > 0 and len(next_last_word) > 0:
            lev_dist = pylev.distance(last_word, next_last_word)

            # Normalize the distance by the combined word length: e.g. morning/mourning is a much closer match than hat/had
            lev_dist = lev_dist/float((len(last_word) + len(next_last_word)))
            phon_value += lev_dist
            #print 'Potential rhyme pair: '+last_word +' '+next_last_word+' with dist: '+str(lev_dist)
        else:
            phon_value += 1  # treat identical-sounding or empty endings as maximally dissimilar

    # Normalize by poem length (in case variable length poems are allowed)
    phon_value /= len(phonetic_lines)
    return phon_value
Example #6
def score_nysiis(words):
    """Score words using the nysiis algorithm.

    :param words (list): the list of words.
    :rtype scores (list): the scored words
    """
    return ['{}: {}'.format(w.lower(), fuzzy.nysiis(w)) for w in words]
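A quick usage illustration (the concrete NYSIIS codes depend on the installed fuzzy version, so none are asserted here):

import fuzzy

# Each entry pairs the lower-cased word with its NYSIIS code, e.g. "catherine: <code>".
print(score_nysiis(['Catherine', 'Kathryn']))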
Example #7
def Realm2Slug(region, realm):
    global realm2slug
    if region not in realm2slug:
        realm2slug[region] = LookupRegionSlugs(region)
    if realm not in realm2slug[region]:
        # Let's see if we can find a phonetic alias for it
        realm = fuzzy.nysiis(realm)
    return realm2slug[region].get(realm)
def initialize():
  global wordlist
  words = [] 
  with open('/usr/share/dict/words') as f:
    words = f.readlines()
  for w in words:
    if not w[0].isupper():
      wordlist.append((w, fuzzy.nysiis(w)))
def find_similar(word):
  word_sound = fuzzy.nysiis(word)
  best = None 
  best_dist = 99999
  for w in wordlist:
    if pylev.distance(word_sound, w[1]) < best_dist and word != w[0][:-1]:
      best_dist = pylev.distance(word_sound, w[1])
      best = (w[0], best_dist)
  return best    
Example #10
def name_nysiis(word):
    if word:
        try:
            result = fuzzy.nysiis(word)
            if result:
                word = result
        except ValueError:
            pass
    return word
Example #11
    def end_rhymes_with(self, line):
        """
        Returns a boolean to indicate whether two lines form an end rhyme.
        """
        if not self.has_same_ending_as(line):
            if (self.ending[0].pronunciation is not None) and (line.ending[0].pronunciation is not None):
                for e1 in self.ending:
                    for e2 in line.ending:
                        m1 = re.search('(?:%(VOWELS)s)(?:%(STRESSED)s)(?!.*(?:%(VOWELS)s)(?:%(STRESSED)s)).*$' % SYMBOLS_REGEX_OR,e1.pronunciation)
                        if m1 is not None:
                            r1 = m1.group()
                        else:
                            r1 = None

                        m2 = re.search('(?:%(VOWELS)s)(?:%(STRESSED)s)(?!.*(?:%(VOWELS)s)(?:%(STRESSED)s)).*$' % SYMBOLS_REGEX_OR,e2.pronunciation)
                        if m2 is not None:
                            r2 = m2.group()
                        else:
                            r2 = None

                        #If the two candidate endings rhyme, then the lines rhyme
                        if r1 == r2:
                            return True
            else:
                #Rhyme fallback... If at least one word was not in the dictionary, we have to use the fallback.
                #TODO: Improve the accuracy of this. Currently, we just take the largest substring starting with a vowel.
                #      We should iterate over the substrings and check for matches instead so that we capture more multi-syllabic rhymes.
                word1_nysiis = fuzzy.nysiis(self.ending[0].word)
                word2_nysiis = fuzzy.nysiis(line.ending[0].word)

                if len(word1_nysiis) <= len(word2_nysiis):
                    m = re.search('[aeiou].*',word1_nysiis)
                    if m is not None:
                        ending = m.group()
                        if word2_nysiis.endswith(ending):
                            return True
                else:
                    m = re.search('[aeiou].*', word2_nysiis)
                    if m is not None:
                        ending = m.group()
                        if word1_nysiis.endswith(ending):
                            return True

        return False
Example #12
    def keyword_matching(arguments, entities):
        words = arguments['keywords']
        phonics = set([])
        overlap = []

        for w in words:
            phonics.add(fuzzy.nysiis(w))

        for i in xrange(0, len(entities)):
            entity_name = nltk.word_tokenize(entities[i].name)
            entity_phonics = set([])
            for word in entity_name:
                entity_phonics.add(fuzzy.nysiis(word))
            common = len(phonics & entity_phonics) / float(len(entity_phonics))
            if common == 1:
                arguments['idx'] = i
                return
            overlap.append(common)
        arguments['idx'] = overlap.index(max(overlap))
Example #13
    def generateNysiisHash(self, dictionary, table=None):
        nysiisHash = {} if table is None else table

        for name, gender in dictionary.iteritems():
            name = self._sanitizeName(name)

            if len(name) > 1:
                nysiishash = fuzzy.nysiis(name)
                self._appendToDict(nysiishash, gender, nysiisHash)

        return nysiisHash
Example #14
    def determineFromMetaphone(self, firstName):
        hashTable = {}
        self.generateMetaphoneHash(self.firstDict, hashTable)
        self.generateMetaphoneHash(self.secondDict, hashTable)

        firstName = self._sanitizeName(firstName)
        nameHash  = fuzzy.nysiis(firstName)

        if nameHash in hashTable:
            results = hashTable[nameHash]
            gender  = max(results, key=results.get)

            if results[gender] > 0:
                return gender

        return self.options['unknown']
Example #15
def soundify(word):
	table = []
	table.append( ('', 0) ) 
	word = fuzzy.nysiis(word)
	for n in range(1,len(word)):
		opt_n_score = 99999
		opt_n_obj = None
		for m in range(0, n):
			opt_m = table[m]
			remaining = find_similar(word[m:n])
			if opt_m[1] + remaining[1] < opt_n_score:
				opt_n_score = opt_m[1] + remaining[1]
				opt_n_obj = (opt_m[0]+' '+remaining[0], opt_m[1] + remaining[1])
		print n
		table.append(opt_n_obj)
	return table[-1]
Example #16
def LookupRegionSlugs(region):
    global database
    if not database:
        ConnectDatabase(True)

    c = database.cursor()
    c.execute("""SELECT `name`, `slug` FROM `realmStatus` WHERE `region` = %s;""" ,
              (region,))
    d = c.fetchone()
    h = {}

    while d is not None:
        h[d[0]] = d[1]
        h[fuzzy.nysiis(d[0])] = d[1]
        d = c.fetchone()
    c.close()
    return h
Example #17
def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create Double Metaphone tokens from the string.

     Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.

    :param phonetic algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the second
        is a tuple with the tokens for first names. The tuple always contains
        exactly two elements. Only the first result of the double metaphone
        algorithm is included in the tuples.
    """
    if sys.version[0] == '2':
        import fuzzy
        dm = fuzzy.DMetaphone()
        soundex = fuzzy.Soundex(5)
        phonetic_algorithms = {
            "double_metaphone": lambda y: dm(y)[0] or '',
            "nysiis": lambda y: fuzzy.nysiis(y),
            "soundex": lambda y: soundex(y)
        }
    else:
        from ..ext.metaphone import dm
        phonetic_algorithms = {
            "double_metaphone": lambda y: dm(y)[0]
        }

    tokens = tokenize_name(name)
    # Apply the selected phonetic algorithm to every token
    encode = phonetic_algorithms[phonetic_algorithm]
    tokens = tuple(tuple(encode(y) for y in x) for x in tokens)

    return tokens
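A hedged usage sketch, mirroring the shapes exercised by the Python 2 test in Example #1 (only the structure is shown; the concrete codes depend on the phonetic backend):

# Default algorithm: double metaphone (available on both Python 2 and 3 above).
surname_codes, first_name_codes = phonetic_tokenize_name("Dupont, René")
# Each element is a tuple of phonetic codes, one per token, e.g.
# (('<code for Dupont>',), ('<code for René>',)).

# On Python 2 the "nysiis" and "soundex" variants can be selected as well:
# phonetic_tokenize_name("Dupont, René", "nysiis")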
Example #18
def aarnel_validation(data):
    """
    COEFICIENTE DE AARNEL EN VERSION GENERALIZADA PARA DOS STRINGS..
    Coeficiente de Aarnel es una mezcla de la distancia Levenshtein (LV) y
    Teoria de Metaphone (ME).
    Primero se calcula la distancia de los 2 string con LV, paralelamente se
    extraen las combinaciones foneticas con ME de los strings por individual y
    como resultado se obtienen 2 nuevos strings a los cuales tambien se le
    aplicara LV para compara su proximidad fonetica, posteriormente se les saca
    el promedio para obtener la media del cruce de proximidad literaria y
    proximidad fonetica.
    Parameters
    ----------
    address_1 : str
        String a comparar.
    address_2 : str
        String a comparar.
    Returns
    -------
    True:
        En caso del porcentaje obtenido sea mayor o igual al especificado por
        los administradores.
    False:
        En caso del porcentaje obtenido sea menor al especificado por los
        administradores
    :Authors:
        - Aarón Dominguez
        - Nel Perez
    Last Modification : 13.11.2017
    """
    pruebanadres4 = 0
    address_1 = data.get('address_1')
    address_2 = data.get('address_2')
    address_comparation = data.get('comparation_values')
    if address_1 is None or address_2 is None:
        return False
    try:
        digit = re.match('.+([0-9])[^0-9]*$', address_1)
        digit = digit.start(1)
        address_1 = address_1[:digit + 1]
        address_1 = address_1.upper()
    except:
        pass
    try:
        digit = re.match('.+([0-9])[^0-9]*$', address_2)
        digit = digit.start(1)
        address_2 = address_2[:digit + 1]
        address_2 = address_2.upper()
    except:
        pass
    cleaner = address_comparation
    if not cleaner:
        return False
    valid = truediv(address_comparation.get('percent_strength'), 100)
    cleaner_especials = address_comparation.get('special')
    cleaner_especials = cleaner_especials.split('|')
    for item in cleaner_especials:
        address_1 = address_1.replace(item, "")
        address_2 = address_2.replace(item, "")

    cleaner_words = address_comparation.get('word').upper()
    cleaner_words = cleaner_words.split('|')
    for item in cleaner_words:
        address_1 = address_1.replace(item + ' ', "")
        address_2 = address_2.replace(item + ' ', "")
    address_1 = address_1.replace(' ', "")
    address_2 = address_2.replace(' ', "")

    address_1 = address_1.replace(' ', "")
    address_2 = address_2.replace(' ', "")
    ratio_lev = Levenshtein.ratio(address_1, address_2)
    try:
        sound_first_dir = fuzzy.nysiis(address_1.upper())
        sound_second_dir = fuzzy.nysiis(address_2.upper())
        ratio_met = Levenshtein.ratio(sound_first_dir, sound_second_dir)
        media = (ratio_lev + ratio_met) / 2
    except:
        media = 0
    if media >= valid:
        return True
    else:
        return False
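Since the layout of `data` is only implied by the `.get()` calls above, here is a hedged sketch of an input dict and call; the field names come from the code, while the concrete values are purely illustrative:

data = {
    'address_1': 'Avenida Insurgentes Sur 1457',
    'address_2': 'Av. Insurgentes Sur 1457 Int. 3',
    'comparation_values': {
        'percent_strength': 80,       # acceptance threshold, in percent
        'special': '.|,|#',           # '|'-separated characters to strip
        'word': 'AVENIDA|CALLE',      # '|'-separated words to strip
    },
}
print(aarnel_validation(data))  # True if the averaged ratio >= 0.8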
#!/usr/bin/env python
# coding: utf-8

# ## 1. Sound it out!
# Grey and Gray. Colour and Color. Words like these have been the cause of many heated arguments between Brits and Americans. Accents (and jokes) aside, there are many words that are pronounced the same way but have different spellings. While it is easy for us to realize their equivalence, basic programming commands will fail to equate two such strings.
# More extreme than word spellings are names, because people have more flexibility in choosing how to spell a name. To some extent, tradition governs the way a name is spelled, which limits the number of variations of any given English name. But if we consider global names and their associated English spellings, you can only imagine how many ways they can be spelled out.
# One way to tackle this challenge is to write a program that checks if two strings sound the same, instead of checking for equivalence in spellings. We'll do that here using fuzzy name matching.

# In[104]:

# Importing the fuzzy package
import fuzzy

# Exploring the output of fuzzy.nysiis
fuzzy.nysiis('tufool')

# Testing equivalence of similar sounding words
fuzzy.nysiis('tomorrow') == fuzzy.nysiis('tommorow')
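# The word pairs mentioned above should also map to matching codes
# (hedged: the exact NYSIIS strings depend on the installed fuzzy version)
fuzzy.nysiis('colour') == fuzzy.nysiis('color')
fuzzy.nysiis('grey') == fuzzy.nysiis('gray')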

# In[105]:

get_ipython().run_cell_magic(
    'nose', '',
    "import sys\n\ndef test_fuzzy_is_loaded():\n    assert 'fuzzy' in sys.modules, \\\n    'The fuzzy module should be loaded'"
)

# ## 2. Authoring the authors
# The New York Times puts out a weekly list of best-selling books from different genres, which has been published since the 1930s. We'll focus on Children's Picture Books and analyze the gender distribution of authors to see if there have been changes over time. We'll begin by reading in the data on the best-selling authors from 2008 to 2017.
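
# A minimal sketch of that read-in step (assumption: the same semicolon-delimited
# datasets/nytkids_yearly.csv file that appears in Example #22 below)
import pandas as pd
author_df = pd.read_csv('datasets/nytkids_yearly.csv', sep=';')
author_df.head()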

# In[106]:
def generate_nysiis(word):
    ## https://pypi.python.org/pypi/Fuzzy
    if isinstance(word, unicode):
        word = word.encode('UTF-8')

    return [fuzzy.nysiis(word)]
Example #21
    def makeFuzzy(self, pattern, variants):
        variants.add(fuzzy.nysiis(pattern).lower())
        return self
Example #22
# Importing the fuzzy package
import fuzzy
# Exploring the output of fuzzy.nysiis
a = fuzzy.nysiis('color')
# Testing equivalence of similar sounding words
fuzzy.nysiis('color') == fuzzy.nysiis(a)
# Reading in datasets/nytkids_yearly.csv, which is semicolon delimited.
import pandas as pd
author_df = pd.read_csv('datasets/nytkids_yearly.csv', sep=';')

# Looping through author_df['Author'] to extract the authors first names
first_name = []
for name in author_df['Author']:
    first_name.append(name.split()[0])


# Adding first_name as a column to author_df
author_df['first_name'] = first_name 



# Checking out the first few rows of author_df
author_df.head()
import numpy

# Looping through author's first names to create the nysiis (fuzzy) equivalent
nysiis_name = []
for name in author_df['first_name']:
    nysiis_name.append(fuzzy.nysiis(name))

# Adding nysiis_name as a column to author_df
author_df['nysiis_name'] = nysiis_name
Example #23
def score_nysiis(words):
    scores = []
    for word in words:
        scored = '%s: %s' % (word.lower(), fuzzy.nysiis(word))
        scores.append(scored)
    return scores