def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create phonetic tokens from the string.

    Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        - "double_metaphone"
        - "nysiis"
        - "soundex"

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the
        second is a tuple with the tokens for first names. The tuple
        always contains exactly two elements. Only the first results of
        the double metaphone algorithm are included in the tuples.
    """
    if phonetic_algorithm == "soundex":
        error = (
            "The version of the 'fuzzy' package in use has a buggy soundex"
            " implementation (see https://github.com/yougov/fuzzy/issues/14 ),"
            " downgrade the package to 1.1 (compatible with Python 2 only) if"
            " you want to use the soundex phonetic encoding.")
        try:
            if fuzzy.Soundex(4)("fuzzy") != "F200":
                raise ValueError(error)
        except UnicodeDecodeError:
            raise ValueError(error)

    dm = fuzzy.DMetaphone()
    soundex = fuzzy.Soundex(5)
    phonetic_algorithms = {
        "double_metaphone": lambda y: (dm(y)[0] or b'').decode(),
        "nysiis": lambda y: fuzzy.nysiis(y),
        "soundex": lambda y: soundex(y)
    }

    tokens = tokenize_name(name)
    # Apply the selected phonetic algorithm to every token.
    tokens = tuple(
        map(
            lambda x: tuple(
                map(lambda y: phonetic_algorithms[phonetic_algorithm](y),
                    x)),
            tokens))
    return tokens

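# Hedged usage sketch for phonetic_tokenize_name above. It assumes the
# enclosing module provides tokenize_name() and imports fuzzy; the exact
# codes returned depend on the installed fuzzy version.
name = "Doe, John"
surname_tokens, first_name_tokens = phonetic_tokenize_name(name)
# Double metaphone is the default; the result is a pair of token tuples,
# e.g. something like (('T',), ('JN',)).
print(surname_tokens, first_name_tokens)
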
def match_citations_with_papers(papers):
    """
    Parameters:
        papers: JSON representing the papers. Each paper has a list of
            references.

    Returns:
        JSON representing edges in the citation network.

    Uses fuzzy matching to compare paper titles to the reference titles.
    If there is a match then an edge is added to the edge list. This is
    used to build the citations table.
    """
    edges = []
    i = 1
    for source in papers:
        print("doon", i)
        i += 1
        soundex = fuzzy.Soundex(25)
        for reference in source['references']:
            for target in papers:
                s = remove_stopwords(reference.encode('UTF-8'))
                t = remove_stopwords(target['title'].encode('UTF-8'))
                if soundex(s) == soundex(t):
                    edges.append({
                        'source': source['doi'],
                        'target': target['doi']
                    })
    return edges

def seq_matcher(name1, name2):
    # Strip accents so the Soundex encoder only sees ASCII characters.
    name1 = unicodedata.normalize('NFKD', name1) \
        .encode('ascii', 'ignore').decode('utf-8')
    if isinstance(name2, bytes):
        name2 = name2.decode('utf-8')
    name2 = unicodedata.normalize('NFKD', name2) \
        .encode('ascii', 'ignore').decode('utf-8')
    soundex = fuzzy.Soundex(4)
    name1 = soundex(name1)
    name2 = soundex(name2)
    # Alternative phonetic encodings that were tried:
    # dmeta = fuzzy.DMetaphone()
    # name1 = dmeta(name1)[0]
    # name2 = dmeta(name2)[0]
    # name1 = fuzzy.nysiis(name1)
    # name2 = fuzzy.nysiis(name2)
    m = SequenceMatcher(None, name1, name2)  # difflib similarity ratio (unused)
    e = editdist.distance(name1, name2)      # plain edit distance (unused)
    sm = StringMatcher(seq1=name1, seq2=name2)
    # Return the edit distance between the two Soundex codes.
    return sm.distance()

def __diffsoundex(self, query, name, value=4):
    soundex = fuzzy.Soundex(value)
    a = soundex(name)
    b = soundex(query)
    # Same leading letter: distance is the gap between the numeric parts.
    if a[0] == b[0]:
        return abs(int(a[1:]) - int(b[1:]))
    # Different leading letter: add a fixed penalty of 250.
    else:
        return abs(int(a[1:]) - int(b[1:])) + 250

def test_phonetic_tokenize_name_python2():
    """Test checking if custom phonetic algorithms from the fuzzy package work."""
    import fuzzy
    soundex = fuzzy.Soundex(5)
    assert phonetic_tokenize_name("Dupont, René", "nysiis") == (
        ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),)))
    assert phonetic_tokenize_name("Dupont, René", "soundex") == (
        # no direct support for unicode in soundex, thus "Rene"
        ((soundex(u"Dupont"),), (soundex(u"Rene"),)))

def get_soundex_dict(idf_dict):
    """Precompute the Soundex code of every unique token in the corpus."""
    soundex_dict = {}
    soundex = fuzzy.Soundex(4)
    for term in idf_dict.keys():
        # Encode with the shared Soundex instance.
        soundex_dict[term] = soundex(term)
    return soundex_dict

def compare(input_list, keywords_dictionary, word_weights):
    # Load phonetics functions
    dmeta = fuzzy.DMetaphone()
    metaphone = lambda x: dmeta(x)[0]
    soundex = fuzzy.Soundex(4)
    phonetics_methods = [metaphone, soundex]

    # Initiate empty dictionary for scores
    scores = {}

    # Iterate through methods for solving, then iterate through words in
    # scrubbed user input. For each word, compare phonetics to all keywords
    # and add score to the scores dictionary. After, do normal QWERTY and LD
    # analyses.
    for method, keywords in keywords_dictionary.items():
        scores[method] = 0

        # Phonetic scoring methods
        for phonetic in phonetics_methods:
            formatted_array = np.asarray([phonetic(k) for k in keywords])
            for word in input_list:
                formatted_word = phonetic(word)
                dist_array = \
                    normalized_damerau_levenshtein_distance_withNPArray(
                        formatted_word, formatted_array)
                dist = min(dist_array)

                # Handle cases where "not" was found within the input - add to
                # scores dictionary.
                weight = word_weights.get(word) or 1
                scores[method] += weight * math.sqrt(dist)

        # For QWERTY and Damerau-Levenshtein distances, calculate the differences
        for word in input_list:
            # Do QWERTY keyboard analysis
            dist_array = normalized_keyboard_word_distance_withNPArray(
                word, keywords)
            dist = min(dist_array)

            # Handle weighting for position from "not"
            weight = word_weights.get(word) or 1
            scores[method] += weight * math.sqrt(dist)

            # Do normal LD analysis
            dist_array = normalized_damerau_levenshtein_distance_withNPArray(
                word, np.asarray(keywords))
            dist = min(dist_array)
            weight = word_weights.get(word) or 1
            scores[method] += weight * math.sqrt(dist)

    return scores

def phonetic_matching(s1, s2):
    """Compute the phonetic encodings of two strings and use the
    Levenshtein distance to measure their similarity.

    :param s1: First string.
    :param s2: Second string.
    :returns: The Levenshtein distance between the two phonetic
        representations.
    """
    soundex = fuzzy.Soundex(4)
    return levenshtein_distance(soundex(s1), soundex(s2))

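# Hedged usage sketch for phonetic_matching above. It assumes
# levenshtein_distance is available in the enclosing module (the distance
# function from the python-Levenshtein package would fit). Classic Soundex
# collisions such as "Robert"/"Rupert" (both R163) should give distance 0.
from Levenshtein import distance as levenshtein_distance

print(phonetic_matching("Robert", "Rupert"))  # 0: identical Soundex codes
print(phonetic_matching("Robert", "Miller"))  # > 0: different codes
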
def using_soundex():
    soundex = fuzzy.Soundex(4)
    soundex_predics = []
    # "misspell" and "dict" are module-level word lists; note that the
    # latter shadows the built-in dict.
    for element in misspell:
        temp = []
        for elem in dict:
            if soundex(element) == soundex(elem):
                temp.append(elem)
        soundex_predics.append(temp)
    return soundex_predics

def generateSoundexHash(self, dictionary, table=None):
    soundexHash = {} if table is None else table
    for name, gender in dictionary.items():
        name = self._sanitizeName(name)
        if len(name) > 1:
            soundhash = fuzzy.Soundex(4)(name)
            self._appendToDict(soundhash, gender, soundexHash)
    return soundexHash

def checkIfSongExists(curr_song, songs_list):
    retVal = False
    matched_song = ""
    song_name = curr_song['name']
    for s in songs_list:
        song = songs_list[s]['name']
        # Size the Soundex encoder to the longer of the two titles.
        if len(song) > len(song_name):
            soundex = fuzzy.Soundex(len(song))
        else:
            soundex = fuzzy.Soundex(len(song_name))
        phonetic_distance = fuzz.ratio(soundex(song), soundex(song_name))
        if '(' in song.lower() and '(' in song_name.lower():
            parmatch, tryagain = getparanthesismatch(song.lower(),
                                                     song_name.lower())
            if parmatch:
                if (curr_song['artistName'].lower() ==
                        songs_list[s]['artistName'].lower() and
                        checkFtArtist(curr_song['featArtists'],
                                      songs_list[s]['featArtists'])):
                    retVal = True
                    matched_song = s
                    break
        normal_distance = fuzz.ratio(song.lower(), song_name.lower())
        if phonetic_distance >= 90 and normal_distance >= 85:
            if (curr_song['artistName'].lower() !=
                    songs_list[s]['artistName'].lower()):
                continue
            if not checkFtArtist(curr_song['featArtists'],
                                 songs_list[s]['featArtists']):
                continue
            retVal = True
            matched_song = s
            break
    return retVal, matched_song

class Validator:
    soundex = fuzzy.Soundex(4)

    def __init__(self, key_sentences: dict):
        self.key_sentences = key_sentences

    def validate_phonetic_similarities(self, input_text: str,
                                       key_sentence: str):
        key_length = len(key_sentence)
        # Allow fewer edit operations for short key sentences.
        if key_length <= 5:
            levenshtein_max_op = 2
        else:
            levenshtein_max_op = 4
        if Levenshtein.distance(input_text, key_sentence) <= levenshtein_max_op:
            return True
        # if self.soundex(key_sentence) == self.soundex(input_text):
        #     return True
        return False

    def validate_wakeupword(self, input_sentence: str):
        self.wake_up_word = self.key_sentences.get("wake_up_word")
        return self.validate_phonetic_similarities(input_sentence,
                                                   self.wake_up_word)

    def validate_input(self, input_text):
        if not isinstance(input_text, str):
            return DataProcessingResult(success=False, is_wake_up_word=False,
                                        sentence=input_text, guess="Invalid")
        if self.validate_wakeupword(input_text):
            return DataProcessingResult(success=True, is_wake_up_word=True,
                                        sentence=input_text,
                                        guess=self.wake_up_word)
        for key_sentence in self.key_sentences.get("commands"):
            if self.validate_phonetic_similarities(input_text, key_sentence):
                return DataProcessingResult(success=True,
                                            is_wake_up_word=False,
                                            sentence=input_text,
                                            guess=key_sentence)
        return DataProcessingResult(success=False, is_wake_up_word=False,
                                    sentence=input_text, guess="No guess")

def __init__(self, token=None, email=None, password=None):
    self.soundex = fuzzy.Soundex(4)
    self.token = token
    self.email = email
    self.password = password
    self.access_token = None
    self.circles = {}
    self.people = {}
    self.scan = False
    self.delay = 10
    if self.token is None:
        self.setDefaultToken()

def tokens_to_vocab(data, phonetic_encoding="none"):
    """
    Used for token-level string edit distance or phonetic encoding.
    Transforms tokens to a vocabulary (t1 = a, t2 = b, t3 = c...).

    :param data: matrix (data_size by 3); one line is "s1 \t s2 \t label",
        where s1 is made of tokens t1 t2 t3...
    :param phonetic_encoding: name of the phonetic encoding being used.
        "soundex": Soundex encoding
        "nysiis": NYSIIS encoding
        Anything else: the program will shut down.
    :return: array of size (data_size by 2) where each line is [s1, s2],
        with s1/s2 encoded appropriately.
    """
    new_data = []
    if phonetic_encoding == "soundex":
        soundex = fuzzy.Soundex(4)
    ct = 0
    for line in data:
        ct += 1
        if ct % 10000 == 0:
            print("{} tokenized!".format(ct))
        tokens_made = []
        new_strings = ["", ""]
        for idx in [0, 1]:
            # Each line is tab-separated: s1 \t s2 \t label.
            phrase = line.rstrip().split("\t")[idx]
            for token in phrase.split(" "):
                # Phonetic encoding will only work with string edit distance.
                if phonetic_encoding == "nysiis":
                    token = fuzzy.nysiis(token)
                elif phonetic_encoding == "soundex":
                    token = token.encode("UTF-8")
                    try:
                        token = soundex(token)
                    except Exception:
                        print("Could not apply soundex to {}".format(token))
                else:
                    print("Provide phonetic encoding")
                    sys.exit(1)
                if token not in tokens_made:
                    tokens_made.append(token)
                if len(tokens_made) > 100:
                    print("TOO LONG!", tokens_made)
                # Map each distinct token to one printable character.
                new_strings[idx] += chr(ord("!") + tokens_made.index(token))
            new_strings[idx] = new_strings[idx].encode("UTF-8")
        new_data.append(new_strings)
    return new_data

def determineFromSoundex(self, firstName):
    hashTable = {}
    self.generateSoundexHash(self.firstDict, hashTable)
    self.generateSoundexHash(self.secondDict, hashTable)
    firstName = self._sanitizeName(firstName)
    nameHash = fuzzy.Soundex(4)(firstName)
    if nameHash in hashTable:
        results = hashTable[nameHash]
        gender = max(results, key=results.get)
        if results[gender] > 0:
            return gender
    return self.options['unknown']

def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create phonetic tokens from the string.

    Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        - "double_metaphone"
        - "nysiis" (only for Python 2)
        - "soundex" (only for Python 2)

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the
        second is a tuple with the tokens for first names. The tuple
        always contains exactly two elements. Only the first results of
        the double metaphone algorithm are included in the tuples.
    """
    if sys.version[0] == '2':
        import fuzzy
        dm = fuzzy.DMetaphone()
        soundex = fuzzy.Soundex(5)
        phonetic_algorithms = {
            "double_metaphone": lambda y: dm(y)[0] or '',
            "nysiis": lambda y: fuzzy.nysiis(y),
            "soundex": lambda y: soundex(y)
        }
    else:
        from ..ext.metaphone import dm
        phonetic_algorithms = {"double_metaphone": lambda y: dm(y)[0]}

    tokens = tokenize_name(name)
    # Apply the selected phonetic algorithm to every token.
    tokens = tuple(
        map(
            lambda x: tuple(
                map(lambda y: phonetic_algorithms[phonetic_algorithm](y),
                    x)),
            tokens))
    return tokens

def main():
    dictFp = open(sys.argv[1], 'r')
    inputFp = open(sys.argv[2], 'r')
    soundex = fuzzy.Soundex(5)

    # Load and sort the dictionary.
    d = []
    for line in dictFp:
        d.append(line.strip())
    d = sorted(d)

    # Precompute the Soundex code of every dictionary word.
    s = []
    for a in d:
        s.append(soundex(a))

    done = set()
    for line in inputFp:
        uid, tid, tweet, date = line.split('\t')
        words = tweet.split()
        for word in words:
            if word in done:
                continue
            done.add(word)
            sdex = soundex(word)
            output = []
            for i in range(len(s)):
                if s[i] == sdex:
                    output.append(d[i])
            if len(output):
                # Print up to 10 dictionary words sharing the Soundex code.
                sys.stdout.write("{}: ".format(word))
                i = 0
                for w in output:
                    if w != word:
                        sys.stdout.write(w + " ")
                        i += 1
                    if i == 10:
                        break
                print("")

def match_soundex(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    bestMatch = ""
    soundex = fuzzy.Soundex(4)
    soundex_token = soundex(token)
    # Keep every dictionary word whose Soundex code matches the token's.
    candidates = [
        match for match in dictSet if soundex(match) == soundex_token
    ]
    if len(candidates) > 1:
        # Break ties with an n-gram search over the candidates.
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]
    return bestMatch, candidates, candidatesG

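# A minimal standalone sketch of the Soundex-bucket idea used by
# match_soundex above, with a small in-memory word list standing in for
# getDict() and without the n-gram tie-breaking step.
import fuzzy

def soundex_candidates(token, words):
    sdx = fuzzy.Soundex(4)
    code = sdx(token)
    # Every word sharing the token's Soundex code is a correction candidate.
    return [w for w in words if sdx(w) == code]

print(soundex_candidates("colour", ["color", "cooler", "collar", "dollar"]))
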
def soundex(collection, zero=False):
    """
    Returns a Soundex-encoded version of the collection.
    """
    import fuzzy
    soundex = fuzzy.Soundex(4)
    if not isinstance(collection, list):
        print("Input collection is not a list.")
    collectionEncoded = list()
    for i, word in enumerate(tqdm(collection)):
        wordEncoded = soundex(word)
        if not zero:
            # Optional: remove trailing 0s from the code.
            wordEncoded = wordEncoded.strip('0')
        collectionEncoded.append(wordEncoded)
    return collectionEncoded

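# Hedged usage of the collection encoder above (assumes tqdm is installed).
# By default, trailing 0s are stripped from each code.
names = ["smith", "smyth", "jones"]
print(soundex(names))             # e.g. ['S53', 'S53', 'J52']
print(soundex(names, zero=True))  # e.g. ['S530', 'S530', 'J520']
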
def match_levenshtein_soundex(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    bestMatch = ""
    minDistance = 3
    # Collect all dictionary words at the minimal Levenshtein distance.
    for item in dictSet:
        distance = Levenshtein.distance(token.lower(), item.lower())
        if distance == 0:
            return item, [], []
        elif distance < minDistance:
            minDistance = distance
            candidates = []
        if distance == minDistance:
            candidates.append(item.lower())
    # Prefer candidates that also share the token's Soundex code.
    soundex = fuzzy.Soundex(4)
    soundex_token = soundex(token)
    soundex_candidates = [
        match for match in candidates if soundex(match) == soundex_token
    ]
    if len(soundex_candidates) != 0:
        candidates = soundex_candidates
    if len(candidates) > 1:
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]
    return bestMatch, candidates, candidatesG

def __init__(self, left_on, right_on=None, **kwargs):
    super().__init__(**kwargs)
    self.left_on = listify(left_on)
    self.right_on = listify(right_on) if right_on else self.left_on
    self.soundex = fuzzy.Soundex(4)

def fingerprint_word(word):
    # Combine the numeric part of the Soundex code with the two-digit
    # length of the NYSIIS encoding into a single fingerprint string.
    return "%s%02d" % (fuzzy.Soundex(5)(word)[1:], len(fuzzy.nysiis(word)))

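# Hedged usage sketch for fingerprint_word above: because the fingerprint
# keeps only the digits of the Soundex code plus the NYSIIS length,
# phonetically close spellings tend to collide. Exact values depend on the
# installed fuzzy version.
print(fingerprint_word("phone"))
print(fingerprint_word("fone"))  # often identical to the fingerprint above
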
def test_soundex_does_not_mutate_strings():
    phrase = 'FancyFree'
    fuzzy.Soundex(4)(phrase)
    buffer = ctypes.c_char_p(phrase.encode())
    assert buffer.value.decode() == "FancyFree"

def test_soundex_result():
    phrase = 'FancyFree'
    res = fuzzy.Soundex(4)(phrase)
    assert res == 'F521'

def test_soundex_non_ascii():
    assert fuzzy.Soundex(8)('Jéroboam') == 'J615'

def soundex(s, e=fuzzy.Soundex(4)):
    # Return a list to be like metaphone
    return [e(s)]

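# Hedged usage note for the wrapper above: binding e=fuzzy.Soundex(4) as a
# default argument constructs the encoder once at definition time and reuses
# it on every call.
print(soundex("Jackson"))  # e.g. ['J250']
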
def test_soundex_Test():
    assert fuzzy.Soundex(8)('Test') == 'T23'

def __init__(self):
    import fuzzy
    self.soundex = fuzzy.Soundex(4)

#!/usr/bin/env python
import fuzzy

wf = open("test_sents.txt", 'r')
sf = open("test_soundex.txt", 'w')

wlines = wf.readlines()
soundex = fuzzy.Soundex(4)

for i in range(len(wlines)):
    x = wlines[i].strip().split()
    sx = []
    for el in x:
        # Fall back to the placeholder "x_x" when a token cannot be encoded.
        sxcode = "x_x"
        try:
            sxcode = soundex(el)
            if not sxcode.strip():
                sxcode = "x_x"
        except Exception:
            sxcode = "x_x"
        sx.append(sxcode)
    sx = " ".join(sx)
    sf.write(sx + "\n")

import fuzzy

# Convert up to 10 characters to a phonetic code
soundex = fuzzy.Soundex(10)

# Text to process
word = 'phone'
soundex(word)

# Doc2Vec
# stemming, lemmatization, n-grams, stop word removal etc.

# Import packages
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Example document (list of sentences)
doc = ["I love data science",
       "I love coding in python",
       "I love building NLP tool",
       "This is a good phone",
       "This is a good TV",
       "This is a good laptop"]

# Tokenization of each document
tokenized_doc = []
for d in doc:
    tokenized_doc.append(word_tokenize(d.lower()))
tokenized_doc

# Convert tokenized documents into gensim-formatted tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
tagged_data

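# A possible continuation of the example above (an assumption, not part of
# the original): train a small Doc2Vec model on tagged_data and infer a
# vector for an unseen sentence.
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, epochs=50)
vector = model.infer_vector(word_tokenize("I love python".lower()))
print(vector[:5])  # first few dimensions of the inferred embedding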