from pyphonetics import Soundex


def test_soundex():
    # (expected Soundex code, input word) pairs
    tests = [
        ('R163', 'Rupert'), ('R163', 'Robert'), ('R150', 'Rubin'),
        ('A261', 'Ashcroft'), ('A261', 'Ashcraft'), ('T522', 'Tymczak'),
        ('P123', 'Pfister'), ('A536', 'Andrew'), ('W252', 'Wozniak'),
        ('C423', 'Callister'), ('H400', 'Hello'), ('M635', 'Martin'),
        ('B656', 'Bernard'), ('F600', 'Faure'), ('P620', 'Perez'),
        ('G620', 'Gros'), ('C120', 'Chapuis'), ('B600', 'Boyer'),
        ('G360', 'Gauthier'), ('R000', 'Rey'), ('B634', 'Barthélémy'),
        ('H560', 'Henry'), ('M450', 'Moulin'), ('R200', 'Rousseau')
    ]

    soundex = Soundex()
    for expected, word in tests:
        assert soundex.phonetics(word) == expected
import pytest

from pyphonetics import Soundex
from pyphonetics.exceptions import EmptyStringError  # assumed module path for EmptyStringError


def test_soundex():
    soundex = Soundex()
    assert soundex.phonetics('h') == 'H000'
    assert soundex.phonetics('d') == 'D000'
    with pytest.raises(EmptyStringError):
        soundex.phonetics('')
from pyphonetics import Soundex, Metaphone, FuzzySoundex, Lein, RefinedSoundex


class PhoneticModule:
    def __init__(self):
        self.soundex = Soundex()
        self.metaphone = Metaphone()
        self.fuzzySoundex = FuzzySoundex()
        self.lein = Lein()
        self.refinedSoundex = RefinedSoundex()
        # Equal weight for each of the five phonetic algorithms
        self.phoneticSimilarityWeight = {
            'soundex': 0.2,
            'metaphone': 0.2,
            'fuzzySoundex': 0.2,
            'lein': 0.2,
            'refinedSoundex': 0.2,
        }

    def Calculation(self, word1, word2):
        # Weighted sum of the per-algorithm Levenshtein distances
        res = self.SoundexMethod(word1, word2) * self.phoneticSimilarityWeight['soundex']
        res += self.MetaphoneMethod(word1, word2) * self.phoneticSimilarityWeight['metaphone']
        res += self.FuzzySoundexMethod(word1, word2) * self.phoneticSimilarityWeight['fuzzySoundex']
        res += self.LeinMethod(word1, word2) * self.phoneticSimilarityWeight['lein']
        res += self.RefinedSoundexMethod(word1, word2) * self.phoneticSimilarityWeight['refinedSoundex']
        print(res)
        return res

    def PhoneticLayerCreation(self, word):
        # Underscore-joined codes from all five algorithms
        string = self.soundex.phonetics(word)
        string += '_' + self.metaphone.phonetics(word)
        string += '_' + self.fuzzySoundex.phonetics(word)
        string += '_' + self.lein.phonetics(word)
        string += '_' + self.refinedSoundex.phonetics(word)
        return string

    def SoundexMethod(self, word1, word2):
        return self.soundex.distance(word1, word2, metric='levenshtein')

    def MetaphoneMethod(self, word1, word2):
        return self.metaphone.distance(word1, word2, metric='levenshtein')

    def FuzzySoundexMethod(self, word1, word2):
        return self.fuzzySoundex.distance(word1, word2, metric='levenshtein')

    def LeinMethod(self, word1, word2):
        return self.lein.distance(word1, word2, metric='levenshtein')

    def RefinedSoundexMethod(self, word1, word2):
        return self.refinedSoundex.distance(word1, word2, metric='levenshtein')
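# A minimal usage sketch for PhoneticModule above. The example words are
# made-up; the numeric score depends entirely on pyphonetics' distance values.
module = PhoneticModule()
score = module.Calculation('Catherine', 'Katharine')   # weighted phonetic distance
layer = module.PhoneticLayerCreation('Catherine')      # combined code, e.g. for indexing
print(score, layer)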
def test_soundex_homophones():
    tests = [
        ('Braz', 'Broz'),
        ('Caren', 'Caron', 'Carren', 'Charon', 'Corain', 'Coram', 'Corran',
         'Corrin', 'Corwin', 'Curran', 'Curreen', 'Currin', 'Currom',
         'Currum', 'Curwen'),
        ('Hairs', 'Hark', 'Hars', 'Hayers', 'Heers', 'Hiers'),
        ('Lambard', 'Lambart', 'Lambert', 'Lambird', 'Lampaert', 'Lampard',
         'Lampart', 'Lamperd', 'Lampert', 'Lamport', 'Limbert', 'Lombard'),
        ('Nolton', 'Noulton')
    ]

    soundex = Soundex()
    for test in tests:
        phonetics = [soundex.phonetics(word) for word in test]
        # all words in the group share one code, so the set has size 1
        assert len(set(phonetics)) == 1
def get_info(self):
    recognize = self.__params__["recognize"]
    recognize = "".join(recognize)
    base = self.__params__["base"]
    attributes = {
        "index": self.__params__["index"],
        "miscue_base": base,
        "miscue_result": recognize
    }
    rs = RefinedSoundex()
    soundex = Soundex()
    if str(base).isnumeric() or str(recognize).isnumeric():
        # Numeric tokens are not compared phonetically
        attributes['type'] = 0
        return attributes
    else:
        distance = rs.distance(base, recognize)
        sounds = soundex.sounds_like(base, recognize)
        if distance < 1 or sounds:
            # Close phonetic match: RefinedSoundex distance below 1
            # or identical Soundex codes
            attributes['type'] = 0
            return attributes
        else:
            attributes['type'] = 2
            return attributes
import markovify
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from Phyme import Phyme
from Phyme.util import flatten
from pyphonetics import Soundex
from subprocess import Popen

nltk.download('cmudict')
ph = Phyme()
soundex = Soundex()

from twilio.rest import Client

# Your Account Sid and Auth Token from twilio.com/console
# DANGER! This is insecure. See http://twil.io/secure
account_sid = ' '
auth_token = ' '
client = Client(account_sid, auth_token)

print("Loading model... ")
with open('small_markov_model_state_3.json', 'rb') as f:
    text = f.read()
markov_model = markovify.NewlineText.from_json(text)


# returns list of words that rhyme
def find_rhymes(word):
    rhyme_list = []
    try:
        rhyme_list.append(
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 24 16:16:46 2020

@author: Vikee
"""
from pyphonetics import Soundex

soundex = Soundex()

# 'Allowed' and 'Aloud' encode to the same Soundex code,
# so sounds_like() returns True
a = soundex.phonetics('Allowed')
b = soundex.phonetics('Aloud')
c = soundex.sounds_like('Allowed', 'Aloud')
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pyphonetics import Soundex, Metaphone, RefinedSoundex, FuzzySoundex


def phonetic(data, text):
    def clean_text(raw_text):
        # Keep only letters, whitespace and ':'
        regex = re.compile(r'[^a-zA-Z\s:]')
        filtered_text = regex.sub('', raw_text)
        # Collapse runs of whitespace into single spaces
        filtered_text = re.sub(r'[\s]+', ' ', filtered_text)
        # Strip leading and trailing spaces
        return filtered_text.strip()

    words = data
    sanitized_text = clean_text(text)
    token = word_tokenize(sanitized_text)

    # Remove stop words
    stop_words = stopwords.words('english')
    token = [word for word in token if word not in stop_words]
    n = len(token)

    soundex = Soundex()
    metaphone = Metaphone()
    rs = RefinedSoundex()
    fs = FuzzySoundex()
    algorithms = [soundex, metaphone, rs, fs]

    # Build all n-grams (1-grams up to (n-1)-grams) of the remaining tokens
    cc = dict()
    for i in range(1, n):
        ngram_list = list(nltk.ngrams(token, i))
        cc[str(i)] = [" ".join(gram) for gram in ngram_list]
    ngrams = sum(cc.values(), [])
    ngrams = [item for item in ngrams if not item.isdigit()]

    # Map an n-gram to a target word when at least three of the four
    # phonetic algorithms agree that they sound alike
    dict1 = dict()
    for i in ngrams:
        for j in words:
            total = 0
            for entry in algorithms:
                if entry.sounds_like(i, j):
                    total += 1
            if total >= 3:
                dict1[str(i)] = j
    dict1 = dict(reversed(list(dict1.items())))
    print(dict1)

    def multipleReplace(sentence, replacements):
        """Take a text and replace words that match a key in `replacements`
        with the associated value; return the changed text."""
        # Strip punctuation (except commas) from every word first
        punct = "".join(p for p in string.punctuation if p != ",")
        sentence = ' '.join(
            word.translate(str.maketrans('', '', punct)) for word in sentence.split()
        )
        for key in replacements:
            sentence = sentence.replace(key, replacements[key])
        return sentence

    result = multipleReplace(text, dict1)
    print(result)
    return result
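# A minimal usage sketch for phonetic() above, with made-up inputs. NLTK's
# tokenizer and stop-word data must be downloaded before the call works.
import nltk

nltk.download('punkt')      # newer NLTK releases may also need 'punkt_tab'
nltk.download('stopwords')

vocabulary = ['knight', 'allowed']               # hypothetical target words
sentence = 'the nite ride was alowd after dark'  # hypothetical noisy input
corrected = phonetic(vocabulary, sentence)       # rewrites n-grams that sound like a target word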
"""
# -- weight
weight = {
    "soundex": 0.2,
    "caverphone": 0.2,
    "metaphone": 0.5,
    "nysiis": 0.1
}

# -- algorithms
algorithms = ["soundex", "caverphone", "metaphone", "nysiis"]

# -- total
total = 0.0
for entry in algorithms:
    code1 = codeList1[entry]
    code2 = codeList2[entry]
    lev = levenshtein(code1, code2)
    currentWeight = weight[entry]
    print("comparing %s with %s for %s (%0.2f: weight %0.2f)"
          % (code1, code2, entry, lev, currentWeight))
    subtotal = lev * currentWeight
    total += subtotal
print("total: %0.2f" % total)
"""

from pyphonetics import Soundex

soundex = Soundex()
print(soundex.phonetics('Lucky'))
print(soundex.phonetics('Lucki'))
print(soundex.sounds_like('Rizvee', 'Rizvi'))
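# A runnable sketch of the weighted-comparison idea from the quoted block
# above, adapted to the algorithms pyphonetics provides (the caverphone and
# nysiis codes referenced there come from other libraries). The weights and
# example words below are illustrative only.
from pyphonetics import Soundex, Metaphone, RefinedSoundex, FuzzySoundex

WEIGHTED_ALGORITHMS = [
    (Soundex(), 0.2),
    (Metaphone(), 0.5),
    (RefinedSoundex(), 0.2),
    (FuzzySoundex(), 0.1),
]


def weighted_phonetic_distance(word1, word2):
    # Per-algorithm Levenshtein distance between the two words' codes,
    # scaled by that algorithm's weight and summed
    total = 0.0
    for algorithm, weight in WEIGHTED_ALGORITHMS:
        total += algorithm.distance(word1, word2, metric='levenshtein') * weight
    return total


print(weighted_phonetic_distance('Lucky', 'Lucki'))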
import re

import pkg_resources
from pyphonetics import Soundex, Metaphone, RefinedSoundex

# normalize_arabic, translate_drug_name, distance_words and insert_drug are
# project-specific helpers assumed to be defined elsewhere in the package.


def insert_drug_database(drug_name):
    drug_id = 0
    if drug_name:
        soundex = Soundex()
        metaphone = Metaphone()
        rs = RefinedSoundex()

        file_path = "utils//DRUGS_ALL_EDITTED.csv"
        file_path = pkg_resources.resource_filename(__name__, file_path)
        with open(file_path, "r") as file:
            section = file.read()
        parts = re.split('[\n]', section)

        min_dist = 100
        new_name = re.sub("چ", "غ", drug_name)
        new_name = re.sub("ﻏ", "غ", new_name)
        new_name = normalize_arabic(new_name)
        name_en = translate_drug_name(drug_name)

        equals = []
        min_index = -1
        min_dist_all = 100
        min_index_all = -1
        chosen = False

        for part in parts:
            # Exact or near-exact textual match wins immediately
            if distance_words(name_en, part) == 0 or distance_words(name_en, part) == 1:
                chosen = True
                print(" Matched To ->", part)
                drug_id = insert_drug(drug_name, normalize_arabic(drug_name), part)
                return drug_id, drug_name

            # Track the overall closest entry as a last-resort fallback
            dist = rs.distance(name_en, part)
            if dist < min_dist_all:
                min_dist_all = dist
                min_index_all = parts.index(part)

            # Collect entries that sound like the Arabic or translated name
            if soundex.sounds_like(new_name, part) or soundex.sounds_like(name_en, part):
                if rs.distance(new_name, part) < min_dist:
                    min_dist = rs.distance(new_name, part)
                    min_index = parts.index(part)
                equals.append((part, metaphone.phonetics(part)))

        if min_index != -1:
            for equ in equals:
                # Prefer a candidate whose Metaphone code matches either name
                if equ[1] == metaphone.phonetics(name_en) or equ[1] == metaphone.phonetics(new_name):
                    drug_id = insert_drug(drug_name, normalize_arabic(drug_name), equ[0])
                    chosen = True
                    return drug_id, drug_name

        if not chosen and min_index != -1:
            chosen = True
            drug_id = insert_drug(drug_name, normalize_arabic(drug_name), parts[min_index])
            return drug_id, drug_name

        if not chosen:
            drug_id = insert_drug(drug_name, normalize_arabic(drug_name), parts[min_index_all])
            return drug_id, drug_name
    else:
        drug_id = insert_drug("----------", "----------", "----------")
        drug_name = "default"
        return drug_id, drug_name

    return drug_id, drug_name