def soundsLike(word):
    """Collect the entries of ``vWords`` that are phonetically close to ``word``.

    An entry qualifies when its ``RefinedSoundex`` distance to ``word`` is at
    most 3.  Matches are returned in ``vWords`` order, each one only once.
    """
    encoder = RefinedSoundex()
    matches = []
    for candidate in vWords:  # whole words are compared, not fragments
        # Per the original author's note, the distance is an integer.
        close_enough = encoder.distance(candidate, word) <= 3
        if close_enough and candidate not in matches:
            matches.append(candidate)
    return matches
def correct_words_according_to(word_tuple, word_classifier, dictionary):
    """Rank dictionary entries that the classifier accepts as corrections.

    ``word_tuple.value`` is compared against every ``dictionary`` entry; each
    entry maps a candidate word to an ``(index, soundex_code)`` pair.  The
    classifier is called with the pair's properties plus prior assumptions
    derived from ``frequency.frequency_of(index)``.

    Returns:
        ``(word_tuple, corrections)`` where ``corrections`` is a list of
        ``(probability, candidate)`` tuples sorted in descending order.
    """
    target = word_tuple.value
    target_code = RefinedSoundex().phonetics(target)

    accepted = []
    for candidate, (index, candidate_code) in dictionary.items():
        features = properties_of(target, target_code, candidate, candidate_code)

        # The frequency rank of the candidate feeds the classifier's priors.
        rank = frequency.frequency_of(index)
        priors = {'correct': 1 - rank, 'incorrect': rank}

        probability, label = word_classifier.classify(features, assumptions=priors)
        if label != 'correct':
            continue
        accepted.append((probability, candidate))

    accepted.sort(reverse=True)
    return word_tuple, accepted
def test_refined_soundex():
    """Single letters encode as expected and the empty string is rejected."""
    encoder = RefinedSoundex()

    expected_codes = (('h', 'H'), ('d', 'D6'))
    for letter, code in expected_codes:
        assert encoder.phonetics(letter) == code

    with pytest.raises(EmptyStringError):
        encoder.phonetics('')
class PhoneticModule:
    """Combine five phonetic encoders into one weighted distance and one key."""

    def __init__(self):
        """Create the encoders and give each of them a weight of 0.2."""
        self.soundex = Soundex()
        self.metaphone = Metaphone()
        self.fuzzySoundex = FuzzySoundex()
        self.lein = Lein()
        self.refinedSoundex = RefinedSoundex()
        self.phoneticSimilarityWeight = {
            'soundex': 0.2,
            'metaphone': 0.2,
            'fuzzySoundex': 0.2,
            'lein': 0.2,
            'refinedSoundex': 0.2,
        }

    def Calculation(self, word1, word2):
        """Return (and print) the weighted sum of the five encoder distances."""
        weights = self.phoneticSimilarityWeight
        # Order matters for float accumulation; keep it fixed.
        scorers = (
            ('soundex', self.SoundexMethod),
            ('metaphone', self.MetaphoneMethod),
            ('fuzzySoundex', self.FuzzySoundexMethod),
            ('lein', self.LeinMethod),
            ('refinedSoundex', self.RefinedSoundexMethod),
        )
        res = 0.0
        for key, scorer in scorers:
            res = res + scorer(word1, word2) * weights[key] * 1.0
        print(res)
        return res

    def PhoneticLayerCreation(self, word):
        """Return the five phonetic codes of ``word`` joined by underscores."""
        encoders = (
            self.soundex,
            self.metaphone,
            self.fuzzySoundex,
            self.lein,
            self.refinedSoundex,
        )
        return '_'.join(encoder.phonetics(word) for encoder in encoders)

    def SoundexMethod(self, word1, word2):
        """Levenshtein-metric distance according to the Soundex encoder."""
        return self.soundex.distance(word1, word2, metric='levenshtein')

    def MetaphoneMethod(self, word1, word2):
        """Levenshtein-metric distance according to the Metaphone encoder."""
        return self.metaphone.distance(word1, word2, metric='levenshtein')

    def FuzzySoundexMethod(self, word1, word2):
        """Levenshtein-metric distance according to the FuzzySoundex encoder."""
        return self.fuzzySoundex.distance(word1, word2, metric='levenshtein')

    def LeinMethod(self, word1, word2):
        """Levenshtein-metric distance according to the Lein encoder."""
        return self.lein.distance(word1, word2, metric='levenshtein')

    def RefinedSoundexMethod(self, word1, word2):
        """Levenshtein-metric distance according to the RefinedSoundex encoder."""
        return self.refinedSoundex.distance(word1, word2, metric='levenshtein')
def get_distances(phrases, speech_words):
    """Tabulate the ``RefinedSoundex`` distance of every word/phrase pair.

    Returns:
        A nested dict ``{word: {phrase: distance}}``.  A word only appears
        when at least one phrase was supplied.
    """
    encoder = RefinedSoundex()
    table = {}
    for phrase in phrases:
        for word in speech_words:
            row = table.setdefault(word, {})
            row[phrase] = encoder.distance(word, phrase)
    return table
def __init__(self):
    """Create the five phonetic encoders and weight each of them at 0.2."""
    self.soundex = Soundex()
    self.metaphone = Metaphone()
    self.fuzzySoundex = FuzzySoundex()
    self.lein = Lein()
    self.refinedSoundex = RefinedSoundex()

    # One weight per encoder, keyed by the encoder's attribute name.
    encoder_keys = ('soundex', 'metaphone', 'fuzzySoundex', 'lein', 'refinedSoundex')
    self.phoneticSimilarityWeight = dict.fromkeys(encoder_keys, 0.2)
def test_soundex_refined():
    """Check ``RefinedSoundex.phonetics`` against a table of known codes."""
    # Each case is (expected code, input word).
    cases = (
        ('T6036084', 'testing'),
        ('T6036084', 'TESTING'),
        ('T60', 'The'),
        ('Q503', 'quick'),
        ('B1908', 'brown'),
        ('F205', 'fox'),
        ('J408106', 'jumped'),
        ('O0209', 'over'),
        ('L7050', 'lazy'),
        ('D6043', 'dogs'),
    )
    encoder = RefinedSoundex()
    for expected_code, word in cases:
        assert encoder.phonetics(word) == expected_code
def get_info(self):
    """Describe this miscue as a dict with its index, texts and a ``type`` code.

    ``type`` is 0 when either text is numeric, when the ``RefinedSoundex``
    distance between base and recognized text is below 1, or when ``Soundex``
    says they sound alike; otherwise it is 2.
    # NOTE(review): the meaning of the codes 0 and 2 is defined elsewhere.
    """
    params = self.__params__
    recognized = "".join(params["recognize"])
    base = params["base"]
    info = {
        "index": params["index"],
        "miscue_base": base,
        "miscue_result": recognized,
    }

    refined = RefinedSoundex()
    plain = Soundex()

    type_code = 0
    if not (str(base).isnumeric() or str(recognized).isnumeric()):
        distance = refined.distance(base, recognized)
        alike = plain.sounds_like(base, recognized)
        if not (distance < 1 or alike):
            type_code = 2

    info['type'] = type_code
    return info
def _build_name_frame(firstname, language):
    """Return a one-row frame with ``firstname`` and its phoneme breakdown.

    Columns: ``Name`` (the input), ``new_column`` (the converter output for
    ``language`` with the 'ˈ' and 'ˌ' marks removed) and ``new_new_column``
    (``name_to_phenome_list`` applied to that output).

    Raises:
        ValueError: if ``language`` is not one of the handled spellings.
    """
    frame = pd.DataFrame([firstname], columns=["Name"])
    if language in ("English", "english"):
        # Imported lazily so the other languages do not need this package.
        import eng_to_ipa as ipa
        frame['new_column'] = frame['Name'].apply(ipa.convert, keep_punct=False)
    elif language in ("Turkish", "turkish"):
        frame['new_column'] = frame['Name'].apply(epi_turkish.transliterate)
    elif language in ("Spanish", "spanish"):
        frame['new_column'] = frame['Name'].apply(epi_spanish.transliterate)
    elif language in ("German", "german"):
        frame['new_column'] = frame['Name'].apply(epi_german.transliterate)
    else:
        raise ValueError("Unsupported language: " + repr(language))

    for stress_mark in ('ˈ', 'ˌ'):
        frame['new_column'] = frame['new_column'].str.replace(stress_mark, '')
    frame['new_new_column'] = frame['new_column'].apply(
        name_to_phenome_list, args=(all_phenomes,))
    return frame


def _show_similar_names(rs, candidates, already_listed, firstname, heading, limit=40):
    """Write ``heading`` and the candidate names not in ``already_listed``.

    Nothing is written when ``candidates`` is empty.  When there are at least
    ``limit`` candidates, the remaining names are ordered by their
    ``rs.distance`` to ``firstname`` and cut to the first ``limit``.
    """
    if not candidates:
        return
    remaining = list(set(candidates) - set(already_listed))
    if len(candidates) >= limit:
        distances = [rs.distance(name, str(firstname)) for name in remaining]
        order = np.argsort(distances)
        remaining = np.array(remaining)[order][:limit].tolist()
    st.write(heading)
    st.markdown(remaining)


def function(firstname, language1, language2, gender):
    """Report whether ``firstname`` is pronounceable in both languages.

    When every phoneme of the name (as produced for ``language1``) is shared
    by both languages, a single confirmation line is written with Streamlit.
    Otherwise the name is assigned to a cluster by the pickled model and
    names from that cluster are suggested, filtered by language and gender.

    Args:
        firstname: the name to analyse.
        language1: language of the name; one of English/Turkish/Spanish/German
            in capitalised or lower-case spelling.
        language2: target language, same accepted spellings.
        gender: compared (title-cased) with the ``Gender`` column of ``df3``.

    Raises:
        ValueError: if ``language1`` is not supported.
        KeyError: if ``language2`` is not a key of the language table.
    """
    lang_dict = {
        'turkish': turkish_cv, 'Turkish': turkish_cv,
        'english': english_cv, 'English': english_cv,
        'german': german_cv, 'German': german_cv,
        'Spanish': spanish_cv, 'spanish': spanish_cv}

    dataf = _build_name_frame(firstname, language1)
    list_of_names = list(dataf['new_new_column'].to_list()[0])

    # Phonemes available in both languages; computed once and reused below.
    sharedlist = set(lang_dict[language1]) & set(lang_dict[language2])

    if all(elem in sharedlist for elem in list_of_names):
        st.write(str(firstname) + " is pronounceable in both languages.")
        return

    # Build the model input: one indicator column per phoneme of the name.
    dataframe = pd.concat([dataf, df4.reindex(dataf.index)], axis=1)
    for i in range(len(dataframe)):
        phenomes = dataframe.iloc[i]['new_new_column']
        for p in phenomes:
            dataframe.at[i, p] = 1
    dataframe = dataframe.set_index('Name')
    dataframe = dataframe.drop(columns=['new_column', 'new_new_column'])

    # NOTE(review): open() does not fetch URLs, so these two paths only work
    # if matching local files exist -- confirm where the artifacts live.
    # pickle.load executes code from the file; only load trusted artifacts.
    with open('https://github.com/jerman7/clbng_webapp/finalized_model.sav', 'rb') as model_file:
        clusterer = pickle.load(model_file)
    with open('https://github.com/jerman7/clbng_webapp/finalized_labels.sav', 'rb') as labels_file:
        cluster_labels = pickle.load(labels_file)

    lang1_title = str(language1).title()
    lang2_title = str(language2).title()
    gender_title = str(gender).title()

    cluster = clusterer.predict(dataframe)
    clusterprint = df3.loc[cluster_labels == cluster]
    clusterprint_L1 = clusterprint[(clusterprint['Language'] == lang1_title)]
    clusterprint_L1_g = clusterprint_L1[(clusterprint_L1['Gender'] == gender_title)]
    clusterprint_L2 = clusterprint[(clusterprint['Language'] == lang2_title)]
    clusterprint_L2_g = clusterprint_L2[(clusterprint_L2['Gender'] == gender_title)]
    rs = RefinedSoundex()

    st.write("'" + str(firstname) + "'" + " is not pronounceable in " + str(language2) + ".")

    # Cluster names of each language whose phonemes all exist in both languages.
    clusterprint_L1_g = clusterprint_L1_g.reset_index()
    NewList1 = [name
                for name, nc_val in zip(clusterprint_L1_g['Name'], clusterprint_L1_g['new_column'])
                if set(nc_val).issubset(sharedlist)]
    clusterprint_L2_g = clusterprint_L2_g.reset_index()
    NewList2 = [name
                for name, nc_val in zip(clusterprint_L2_g['Name'], clusterprint_L2_g['new_column'])
                if set(nc_val).issubset(sharedlist)]

    # Names that occur with the same spelling and transcription in both languages.
    clusterprint_L1_g = clusterprint_L1_g.drop_duplicates(subset=['Name'])
    clusterprint_L2_g = clusterprint_L2_g.drop_duplicates(subset=['Name'])
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    clusterprint_L3_g = pd.concat([clusterprint_L1_g, clusterprint_L2_g])
    duplicate = clusterprint_L3_g[clusterprint_L3_g.duplicated(['Name', 'new_column'])]
    NewList3 = duplicate['Name'].tolist()

    # All names of the target language; title-cased like the filters above so
    # that a lower-case ``language2`` matches the ``Language`` column too.
    goal = df3.reset_index()
    NewFrame5 = goal[goal['Language'] == lang2_title]
    NewFrame5 = NewFrame5.drop_duplicates(subset=['Name'])
    NewList5 = NewFrame5['Name'].tolist()

    name_exists_in_l2 = firstname in NewList5
    nothing_found = (not NewList3 and not NewList1 and not NewList2
                     and not name_exists_in_l2)

    if NewList3:
        st.write("Here are similar names that are spelled and pronounced the same in both languages:")
        st.markdown(NewList3)
    elif not nothing_found:
        st.write("We cannot find similar names that are spelled and pronounced the same in both languages.")

    _show_similar_names(
        rs, NewList1, NewList3, firstname,
        "Here are phonetically similar names in " + lang1_title
        + " that are pronounceable in " + lang2_title + ":")
    _show_similar_names(
        rs, NewList2, NewList3, firstname,
        "Here are phonetically similar names in " + lang2_title
        + " that are pronounceable in " + lang1_title + ":")

    if name_exists_in_l2:
        simname = NewFrame5['new_column'][NewFrame5['Name'] == firstname]
        st.write("While " + "'" + str(firstname) + "'" + " is not pronounceable in " + str(language2)
                 + ", the name '" + str(firstname)
                 + "' exists with the same spelling but a different pronunciation in "
                 + str(language2) + ": " + str(simname.tolist()))

    if nothing_found:
        st.write("Sorry, we cannot find any suggestions.")
def phonetic(data, text, min_votes=3):
    """Replace parts of ``text`` that sound like an entry of ``data``.

    The text is reduced to letters, tokenized and stripped of English stop
    words; every n-gram of the remaining tokens (sizes 1 through the token
    count) is compared with every entry of ``data`` using four phonetic
    encoders.  When at least ``min_votes`` encoders say the two sound alike,
    the n-gram is mapped to that entry.  The replacements are then applied
    to the original ``text`` (with punctuation other than commas removed).

    Args:
        data: iterable of reference words/phrases.
        text: the text to correct.
        min_votes: how many of the four encoders must agree (default 3).

    Returns:
        The corrected text.  The replacement map and the result are also
        printed.
    """
    def clean_text(raw_text):
        """Keep letters, whitespace and colons; collapse whitespace runs."""
        letters_only = re.sub(r'[^a-zA-Z\s:]', '', raw_text)
        return re.sub(r'[\s]+', ' ', letters_only).strip()

    def multipleReplace(sentence, replacements):
        """Strip punctuation (except commas), then apply each replacement."""
        strip_table = str.maketrans('', '', string.punctuation.replace(',', ''))
        sentence = ' '.join(tok.translate(strip_table) for tok in sentence.split())
        for key, value in replacements.items():
            sentence = sentence.replace(key, value)
        return sentence

    stop_words = set(stopwords.words('english'))
    tokens = [tok for tok in word_tokenize(clean_text(text))
              if tok not in stop_words]

    algorithms = [Soundex(), Metaphone(), RefinedSoundex(), FuzzySoundex()]

    # Sizes run through len(tokens) inclusive so that a single-token text
    # still yields a candidate.
    ngrams = []
    for size in range(1, len(tokens) + 1):
        ngrams.extend(" ".join(gram) for gram in nltk.ngrams(tokens, size))
    ngrams = [gram for gram in ngrams if not gram.isdigit()]

    matches = {}
    for gram in ngrams:
        for known in data:
            votes = sum(1 for algo in algorithms if algo.sounds_like(gram, known))
            if votes >= min_votes:
                matches[gram] = known

    # Reverse so the n-grams generated last (the longest) are replaced first.
    matches = dict(reversed(list(matches.items())))
    print(matches)

    result = multipleReplace(text, matches)
    print(result)
    return result
def insert_drug_database(drug_name):
    """Match ``drug_name`` to a line of the bundled drug list and insert it.

    Matching order:
      1. the first line whose ``distance_words`` to the translated name is
         0 or 1;
      2. among lines that Soundex says sound like the name, the first whose
         Metaphone code equals that of the translated or normalised name;
      3. the sound-alike line with the smallest RefinedSoundex distance to
         the normalised name;
      4. the line with the smallest RefinedSoundex distance to the
         translated name.

    Args:
        drug_name: the raw drug name; a falsy value inserts a placeholder.

    Returns:
        ``(drug_id, drug_name)``; for a falsy input the name is ``"default"``.
    """
    if not drug_name:
        drug_id = insert_drug("----------", "----------", "----------")
        return drug_id, "default"

    soundex = Soundex()
    metaphone = Metaphone()
    rs = RefinedSoundex()

    file_path = pkg_resources.resource_filename(
        __name__, "utils//DRUGS_ALL_EDITTED.csv")
    with open(file_path, "r") as drug_file:
        parts = drug_file.read().split('\n')

    new_name = re.sub("چ", "غ", drug_name)
    new_name = re.sub("ﻏ", "غ", new_name)
    new_name = normalize_arabic(new_name)
    name_en = translate_drug_name(drug_name)

    min_dist = 100       # best distance among sound-alike lines
    min_index = -1
    min_dist_all = 100   # best distance among all lines
    min_index_all = -1
    equals = []          # (line, metaphone code) of every sound-alike line

    for index, part in enumerate(parts):
        if distance_words(name_en, part) in (0, 1):
            print(" Matched To ->", part)
            drug_id = insert_drug(drug_name, normalize_arabic(drug_name), part)
            return drug_id, drug_name

        dist = rs.distance(name_en, part)
        if dist < min_dist_all:
            min_dist_all = dist
            min_index_all = index

        if soundex.sounds_like(new_name, part) or soundex.sounds_like(name_en, part):
            sound_dist = rs.distance(new_name, part)
            if sound_dist < min_dist:
                min_dist = sound_dist
                min_index = index
            equals.append((part, metaphone.phonetics(part)))

    if min_index != -1:
        # Compare codes with codes: the original compared the whole
        # (line, code) tuple with the second code, which could never match.
        wanted_codes = (metaphone.phonetics(name_en), metaphone.phonetics(new_name))
        for candidate, code in equals:
            if code in wanted_codes:
                drug_id = insert_drug(drug_name, normalize_arabic(drug_name), candidate)
                return drug_id, drug_name
        drug_id = insert_drug(drug_name, normalize_arabic(drug_name), parts[min_index])
        return drug_id, drug_name

    # NOTE(review): if no line has a distance below 100, min_index_all is
    # still -1 and the last line of the file is used -- confirm intent.
    drug_id = insert_drug(drug_name, normalize_arabic(drug_name), parts[min_index_all])
    return drug_id, drug_name