Example #1
def soundsLike(word):
    # Return every vocabulary word whose Refined Soundex distance to `word` is at most 3.
    similar = []
    rs = RefinedSoundex()
    for v in vWords:  # vWords: vocabulary word list defined elsewhere in the module
        if rs.distance(v, word) <= 3 and v not in similar:  # rs.distance returns an integer edit distance
            similar.append(v)
    return similar
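A minimal way to exercise soundsLike, assuming pyphonetics is installed and that vWords is the module-level vocabulary list the loop reads (the sample words below are made up):

from pyphonetics import RefinedSoundex

vWords = ['night', 'knight', 'nite', 'daylight']  # hypothetical vocabulary list
print(soundsLike('night'))  # vocabulary words within a Refined Soundex distance of 3 of 'night'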
Example #2
class PhoneticModule:
    def __init__(self):
        self.soundex = Soundex()
        self.metaphone = Metaphone()
        self.fuzzySoundex = FuzzySoundex()
        self.lein = Lein()
        self.refinedSoundex = RefinedSoundex()

        # Equal weighting across the five phonetic algorithms.
        self.phoneticSimilarityWeight = {
            'soundex': 0.2,
            'metaphone': 0.2,
            'fuzzySoundex': 0.2,
            'lein': 0.2,
            'refinedSoundex': 0.2,
        }

    def Calculation(self, word1, word2):
        # Weighted sum of the five phonetic distances between the two words.
        res = self.SoundexMethod(word1, word2) * self.phoneticSimilarityWeight['soundex']
        res += self.MetaphoneMethod(word1, word2) * self.phoneticSimilarityWeight['metaphone']
        res += self.FuzzySoundexMethod(word1, word2) * self.phoneticSimilarityWeight['fuzzySoundex']
        res += self.LeinMethod(word1, word2) * self.phoneticSimilarityWeight['lein']
        res += self.RefinedSoundexMethod(word1, word2) * self.phoneticSimilarityWeight['refinedSoundex']
        print(res)
        return res

    def PhoneticLayerCreation(self, word):
        # Concatenate the five phonetic encodings of the word, separated by '_'.
        return '_'.join([
            self.soundex.phonetics(word),
            self.metaphone.phonetics(word),
            self.fuzzySoundex.phonetics(word),
            self.lein.phonetics(word),
            self.refinedSoundex.phonetics(word),
        ])

    def SoundexMethod(self, word1, word2):
        return self.soundex.distance(word1, word2, metric='levenshtein')

    def MetaphoneMethod(self, word1, word2):
        return self.metaphone.distance(word1, word2, metric='levenshtein')

    def FuzzySoundexMethod(self, word1, word2):
        return self.fuzzySoundex.distance(word1, word2, metric='levenshtein')

    def LeinMethod(self, word1, word2):
        return self.lein.distance(word1, word2, metric='levenshtein')

    def RefinedSoundexMethod(self, word1, word2):
        return self.refinedSoundex.distance(word1, word2, metric='levenshtein')
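A short usage sketch for the class above; it assumes only the pyphonetics classes the constructor needs, and the two sample words are arbitrary:

from pyphonetics import Soundex, Metaphone, FuzzySoundex, Lein, RefinedSoundex

pm = PhoneticModule()
print(pm.PhoneticLayerCreation('night'))   # the five phonetic encodings joined with '_'
print(pm.Calculation('night', 'knight'))   # weighted sum of the five phonetic distances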
Example #3
def get_distances(phrases, speech_words):
    # Map each recognized speech word to its Refined Soundex distance from every target phrase.
    rs = RefinedSoundex()

    distances = {}
    for phrase in phrases:
        for word in speech_words:
            if word not in distances:
                distances[word] = {}
            distances[word][phrase] = rs.distance(word, phrase)

    return distances
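A quick sketch of how get_distances might be called; the phrase and word lists are made up (single-word targets keep the example simple), and only the RefinedSoundex import is required:

from pyphonetics import RefinedSoundex

phrases = ['left', 'right']
speech_words = ['lift', 'write']
print(get_distances(phrases, speech_words))  # {word: {phrase: distance, ...}, ...}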
Example #4
    def get_info(self):
        # Method of a class whose __params__ dict holds the word index, the reference
        # ("base") word, and the list of recognized tokens.
        recognize = "".join(self.__params__["recognize"])
        base = self.__params__["base"]

        attributes = {
            "index": self.__params__["index"],
            "miscue_base": base,
            "miscue_result": recognize
        }

        rs = RefinedSoundex()
        soundex = Soundex()

        # Numeric tokens are never flagged as miscues.
        if str(base).isnumeric() or str(recognize).isnumeric():
            attributes['type'] = 0
            return attributes

        distance = rs.distance(base, recognize)
        sounds = soundex.sounds_like(base, recognize)

        # Identical Refined Soundex codes or matching Soundex codes: not a miscue (type 0);
        # otherwise mark the word as a miscue (type 2).
        if distance < 1 or sounds:
            attributes['type'] = 0
        else:
            attributes['type'] = 2
        return attributes
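The type decision above rests on two pyphonetics calls; a standalone check of how they behave, with an arbitrary word pair:

from pyphonetics import RefinedSoundex, Soundex

rs = RefinedSoundex()
soundex = Soundex()
print(rs.distance('Robert', 'Rupert'))          # integer distance between the Refined Soundex codes
print(soundex.sounds_like('Robert', 'Rupert'))  # True: both names map to the same Soundex code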
Example #5
def function(firstname, language1, language2, gender):

    lang_dict = {
            'turkish': turkish_cv,
            'Turkish': turkish_cv,
            'english': english_cv,
            'English': english_cv,
            'german': german_cv,
            'German': german_cv,
            'Spanish': spanish_cv,
            'spanish': spanish_cv}
    # Convert the first name to an IPA-based phoneme list for language1.
    dataf = pd.DataFrame([firstname], columns=["Name"])
    lang1 = str(language1).lower()
    if lang1 == "english":
        import eng_to_ipa as ipa
        dataf['new_column'] = dataf['Name'].apply(ipa.convert, keep_punct=False)
    elif lang1 == "turkish":
        dataf['new_column'] = dataf['Name'].apply(epi_turkish.transliterate)
    elif lang1 == "spanish":
        dataf['new_column'] = dataf['Name'].apply(epi_spanish.transliterate)
    elif lang1 == "german":
        dataf['new_column'] = dataf['Name'].apply(epi_german.transliterate)
    # Strip IPA stress marks, then map the transcription to a list of phonemes.
    dataf['new_column'] = dataf['new_column'].str.replace('ˈ', '').str.replace('ˌ', '')
    dataf['new_new_column'] = dataf['new_column'].apply(name_to_phenome_list, args=(all_phenomes,))
    list_of_names = list(dataf['new_new_column'].to_list()[0])
    if all(elem in list(set(lang_dict[language1]) & set(lang_dict[language2]))  for elem in list_of_names):
        st.write(str(firstname) + " is pronounceable in both languages.")
    else: 
        dataframe = pd.concat([dataf, df4.reindex(dataf.index)], axis=1)
        # One-hot encode the phonemes: for each row, set the matching phoneme columns to 1.
        for i in range(len(dataframe)):
            phenomes = dataframe.iloc[i]['new_new_column']  # iterable list of phonemes for this name
            for p in phenomes:
                dataframe.at[i, p] = 1
        dataframe = dataframe.set_index('Name')
        dataframe = dataframe.drop(columns=['new_column', 'new_new_column'])
        # NOTE: open() needs a local file path; the pickled clusterer and labels must be
        # available locally (e.g. downloaded from the clbng_webapp repository) for this to work.
        clusterer = pickle.load(open('https://github.com/jerman7/clbng_webapp/finalized_model.sav', 'rb'))
        cluster_labels = pickle.load(open('https://github.com/jerman7/clbng_webapp/finalized_labels.sav', 'rb'))
        cluster = clusterer.predict(dataframe)
        clusterprint = df3.loc[cluster_labels==cluster]
        clusterprint_L1 = clusterprint[(clusterprint['Language']==str(language1).title())]
        clusterprint_L1_g = clusterprint_L1[(clusterprint_L1['Gender']==str(gender).title())]
        clusterprint_L2 = clusterprint[(clusterprint['Language']==str(language2).title())]
        clusterprint_L2_g = clusterprint_L2[(clusterprint_L2['Gender']==str(gender).title())]
        rs = RefinedSoundex()
        sharedlist = list(set(lang_dict[language1]) & set(lang_dict[language2]))
        ## This name is not pronounceable in both languages. 
        st.write("'" + str(firstname) + "'" + " is not pronounceable in " + str(language2) + ".")  
        
        ## 1 Here are similar names that are spelled and pronounced the same in both languages
        clusterprint_L1_g = clusterprint_L1_g.reset_index()
        NewList1 = [clusterprint_L1_g['Name'].iloc[i] for i,nc_val in enumerate(clusterprint_L1_g['new_column']) if set(nc_val).issubset(sharedlist)]
        clusterprint_L2_g = clusterprint_L2_g.reset_index()
        NewList2 = [clusterprint_L2_g['Name'].iloc[i] for i,nc_val in enumerate(clusterprint_L2_g['new_column']) if set(nc_val).issubset(sharedlist)]
        clusterprint_L1_g= clusterprint_L1_g.drop_duplicates(subset=['Name'])
        clusterprint_L2_g= clusterprint_L2_g.drop_duplicates(subset=['Name'])
        clusterprint_L3_g = pd.concat([clusterprint_L1_g, clusterprint_L2_g])  # DataFrame.append was removed in pandas 2.0
        
        duplicate = clusterprint_L3_g[clusterprint_L3_g.duplicated(['Name', 'new_column'])] 
        NewList3 = duplicate['Name'].tolist()
        goal = df3.reset_index()        
        NewFrame5 = goal[goal['Language'] == language2]
        NewFrame5 = NewFrame5.drop_duplicates(subset=['Name'])

        NewList5 = NewFrame5['Name'].tolist()
    
        if len(NewList3) > 0:
            st.write("Here are similar names that are spelled and pronounced the same in both languages:")
            st.markdown(NewList3)

        if len(NewList3) == 0 and not (len(NewList1) == 0 and len(NewList2) == 0 and firstname not in NewList5):
            st.write("We cannot find similar names that are spelled and pronounced the same in both languages.")

        ## 2 Here are similar names in language1 that are pronounceable in language2
        if len(NewList1) > 0 and len(NewList1) < 40:
            not_in_List3 = list(set(NewList1) - set(NewList3))
            st.write("Here are phonetically similar names in " + str(language1).title() + " that are pronounceable in " + str(language2).title() + ":")
            st.markdown(not_in_List3)

        if len(NewList1) >= 40:
            # Too many candidates: rank them by Refined Soundex distance to the input name and keep the closest 40.
            not_in_List3 = list(set(NewList1) - set(NewList3))
            distances = [rs.distance(namez, str(firstname)) for namez in not_in_List3]
            indexes = np.argsort(distances)
            least_names = np.array(not_in_List3)[indexes]
            st.write("Here are phonetically similar names in " + str(language1).title() + " that are pronounceable in " + str(language2).title() + ":")
            st.markdown(least_names[:40].tolist())

        if len(NewList2) > 0 and len(NewList2) < 40:
            not_in_List3 = list(set(NewList2) - set(NewList3))
            st.write("Here are phonetically similar names in " + str(language2).title() + " that are pronounceable in " + str(language1).title() + ":")
            st.markdown(not_in_List3)

        if len(NewList2) >= 40:
            not_in_List3 = list(set(NewList2) - set(NewList3))
            distances = [rs.distance(namez, str(firstname)) for namez in not_in_List3]
            indexes = np.argsort(distances)
            least_names = np.array(not_in_List3)[indexes]
            st.write("Here are phonetically similar names in " + str(language2).title() + " that are pronounceable in " + str(language1).title() + ":")
            st.markdown(least_names[:40].tolist())

        if firstname in NewList5:
            simname = NewFrame5['new_column'][NewFrame5['Name'] == firstname]
            st.write("While " + "'" + str(firstname) + "'" + " is not pronounceable in " + str(language2) + ", the name '" + str(firstname) + "' exists with the same spelling but a different pronunciation in " + str(language2) + ": " + str(simname.tolist()))

        if len(NewList3) == 0 and len(NewList1) == 0 and len(NewList2) == 0 and firstname not in NewList5:
            st.write("Sorry, we cannot find any suggestions.")
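A hypothetical invocation of the function above; it presupposes the module-level objects the code references (the *_cv phoneme inventories in lang_dict, the epi_* Epitran transliterators, name_to_phenome_list, all_phenomes, df3/df4, the pickled clusterer files, and Streamlit's st):

function('Maria', 'English', 'Spanish', 'female')  # writes pronounceability results and name suggestions to the Streamlit page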
Example #6
def insert_drug_database(drug_name):

    drug_id = 0
    if drug_name:
        soundex = Soundex()
        metaphone = Metaphone()
        rs = RefinedSoundex()
        file_path = "utils//DRUGS_ALL_EDITTED.csv"
        file_path = pkg_resources.resource_filename(__name__, file_path)
        with open(file_path, "r") as file:
            section = file.read()
        parts = re.split('[\n]', section)
        min_dist = 100
        # Normalize letter variants in the Arabic drug name before matching.
        new_name = re.sub("چ", "غ", drug_name)
        new_name = re.sub("ﻏ", "غ", new_name)
        new_name = normalize_arabic(new_name)
        name_en = translate_drug_name(drug_name)
        equals = []
        min_index = -1
        min_dist_all = 100
        min_index_all = -1
        chosen = False

        for part in parts:

            # An exact or one-edit English match wins immediately.
            dist_en = distance_words(name_en, part)
            if dist_en == 0 or dist_en == 1:
                chosen = True
                print(" Matched To ->", part)
                drug_id = insert_drug(drug_name, normalize_arabic(drug_name), part)
                return drug_id, drug_name

            dist = rs.distance(name_en, part)
            if dist <min_dist_all:
                min_dist_all = dist
                min_index_all = parts.index(part)

            if soundex.sounds_like(new_name, part) or soundex.sounds_like(name_en, part):

                if rs.distance(new_name, part) < min_dist:
                    min_dist = rs.distance(new_name, part)
                    min_index = parts.index(part)
                equals.append((part,metaphone.phonetics(part)))

        if min_index != -1:
            for equ in equals:
                if equ[1] == metaphone.phonetics(name_en) or equ[1] == metaphone.phonetics(new_name):
                    drug_id = insert_drug(drug_name, normalize_arabic(drug_name), equ[0])
                    chosen = True
                    return drug_id, drug_name

        if not chosen and min_index != -1:
            chosen = True
            drug_id = insert_drug(drug_name, normalize_arabic(drug_name), parts[min_index])
            return drug_id, drug_name

        if not chosen:
            drug_id = insert_drug(drug_name, normalize_arabic(drug_name), parts[min_index_all])
            return drug_id, drug_name

    else:
        drug_id = insert_drug("----------", "----------", "----------")
        drug_name = "default"
        return drug_id, drug_name

    return drug_id, drug_name
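A hypothetical call, assuming the package ships the DRUGS_ALL_EDITTED.csv resource and that the helpers insert_drug, translate_drug_name, normalize_arabic, and distance_words are defined elsewhere in the module:

drug_id, name = insert_drug_database('اسبرين')  # an arbitrary Arabic drug name (here 'aspirin')
print(drug_id, name)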