Example #1
0
def soundsLike(word):
    """Return the entries of ``vWords`` that sound similar to *word*.

    An entry qualifies when its ``RefinedSoundex`` distance to *word* is at
    most 3.  Each qualifying entry appears once, in the order it is first
    encountered in ``vWords``.
    """
    encoder = RefinedSoundex()
    matches = []
    for candidate in vWords:
        # The distance is compared against an integer threshold.
        close_enough = encoder.distance(candidate, word) <= 3
        if close_enough and candidate not in matches:
            matches.append(candidate)
    return matches
Example #2
0
def correct_words_according_to(word_tuple, word_classifier, dictionary):
    ''' Classify every dictionary entry against the given word and return the
    entries the classifier labels 'correct'.

    Returns a pair: the unchanged ``word_tuple`` and a list of
    ``(probability, candidate)`` tuples sorted in descending order. '''
    word = word_tuple.value
    word_soundex = RefinedSoundex().phonetics(word)

    corrections = []
    # Each dictionary value is a pair: (frequency index, precomputed soundex).
    for candidate, (index, candidate_soundex) in dictionary.items():
        features = properties_of(word, word_soundex,
                                 candidate, candidate_soundex)
        # The frequency lookup supplies the classifier's prior assumptions:
        # 'incorrect' gets the rank itself and 'correct' its complement.
        word_rank = frequency.frequency_of(index)
        priors = {'correct': 1 - word_rank, 'incorrect': word_rank}
        probability, cls = word_classifier.classify(features,
                                                    assumptions=priors)
        if cls == 'correct':
            corrections.append((probability, candidate))

    corrections.sort(reverse=True)
    return word_tuple, corrections
Example #3
0
def test_refined_soundex():
    """Check RefinedSoundex codes for single letters and the empty string."""
    encoder = RefinedSoundex()

    expected_codes = (('h', 'H'), ('d', 'D6'))
    for word, code in expected_codes:
        assert encoder.phonetics(word) == code

    # An empty input must be rejected rather than encoded.
    with pytest.raises(EmptyStringError):
        encoder.phonetics('')
class PhoneticModule:
    """Combine five phonetic encoders into a weighted distance and a key.

    The encoders are Soundex, Metaphone, FuzzySoundex, Lein and
    RefinedSoundex; each one contributes with a weight of 0.2.
    """

    def __init__(self):
        """Create one instance of each encoder and the weight table."""
        self.soundex = Soundex()
        self.metaphone = Metaphone()
        self.fuzzySoundex = FuzzySoundex()
        self.lein = Lein()
        self.refinedSoundex = RefinedSoundex()

        self.phoneticSimilarityWeight = {
            'soundex': 0.2,
            'metaphone': 0.2,
            'fuzzySoundex': 0.2,
            'lein': 0.2,
            'refinedSoundex': 0.2,
        }

    def Calculation(self, word1, word2):
        """Return the weighted sum of the five encoder distances.

        The result is also printed before it is returned.
        """
        scorers = (
            ('soundex', self.SoundexMethod),
            ('metaphone', self.MetaphoneMethod),
            ('fuzzySoundex', self.FuzzySoundexMethod),
            ('lein', self.LeinMethod),
            ('refinedSoundex', self.RefinedSoundexMethod),
        )
        terms = [
            method(word1, word2) * self.phoneticSimilarityWeight[key] * 1.0
            for key, method in scorers
        ]
        # Accumulate strictly left to right so the floating-point result is
        # the same as adding the terms one after another.
        res = terms[0]
        for term in terms[1:]:
            res = res + term
        print(res)
        return res

    def PhoneticLayerCreation(self, word):
        """Return the five phonetic codes of *word* joined by underscores."""
        encoders = (
            self.soundex,
            self.metaphone,
            self.fuzzySoundex,
            self.lein,
            self.refinedSoundex,
        )
        return '_'.join([encoder.phonetics(word) for encoder in encoders])

    def SoundexMethod(self, word1, word2):
        """Return the Soundex distance using the 'levenshtein' metric."""
        return self.soundex.distance(word1, word2, metric='levenshtein')

    def MetaphoneMethod(self, word1, word2):
        """Return the Metaphone distance using the 'levenshtein' metric."""
        return self.metaphone.distance(word1, word2, metric='levenshtein')

    def FuzzySoundexMethod(self, word1, word2):
        """Return the FuzzySoundex distance using the 'levenshtein' metric."""
        return self.fuzzySoundex.distance(word1, word2, metric='levenshtein')

    def LeinMethod(self, word1, word2):
        """Return the Lein distance using the 'levenshtein' metric."""
        return self.lein.distance(word1, word2, metric='levenshtein')

    def RefinedSoundexMethod(self, word1, word2):
        """Return the RefinedSoundex distance using the 'levenshtein' metric."""
        return self.refinedSoundex.distance(word1, word2, metric='levenshtein')
Example #5
0
def get_distances(phrases, speech_words):
    """Return the RefinedSoundex distance between every word and phrase.

    The result is a nested mapping ``{word: {phrase: distance}}`` covering
    each pairing of an entry of *speech_words* with an entry of *phrases*.
    """
    encoder = RefinedSoundex()

    table = {}
    for phrase in phrases:
        for word in speech_words:
            # Create the per-word mapping on first sight of the word.
            table.setdefault(word, {})[phrase] = encoder.distance(word, phrase)

    return table
    def __init__(self):
        """Create the five phonetic encoders and weight each one at 0.2."""
        self.soundex = Soundex()
        self.metaphone = Metaphone()
        self.fuzzySoundex = FuzzySoundex()
        self.lein = Lein()
        self.refinedSoundex = RefinedSoundex()

        # One weight per encoder, keyed by the encoder's attribute name.
        self.phoneticSimilarityWeight = {
            'soundex': 0.2,
            'metaphone': 0.2,
            'fuzzySoundex': 0.2,
            'lein': 0.2,
            'refinedSoundex': 0.2,
        }
Example #7
0
def test_soundex_refined():
    """Check RefinedSoundex codes for a set of sample words."""
    # Each case is (expected code, input word).
    cases = (
        ('T6036084', 'testing'),
        ('T6036084', 'TESTING'),
        ('T60', 'The'),
        ('Q503', 'quick'),
        ('B1908', 'brown'),
        ('F205', 'fox'),
        ('J408106', 'jumped'),
        ('O0209', 'over'),
        ('L7050', 'lazy'),
        ('D6043', 'dogs'),
    )

    encoder = RefinedSoundex()
    for expected, word in cases:
        assert encoder.phonetics(word) == expected
    def get_info(self):
        """Return the attribute dict describing this miscue.

        The dict carries ``index``, ``miscue_base`` and ``miscue_result``
        (the ``recognize`` pieces joined into one string) plus a ``type``:
        0 when either value is numeric, or when the RefinedSoundex distance
        between them is below 1 or Soundex reports that they sound alike;
        otherwise 2.
        """
        params = self.__params__
        recognize = "".join(params["recognize"])
        base = params["base"]

        attributes = {
            "index": params["index"],
            "miscue_base": base,
            "miscue_result": recognize,
        }

        rs = RefinedSoundex()
        soundex = Soundex()

        # Numeric values are not compared phonetically.
        if str(base).isnumeric() or str(recognize).isnumeric():
            attributes['type'] = 0
            return attributes

        distance = rs.distance(base, recognize)
        sounds = soundex.sounds_like(base, recognize)

        attributes['type'] = 0 if (distance < 1 or sounds) else 2
        return attributes
Example #9
0
def _phonemize_name(firstname, language_key):
    """Transcribe *firstname* and split the transcription into phonemes.

    Args:
        firstname: The name to transcribe.
        language_key: Lower-cased language name; one of ``'english'``,
            ``'turkish'``, ``'spanish'`` or ``'german'``.

    Returns:
        A ``(dataf, phonemes)`` pair.  ``dataf`` is a one-row frame with the
        columns ``Name``, ``new_column`` (the transcription with the ``ˈ``
        and ``ˌ`` marks removed) and ``new_new_column`` (the result of
        ``name_to_phenome_list``).  ``phonemes`` is the first row's
        ``new_new_column`` value as a list.

    Raises:
        ValueError: If *language_key* is not a supported language.
    """
    dataf = pd.DataFrame([firstname], columns=["Name"])
    names = dataf['Name']
    if language_key == 'english':
        # Imported lazily so the package is only required for English input.
        import eng_to_ipa as ipa
        dataf['new_column'] = names.apply(ipa.convert, keep_punct=False)
    elif language_key == 'turkish':
        dataf['new_column'] = names.apply(epi_turkish.transliterate)
    elif language_key == 'spanish':
        dataf['new_column'] = names.apply(epi_spanish.transliterate)
    elif language_key == 'german':
        dataf['new_column'] = names.apply(epi_german.transliterate)
    else:
        raise ValueError("Unsupported language: " + repr(language_key))

    for stress_mark in ('ˈ', 'ˌ'):
        dataf['new_column'] = dataf['new_column'].str.replace(
            stress_mark, '', regex=False)
    dataf['new_new_column'] = dataf['new_column'].apply(
        name_to_phenome_list, args=(all_phenomes,))
    return dataf, list(dataf['new_new_column'].to_list()[0])


def _write_similar_names(candidates, already_listed, firstname,
                         source_language, target_language, encoder, limit=40):
    """Write the suggestion section for one language, if there is anything.

    Names already shown in *already_listed* are left out.  When *candidates*
    holds *limit* or more names, the remaining names are ordered by their
    phonetic distance to *firstname* and only the first *limit* are shown.
    """
    if not candidates:
        return
    remaining = list(set(candidates) - set(already_listed))
    if len(candidates) >= limit:
        distances = [encoder.distance(name, str(firstname))
                     for name in remaining]
        order = np.argsort(distances)
        remaining = np.array(remaining)[order][:limit].tolist()
    st.write("Here are phonetically similar names in " + source_language
             + " that are pronounceable in " + target_language + ":")
    st.markdown(remaining)


def function(firstname, language1, language2, gender):
    """Report whether *firstname* is pronounceable in both languages.

    The name is transcribed with the rules of *language1* and split into
    phonemes.  When every phoneme belongs to both languages' inventories a
    confirmation is written.  Otherwise the name is assigned to a cluster by
    the pickled model and names from that cluster are suggested.

    All output goes through the module-level ``st`` object (presumably
    Streamlit -- confirm); nothing is returned.  The function also relies on
    the module-level ``df3``, ``df4``, ``all_phenomes``, the ``*_cv``
    inventories and the ``epi_*`` transliterators.

    Args:
        firstname: The name to check.
        language1: Language the name comes from ('English', 'Turkish',
            'Spanish' or 'German'; letter case is ignored).
        language2: Language the name should be pronounceable in (same set).
        gender: Gender used to filter suggestions; it is title-cased before
            being compared with the ``Gender`` column.

    Raises:
        ValueError: If *language1* is not supported.
        KeyError: If *language2* is not supported.
    """
    lang_dict = {
        'turkish': turkish_cv,
        'english': english_cv,
        'german': german_cv,
        'spanish': spanish_cv,
    }
    key1 = str(language1).lower()
    key2 = str(language2).lower()
    title1 = str(language1).title()
    title2 = str(language2).title()
    gender_title = str(gender).title()

    dataf, list_of_names = _phonemize_name(firstname, key1)

    # Phonemes that exist in both languages; built once and reused below.
    sharedlist = set(lang_dict[key1]) & set(lang_dict[key2])
    if all(elem in sharedlist for elem in list_of_names):
        st.write(str(firstname) + " is pronounceable in both languages.")
        return

    # Build the model input: one column per phoneme, set to 1 when present.
    dataframe = pd.concat([dataf, df4.reindex(dataf.index)], axis=1)
    for i in range(len(dataframe)):
        for p in dataframe.iloc[i]['new_new_column']:
            dataframe.at[i, p] = 1
    dataframe = dataframe.set_index('Name')
    dataframe = dataframe.drop(columns=['new_column', 'new_new_column'])

    # NOTE(review): builtin open() cannot fetch URLs, so these two paths will
    # fail unless files with exactly these names exist locally.  The intended
    # local path (or a download step) needs to be confirmed.
    # SECURITY: unpickling runs arbitrary code; only load trusted files.
    with open('https://github.com/jerman7/clbng_webapp/finalized_model.sav', 'rb') as model_file:
        clusterer = pickle.load(model_file)
    with open('https://github.com/jerman7/clbng_webapp/finalized_labels.sav', 'rb') as labels_file:
        cluster_labels = pickle.load(labels_file)

    cluster = clusterer.predict(dataframe)
    clusterprint = df3.loc[cluster_labels == cluster]
    clusterprint_L1 = clusterprint[clusterprint['Language'] == title1]
    clusterprint_L1_g = clusterprint_L1[clusterprint_L1['Gender'] == gender_title]
    clusterprint_L2 = clusterprint[clusterprint['Language'] == title2]
    clusterprint_L2_g = clusterprint_L2[clusterprint_L2['Gender'] == gender_title]
    rs = RefinedSoundex()

    st.write(f"'{firstname}' is not pronounceable in {language2}.")

    # Names of each language whose phonemes all lie in the shared inventory.
    clusterprint_L1_g = clusterprint_L1_g.reset_index()
    NewList1 = [
        name
        for name, nc_val in zip(clusterprint_L1_g['Name'],
                                clusterprint_L1_g['new_column'])
        if set(nc_val).issubset(sharedlist)
    ]
    clusterprint_L2_g = clusterprint_L2_g.reset_index()
    NewList2 = [
        name
        for name, nc_val in zip(clusterprint_L2_g['Name'],
                                clusterprint_L2_g['new_column'])
        if set(nc_val).issubset(sharedlist)
    ]

    # A (Name, new_column) pair present in both languages is a name that is
    # spelled and pronounced the same in both.  DataFrame.append no longer
    # exists in pandas 2.x, hence pd.concat.
    clusterprint_L3_g = pd.concat([
        clusterprint_L1_g.drop_duplicates(subset=['Name']),
        clusterprint_L2_g.drop_duplicates(subset=['Name']),
    ])
    duplicate = clusterprint_L3_g[
        clusterprint_L3_g.duplicated(['Name', 'new_column'])]
    NewList3 = duplicate['Name'].tolist()

    # Every other filter on 'Language' uses the title-cased name, so the same
    # form is used here; the raw input would miss lower-case spellings.
    goal = df3.reset_index()
    NewFrame5 = goal[goal['Language'] == title2]
    NewFrame5 = NewFrame5.drop_duplicates(subset=['Name'])
    NewList5 = NewFrame5['Name'].tolist()

    nothing_found = (not NewList3 and not NewList1 and not NewList2
                     and firstname not in NewList5)

    if NewList3:
        st.write("Here are similar names that are spelled and pronounced the same in both languages:")
        st.markdown(NewList3)
    elif not nothing_found:
        st.write("We cannot find similar names that are spelled and pronounced the same in both languages.")

    _write_similar_names(NewList1, NewList3, firstname, title1, title2, rs)
    _write_similar_names(NewList2, NewList3, firstname, title2, title1, rs)

    if firstname in NewList5:
        simname = NewFrame5['new_column'][NewFrame5['Name'] == firstname]
        st.write(
            f"While '{firstname}' is not pronounceable in {language2}, "
            f"the name '{firstname}' exists with the same spelling but a "
            f"different pronunciation in {language2}: {simname.tolist()}"
        )

    if nothing_found:
        st.write("Sorry, we cannot find any suggestions.")
Example #10
0
def phonetic(data, text):
    """Replace parts of *text* that sound like an entry of *data*.

    The text is stripped of everything except ASCII letters, whitespace and
    colons, tokenized, and cleared of English stop words.  Every n-gram of
    the remaining tokens is compared with every entry of *data* using four
    phonetic algorithms (Soundex, Metaphone, RefinedSoundex, FuzzySoundex).
    When at least three of them agree that the two sound alike, the n-gram
    is mapped to that entry.  The mapping is then applied to *text* with
    plain substring replacement.

    The mapping and the final text are also printed.

    Args:
        data: Iterable of reference words or phrases.
        text: The text to correct.

    Returns:
        The text with punctuation (except commas) removed and the matched
        n-grams replaced.
    """

    def clean_text(raw_text):
        """Keep only letters, whitespace and colons; collapse whitespace."""
        # Raw strings avoid the invalid "\s" escape in a normal literal.
        filtered_text = re.sub(r'[^a-zA-Z\s:]', '', raw_text)
        filtered_text = re.sub(r'[\s]+', ' ', filtered_text)
        return filtered_text.strip()

    def multipleReplace(sentence, replacements):
        """
        take a text and replace words that match the key in a dictionary
        with the associated value, return the changed text
        """
        # Strip all punctuation except commas from every word first.
        punct = "".join(ch for ch in string.punctuation if ch != ",")
        strip_table = str.maketrans('', '', punct)
        sentence = ' '.join(
            piece.translate(strip_table) for piece in sentence.split())
        for key in replacements:
            sentence = sentence.replace(key, replacements[key])
        return sentence

    stop_words = set(stopwords.words('english'))
    token = [word for word in word_tokenize(clean_text(text))
             if word not in stop_words]

    algorithms = [Soundex(), Metaphone(), RefinedSoundex(), FuzzySoundex()]

    # All n-grams from single tokens up to the whole token sequence.  The
    # upper bound is inclusive so that a one-token text still yields its
    # single 1-gram.
    ngrams = [
        " ".join(gram)
        for size in range(1, len(token) + 1)
        for gram in nltk.ngrams(token, size)
    ]
    ngrams = [item for item in ngrams if not item.isdigit()]

    dict1 = {}
    for gram in ngrams:
        for word in data:
            votes = sum(
                1 for algorithm in algorithms
                if algorithm.sounds_like(gram, word)
            )
            if votes >= 3:
                dict1[gram] = word

    # Reverse the insertion order so the n-grams found last are applied first.
    dict1 = dict(reversed(list(dict1.items())))
    print(dict1)

    result = multipleReplace(text, dict1)
    print(result)

    return result
Example #11
0
def insert_drug_database(drug_name):
    """Match *drug_name* to a line of the drug list and insert it.

    The lines of ``utils//DRUGS_ALL_EDITTED.csv`` are tried in this order:

    1. The first line whose ``distance_words`` to the translated name is 0
       or 1.
    2. Among lines that Soundex says sound like the name, the first one whose
       Metaphone code equals that of the translated or normalized name.
    3. Among those same lines, the one with the smallest RefinedSoundex
       distance to the normalized name.
    4. The line with the smallest RefinedSoundex distance to the translated
       name.

    An empty or missing *drug_name* inserts a placeholder record instead.

    Args:
        drug_name: The drug name as entered.

    Returns:
        A ``(drug_id, drug_name)`` pair, where ``drug_id`` is whatever
        ``insert_drug`` returns.  For an empty *drug_name* the second item
        is ``"default"``.
    """
    if not drug_name:
        drug_id = insert_drug("----------", "----------", "----------")
        return drug_id, "default"

    soundex = Soundex()
    metaphone = Metaphone()
    rs = RefinedSoundex()
    file_path = "utils//DRUGS_ALL_EDITTED.csv"
    file_path = pkg_resources.resource_filename(__name__, file_path)
    # The context manager closes the file once it has been read.
    with open(file_path, "r") as file:
        section = file.read()
    parts = re.split('[\n]', section)

    new_name = re.sub("چ", "غ", drug_name)
    new_name = re.sub("ﻏ", "غ", new_name)
    new_name = normalize_arabic(new_name)
    name_en = translate_drug_name(drug_name)

    equals = []          # (line, metaphone code) of lines that sound alike
    min_dist = 100       # best distance among the sound-alike lines
    min_index = -1
    min_dist_all = 100   # best distance among all lines
    min_index_all = -1

    for index, part in enumerate(parts):
        if distance_words(name_en, part) in (0, 1):
            print(" Matched To ->", part)
            drug_id = insert_drug(drug_name, normalize_arabic(drug_name), part)
            return drug_id, drug_name

        dist = rs.distance(name_en, part)
        if dist < min_dist_all:
            min_dist_all = dist
            min_index_all = index

        if soundex.sounds_like(new_name, part) or soundex.sounds_like(name_en, part):
            dist_new = rs.distance(new_name, part)
            if dist_new < min_dist:
                min_dist = dist_new
                min_index = index
            equals.append((part, metaphone.phonetics(part)))

    if min_index != -1:
        name_en_code = metaphone.phonetics(name_en)
        for candidate, code in equals:
            # Compare the stored code, not the whole (line, code) tuple,
            # with the code of the normalized name.
            if code == name_en_code or code == metaphone.phonetics(new_name):
                drug_id = insert_drug(drug_name, normalize_arabic(drug_name), candidate)
                return drug_id, drug_name

        drug_id = insert_drug(drug_name, normalize_arabic(drug_name), parts[min_index])
        return drug_id, drug_name

    # No sound-alike line: fall back to the closest line overall.  When no
    # line was closer than 100 the index is still -1, i.e. the last line.
    drug_id = insert_drug(drug_name, normalize_arabic(drug_name), parts[min_index_all])
    return drug_id, drug_name