def compare(self, string1, string2):
    '''Compare the soundex codes of two strings.

    Checks whether the two given strings sound phonetically alike by
    comparing their soundex codes.

    :param string1: First string for comparison
    :param string2: Second string for comparison
    :return: 0 if both strings are equal.
             1 if their soundex codes are identical, leading character
             included.
             2 if the codes match everywhere except the leading
             character (treated as strings that sound alike but come
             from different languages).
             -1 if the strings do not sound alike, if exactly one of
             them is empty, or if exactly one of them is English
             (an English string cannot be soundex-compared with a
             non-English one).
    '''
    # Quick check: equal strings need no phonetic analysis.
    if string1 == string2:
        return 0

    # An empty string has no first character to detect a language from
    # and cannot sound like a non-empty one.
    if not string1 or not string2:
        return -1

    string1_is_english = get_language(string1[0]) == 'en_US'
    string2_is_english = get_language(string2[0]) == 'en_US'
    if string1_is_english != string2_is_english:
        # Can't soundex-compare an English string with an Indic one.
        return -1

    soundex1 = self.soundex(string1)
    soundex2 = self.soundex(string2)

    # Everything after the leading character must agree for the
    # strings to be considered phonetically alike.
    if soundex1[1:] != soundex2[1:]:
        return -1

    # The leading character tells the two "sounds alike" cases apart.
    return 1 if soundex1[0] == soundex2[0] else 2
def compare(self, string1, string2):
    '''Compare the soundex codes of two strings.

    Checks whether the two given strings sound phonetically alike by
    comparing their soundex codes, ignoring the leading character of
    each code.

    :param string1: First string for comparison
    :param string2: Second string for comparison
    :return: 0 if both strings are equal.
             1 if the strings sound phonetically the same.
             2 if the strings do not sound phonetically the same
             (including the case where exactly one of them is empty).
             -1 if exactly one of the strings is English, because an
             English string cannot be soundex-compared with a
             non-English one.
    '''
    # Quick check: equal strings need no phonetic analysis.
    if string1 == string2:
        return 0

    # An empty string has no first character to detect a language from
    # and cannot sound like a non-empty one.
    if not string1 or not string2:
        return 2

    string1_is_english = get_language(string1[0]) == 'en_US'
    string2_is_english = get_language(string2[0]) == 'en_US'
    if string1_is_english != string2_is_english:
        # Can't soundex-compare an English string with an Indic one.
        return -1

    soundex1 = self.soundex(string1)
    soundex2 = self.soundex(string2)

    # Only the part after the leading character is compared.
    return 1 if soundex1[1:] == soundex2[1:] else 2
def soundex(self, name, length=8):
    '''Calculate the soundex code of the given string.

    This function calculates soundex for Indian language strings as
    well as English strings. This function is exposed as service
    method for JSONRPC in SILPA framework.

    :param name: String whose soundex value is to be calculated
    :param length: Length of the soundex code for non-English input:
                   shorter codes are right-padded with '0' and longer
                   ones are truncated. English codes are returned as
                   computed, neither padded nor truncated.
    :return: Soundex string of `name`; an empty string when `name` is
             empty.
    '''
    # Nothing to encode, and no first character to keep.
    if not name:
        return ''

    first_char = name[0]

    # Translate the remaining characters to soundex digits.
    digits = []
    for c in name[1:].lower():
        d = str(self.soundexCode(c))

        # Characters without a code (0) do not contribute.
        if d == '0':
            continue

        # Duplicate consecutive soundex digits are skipped.
        if not digits or d != digits[-1]:
            digits.append(d)

    # The code always starts with the original first character.
    sndx = [first_char] + digits

    if get_language(first_char) == 'en_US':
        # Don't pad (or truncate) English codes.
        return ''.join(sndx)

    # Right-pad with '0' up to `length`, then cut to `length`.
    sndx.extend(['0'] * max(0, length - len(sndx)))
    return ''.join(sndx[:length])
def soundexCode(self, char):
    '''Return the soundex code for given character

    :param char: Character whose soundex code is needed
    :return: Returns soundex code if character is found in charmap
             else returns 0
    '''
    lang = get_language(char)

    # English has its own code table; every other language shares one.
    table = "soundex_en" if lang == "en_US" else "soundex"

    try:
        return _soundex_map[table][charmap[lang].index(char)]
    except (LookupError, ValueError):
        # No soundex mapping for this character: the language has no
        # charmap entry (KeyError), the character is not in the charmap
        # (ValueError), or its position has no code (IndexError).
        # Only these lookup failures are swallowed so that unrelated
        # errors and interpreter-exit signals still propagate.
        return 0
def test_get_language():
    '''TEST: Get language

    Feeds one sample character at a time to get_language and checks
    the language code it reports; a character that maps to no
    language must yield None.
    '''
    expectations = (
        (u'ನ', 'kn_IN'),
        (u'അ', 'ml_IN'),
        (u'அ', 'ta_IN'),
        (u'అ', 'te_IN'),
        (u'અ', 'gu_IN'),
        (u'অ', 'bn_IN'),
        (u'ਅ', 'pa_IN'),
        (u'अ', 'hi_IN'),
        (u'ଅ', 'or_IN'),
        ('a', 'en_US'),
        ('eː', 'IPA'),
        ('ê', 'ISO15919'),
    )
    for sample, language in expectations:
        assert get_language(sample) == language

    # Punctuation belongs to no language.
    assert get_language('!') is None