def test_normalize(self):
    """Exercise the Thai text normalization helpers.

    Covers ``normalize``, ``remove_dangling``, ``remove_dup_spaces``,
    ``remove_tonemark`` (plus its deprecated alias ``delete_tone``) and
    ``remove_zw``. Each table below holds ``(input, expected)`` pairs that
    are checked in order; the first mismatch fails the test.
    """
    # Smoke check: mixed Thai/Latin input with stray thanthakhat marks
    # must still produce a result.
    self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))

    # NOTE(review): the whitespace inside the fixtures below is reproduced
    # exactly as it appears in the file — confirm that runs of spaces in
    # the original fixtures have not been collapsed.
    normalize_cases = (
        # sara e + sara e -> sara ae
        ("เเปลก", "แปลก"),
        # consonant + nikhahit + sara aa -> consonant + sara am
        ("นํา", "นำ"),
        ("\u0e01\u0e4d\u0e32", "\u0e01\u0e33"),
        # consonant + tone mark + nikhahit + sara aa
        ("\u0e01\u0e48\u0e4d\u0e32", "\u0e01\u0e48\u0e33"),
        # reorder: consonant + following vowel + tone mark
        ("\u0e01\u0e30\u0e48", "\u0e01\u0e48\u0e30"),
        # reorder: consonant + nikhahit + tone mark + sara aa
        ("\u0e01\u0e4d\u0e48\u0e32", "\u0e01\u0e48\u0e33"),
        # reorder: consonant + following vowel + tone mark
        ("\u0e01\u0e32\u0e48", "\u0e01\u0e48\u0e32"),
        # repeated following vowels are collapsed
        ("กาา", "กา"),
        ("กา า า า", "กา"),
        ("กา าาะา", "กาะา"),
        # repeated identical tone marks are collapsed
        ("\u0e01\u0e48\u0e48", "\u0e01\u0e48"),
        # repeated different tone marks: only the last one is kept
        ("\u0e01\u0e48\u0e49", "\u0e01\u0e49"),
        ("\u0e01\u0e48\u0e49\u0e48\u0e49", "\u0e01\u0e49"),
    )
    for raw, expected in normalize_cases:
        self.assertEqual(normalize(raw), expected)

    # Tone marks at the very beginning of the text have nothing to attach
    # to and are removed; marks after a consonant are kept.
    dangling_cases = (
        ("\u0e48\u0e01", "\u0e01"),
        ("\u0e48\u0e48\u0e01", "\u0e01"),
        ("\u0e48\u0e49\u0e01", "\u0e01"),
        ("\u0e48\u0e01\u0e48", "\u0e01\u0e48"),
    )
    for raw, expected in dangling_cases:
        self.assertEqual(remove_dangling(raw), expected)

    # Duplicate spaces / blank padding around lines.
    dup_space_cases = (
        (" ab c d ", "ab c d"),
        ("\nab c \n d \n", "ab c\nd"),
    )
    for raw, expected in dup_space_cases:
        self.assertEqual(remove_dup_spaces(raw), expected)

    # Tone mark removal.
    tonemark_cases = (
        ("จิ้น", "จิน"),
        ("เก๋า", "เกา"),
    )
    for raw, expected in tonemark_cases:
        self.assertEqual(remove_tonemark(raw), expected)

    # The deprecated alias must agree with remove_tonemark() and must
    # emit a DeprecationWarning when called.
    self.assertEqual(delete_tone("เจ๋งเป้ง"), remove_tonemark("เจ๋งเป้ง"))
    with self.assertWarns(DeprecationWarning):
        delete_tone("ค้าบ")

    # Zero-width characters (U+200B, U+200C) are stripped wherever they
    # occur: trailing, embedded, leading, or repeated.
    zero_width_cases = (
        ("กา\u200b", "กา"),
        ("ก\u200cา", "กา"),
        ("\u200bกา", "กา"),
        ("กา\u200b\u200c\u200b", "กา"),
    )
    for raw, expected in zero_width_cases:
        self.assertEqual(remove_zw(raw), expected)
def lk82(text: str) -> str:
    """
    Convert Thai text into a phonetic code using the Thai soundex
    algorithm named **LK82** [#lk82]_.

    :param str text: Thai word
    :return: LK82 soundex of the given Thai word; an empty string when
        the input is empty, is not a ``str``, or has nothing left after
        the clean-up steps
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex import lk82

        lk82("ลัก")
        # output: 'ร1000'

        lk82("รัก")
        # output: 'ร1000'

        lk82("รักษ์")
        # output: 'ร1000'

        lk82("บูรณการ")
        # output: 'บE419'

        lk82("ปัจจุบัน")
        # output: 'ป3E54'
    """
    if not text or not isinstance(text, str):
        return ""

    # Steps 4-5: strip tone marks, then "karan" (silenced) characters,
    # then Mai tai khu and similar signs.
    text = remove_tonemark(text)
    text = _RE_KARANT.sub("", text)
    text = _RE_SIGN.sub("", text)
    if not text:
        return ""

    # Step 6: encode the first character. When the text starts with a
    # consonant (ก..ฮ) that consonant leads the code; otherwise the second
    # character (if any) is emitted first, followed by the leading one.
    head = text[0]
    if "ก" <= head <= "ฮ":
        codes = [head.translate(_TRANS1)]
        rest = text[1:]
    else:
        codes = []
        if len(text) > 1:
            codes.append(text[1].translate(_TRANS1))
        codes.append(head.translate(_TRANS2))
        rest = text[2:]

    # Encode the remaining characters.
    last_sep = None  # index of the most recent separator (vowel)
    rest_len = len(rest)
    for pos, ch in enumerate(rest):
        # True when the next character is Sara Ue, Sara Uee, Sara U or
        # Sara Uu.
        followed_by_u_vowel = (
            pos + 1 < rest_len
            and rest[pos + 1] in "\u0e36\u0e37\u0e38\u0e39"
        )

        if ch in "\u0e30\u0e31\u0e34\u0e35":
            # Step 7: separator only / Sara A, Mai Han-Akat, Sara I,
            # Sara II. The empty entry breaks runs of equal codes.
            last_sep = pos
            codes.append("")
        elif ch in "\u0e32\u0e36\u0e37\u0e39\u0e45":
            # Step 8: separator that is also encoded / Sara Aa, Sara Ue,
            # Sara Uee, Sara Uu, Lakkhangyao.
            last_sep = pos
            codes.append(ch.translate(_TRANS2))
        elif ch == "\u0e38":
            # Step 9: Sara U acts as a bare separator right after ต or ธ,
            # and is encoded everywhere else.
            last_sep = pos
            follows_to_or_tho = pos != 0 and rest[pos - 1] in "ตธ"
            codes.append("" if follows_to_or_tho else ch.translate(_TRANS2))
        elif ch in "\u0e2b\u0e2d":
            # Ho Hip / O Ang: encoded only when followed by a U-type vowel,
            # dropped otherwise.
            if followed_by_u_vowel:
                codes.append(ch.translate(_TRANS2))
        elif ch in "\u0e22\u0e23\u0e24\u0e26\u0e27":
            # Yo Yak, Ro Rua, Ru, Lu, Wo Waen: encoded only directly after
            # a separator or directly before a U-type vowel.
            if last_sep == pos - 1 or followed_by_u_vowel:
                codes.append(ch.translate(_TRANS2))
        else:
            # Step 12: every other character is encoded as-is.
            codes.append(ch.translate(_TRANS2))

    # Step 13: drop entries equal to their immediate predecessor.
    # ``codes`` always has at least one entry after step 6.
    deduped = codes[:1] + [
        current
        for previous, current in zip(codes, codes[1:])
        if current != previous
    ]

    # Step 14: pad with zeros and truncate to exactly five characters.
    return ("".join(deduped) + "0000")[:5]