def test_normalize(self):
        """Check the Thai text clean-up helpers against fixed input/output pairs.

        Covers ``normalize``, ``remove_dangling``, ``remove_dup_spaces``,
        ``remove_tonemark`` (and its deprecated alias ``delete_tone``) and
        ``remove_zw``.
        """

        def check(func, cases):
            # Run ``func`` on every input and compare with the expected text.
            for raw, expected in cases:
                self.assertEqual(func(raw), expected)

        # Smoke check: this input must produce some (non-None) result.
        self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))

        check(
            normalize,
            (
                # sara e + sara e -> sara ae
                ("เเปลก", "แปลก"),
                # consonant + nikhahit + sara aa -> consonant + sara am
                ("นํา", "นำ"),
                ("\u0e01\u0e4d\u0e32", "\u0e01\u0e33"),
                # consonant + tone mark + nikhahit + sara aa
                ("\u0e01\u0e48\u0e4d\u0e32", "\u0e01\u0e48\u0e33"),
                # reorder: consonant + following vowel + tone mark
                ("\u0e01\u0e30\u0e48", "\u0e01\u0e48\u0e30"),
                # reorder: consonant + nikhahit + tone mark + sara aa
                ("\u0e01\u0e4d\u0e48\u0e32", "\u0e01\u0e48\u0e33"),
                # reorder: consonant + following vowel + tone mark
                ("\u0e01\u0e32\u0e48", "\u0e01\u0e48\u0e32"),
                # repeated following vowels are collapsed
                ("กาา", "กา"),
                ("กา า  า  า", "กา"),
                ("กา าาะา", "กาะา"),
                # repeated identical tone marks are collapsed
                ("\u0e01\u0e48\u0e48", "\u0e01\u0e48"),
                # a run of different tone marks keeps only the last one
                ("\u0e01\u0e48\u0e49", "\u0e01\u0e49"),
                ("\u0e01\u0e48\u0e49\u0e48\u0e49", "\u0e01\u0e49"),
            ),
        )

        # Tone marks at the very beginning of the text are dropped.
        check(
            remove_dangling,
            (
                ("\u0e48\u0e01", "\u0e01"),
                ("\u0e48\u0e48\u0e01", "\u0e01"),
                ("\u0e48\u0e49\u0e01", "\u0e01"),
                ("\u0e48\u0e01\u0e48", "\u0e01\u0e48"),
            ),
        )

        # Duplicate spaces are squeezed and surrounding whitespace trimmed.
        check(
            remove_dup_spaces,
            (
                ("  ab  c d  ", "ab c d"),
                ("\nab  c   \n d \n", "ab c\nd"),
            ),
        )

        # Tone marks are stripped from inside words.
        check(
            remove_tonemark,
            (
                ("จิ้น", "จิน"),
                ("เก๋า", "เกา"),
            ),
        )

        # delete_tone must agree with remove_tonemark and warn when called.
        sample = "เจ๋งเป้ง"
        self.assertEqual(delete_tone(sample), remove_tonemark(sample))
        with self.assertWarns(DeprecationWarning):
            delete_tone("ค้าบ")

        # Zero-width characters are removed wherever they occur.
        check(
            remove_zw,
            (
                ("กา\u200b", "กา"),
                ("ก\u200cา", "กา"),
                ("\u200bกา", "กา"),
                ("กา\u200b\u200c\u200b", "กา"),
            ),
        )
Example #2
0
def lk82(text: str) -> str:
    """
    Convert Thai text into a phonetic code using the Thai soundex
    algorithm named **LK82** [#lk82]_.

    Tone marks, karan-marked characters and the signs matched by
    ``_RE_SIGN`` are stripped first. The leading character is then encoded,
    followed by the remaining characters; adjacent identical codes are
    merged and the result is padded with zeros and cut to five characters.

    :param str text: Thai word

    :return: LK82 soundex of the given Thai word, or an empty string when
             the input is empty, is not a string, or is empty after
             stripping
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex import lk82

        lk82("ลัก")
        # output: 'ร1000'

        lk82("รัก")
        # output: 'ร1000'

        lk82("รักษ์")
        # output: 'ร1000'

        lk82("บูรณการ")
        # output: 'บE419'

        lk82("ปัจจุบัน")
        # output: 'ป3E54'
    """
    if not text or not isinstance(text, str):
        return ""

    # Steps 4-5: strip tone marks, "karan" characters and Mai Tai Khu.
    cleaned = remove_tonemark(text)
    cleaned = _RE_KARANT.sub("", cleaned)
    cleaned = _RE_SIGN.sub("", cleaned)

    if not cleaned:
        return ""

    # Step 6: encode the first character.
    head = cleaned[0]
    if "ก" <= head <= "ฮ":
        codes = [head.translate(_TRANS1)]
        rest = cleaned[1:]
    else:
        # The text does not start with a consonant: the second character
        # (if any) is encoded first, followed by the leading character.
        codes = []
        if len(cleaned) > 1:
            codes.append(cleaned[1].translate(_TRANS1))
        codes.append(head.translate(_TRANS2))
        rest = cleaned[2:]

    # Character groups used while encoding the remainder.
    silent_separators = "\u0e30\u0e31\u0e34\u0e35"  # Sara A, Mai Han-Akat, Sara I, Sara II
    coded_separators = "\u0e32\u0e36\u0e37\u0e39\u0e45"  # Sara Aa, Sara Ue, Sara Uee, Sara Uu, Lakkhangyao
    u_vowels = "\u0e36\u0e37\u0e38\u0e39"  # Sara Ue, Sara Uee, Sara U, Sara Uu

    # Encode the rest of the text.
    last_vowel_pos = None  # position of the most recent separator (vowel)
    rest_len = len(rest)
    for pos, char in enumerate(rest):
        followed_by_u_vowel = (
            pos + 1 < rest_len and rest[pos + 1] in u_vowels
        )

        if char in silent_separators:
            # Step 7: acts only as a separator (empty code).
            last_vowel_pos = pos
            codes.append("")
        elif char in coded_separators:
            # Step 8: acts as a separator and contributes a code.
            last_vowel_pos = pos
            codes.append(char.translate(_TRANS2))
        elif char == "\u0e38":
            # Step 9: Sara U contributes no code right after To Tao / Tho Thong.
            last_vowel_pos = pos
            after_to_or_tho = pos > 0 and rest[pos - 1] in "ตธ"
            codes.append("" if after_to_or_tho else char.translate(_TRANS2))
        elif char in "\u0e2b\u0e2d":
            # Ho Hip / O Ang: encoded only when followed by one of u_vowels.
            if followed_by_u_vowel:
                codes.append(char.translate(_TRANS2))
        elif char in "\u0e22\u0e23\u0e24\u0e26\u0e27":
            # Yo Yak, Ro Rua, Ru, Lu, Wo Waen: encoded only directly after a
            # separator vowel or when followed by one of u_vowels.
            if last_vowel_pos == pos - 1 or followed_by_u_vowel:
                codes.append(char.translate(_TRANS2))
        else:
            # Step 12: every other character is encoded as-is.
            codes.append(char.translate(_TRANS2))

    # Step 13: drop every code equal to the one immediately before it.
    deduped = codes[:1] + [
        current
        for previous, current in zip(codes, codes[1:])
        if current != previous
    ]

    # Step 14: pad with zeros and keep at most five characters.
    padded = "".join(deduped) + "0000"
    return padded[:5]