def test_normalize(self):
    """Check Thai text normalization and its companion clean-up helpers.

    Exercised in order: ``normalize``, ``remove_dangling``,
    ``remove_dup_spaces``, ``remove_tonemark`` (together with the
    deprecated ``delete_tone`` spelling) and ``remove_zw``.
    """
    # Smoke check: input mixing Thai and Latin letters must give a result.
    self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))

    normalize_cases = (
        # sara e + sara e
        ("เเปลก", "แปลก"),
        # consonant + nikhahit + sara aa
        ("นํา", "นำ"),
        ("\u0e01\u0e4d\u0e32", "\u0e01\u0e33"),
        # consonant + tone mark + nikhahit + sara aa
        ("\u0e01\u0e48\u0e4d\u0e32", "\u0e01\u0e48\u0e33"),
        # reorder: consonant + following vowel + tone mark
        ("\u0e01\u0e30\u0e48", "\u0e01\u0e48\u0e30"),
        # reorder: consonant + nikhahit + tone mark + sara aa
        ("\u0e01\u0e4d\u0e48\u0e32", "\u0e01\u0e48\u0e33"),
        # reorder: consonant + following vowel + tone mark
        ("\u0e01\u0e32\u0e48", "\u0e01\u0e48\u0e32"),
        # repeated following vowels are removed
        ("กาา", "กา"),
        ("กา า า า", "กา"),
        ("กา าาะา", "กาะา"),
        # repeated identical tone marks are removed
        ("\u0e01\u0e48\u0e48", "\u0e01\u0e48"),
        # repeated different tone marks are removed
        ("\u0e01\u0e48\u0e49", "\u0e01\u0e49"),
        ("\u0e01\u0e48\u0e49\u0e48\u0e49", "\u0e01\u0e49"),
    )
    for raw, expected in normalize_cases:
        self.assertEqual(normalize(raw), expected)

    # Tone marks at the very beginning of the text are removed.
    dangling_cases = (
        ("\u0e48\u0e01", "\u0e01"),
        ("\u0e48\u0e48\u0e01", "\u0e01"),
        ("\u0e48\u0e49\u0e01", "\u0e01"),
        ("\u0e48\u0e01\u0e48", "\u0e01\u0e48"),
    )
    for raw, expected in dangling_cases:
        self.assertEqual(remove_dangling(raw), expected)

    # Whitespace clean-up through remove_dup_spaces.
    spacing_cases = (
        (" ab c d ", "ab c d"),
        ("\nab c \n d \n", "ab c\nd"),
    )
    for raw, expected in spacing_cases:
        self.assertEqual(remove_dup_spaces(raw), expected)

    # Tone marks are removed.
    tonemark_cases = (
        ("จิ้น", "จิน"),
        ("เก๋า", "เกา"),
    )
    for raw, expected in tonemark_cases:
        self.assertEqual(remove_tonemark(raw), expected)

    # delete_tone must agree with remove_tonemark and raise a
    # DeprecationWarning when called.
    legacy_output = delete_tone("เจ๋งเป้ง")
    current_output = remove_tonemark("เจ๋งเป้ง")
    self.assertEqual(legacy_output, current_output)
    with self.assertWarns(DeprecationWarning):
        delete_tone("ค้าบ")

    # Zero-width characters are removed wherever they occur.
    zero_width_cases = (
        ("กา\u200b", "กา"),
        ("ก\u200cา", "กา"),
        ("\u200bกา", "กา"),
        ("กา\u200b\u200c\u200b", "กา"),
    )
    for raw, expected in zero_width_cases:
        self.assertEqual(remove_zw(raw), expected)
def test_delete_tone(self):
    """Check ``delete_tone`` output and that ``deletetone`` matches it."""
    for word, without_tone in (("จิ้น", "จิน"), ("เก๋า", "เกา")):
        self.assertEqual(delete_tone(word), without_tone)

    # NOTE: an assertWarns(DeprecationWarning) check around
    # deletetone("จิ้น") is left out until this unittest bug is fixed:
    # https://bugs.python.org/issue29620
    legacy_output = deletetone("จิ้น")
    current_output = delete_tone("จิ้น")
    self.assertEqual(legacy_output, current_output)