def test_easy_first(self):
    '''
    Each listed word must expose exactly one stem candidate: the
    one-letter prefix shown, the remainder as stem, and an empty suffix.
    '''
    # (surface form, expected value of Word(surface).stems)
    expectations = (
        (u"لي", [(Prefix(u"ل"), u"ي", Suffix(u""))]),
        (u"لكم", [(Prefix(u"ل"), u"كم", Suffix(u""))]),
        (u"لكما", [(Prefix(u"ل"), u"كما", Suffix(u""))]),
    )
    for surface, expected_stems in expectations:
        self.assertEqual(Word(surface).stems, expected_stems)
def test_prefix_DT(self):
    '''
    prefix_DT_no_suffix must be truthy for the three empty-suffix
    segmentations below and falsy for the last one, which pairs a
    CC+DT prefix with a non-empty suffix.
    '''
    # Segmentations the rule is expected to accept (all have an empty suffix).
    accepted = (
        (Prefix(u'', classe=u'', desc=u''),
         u'',
         Suffix(u'', classe=u'', desc=u'')),
        (Prefix(u'وال', classe=u'pN2', desc=u'CC+DT'),
         u'',
         Suffix(u'', classe=u'', desc=u'')),
        (Prefix(u'ال', classe=u'pN1', desc=u'DT'),
         u'',
         Suffix(u'', classe=u'', desc=u'')),
    )
    for prefix, stem, suffix in accepted:
        self.assertTrue(prefix_DT_no_suffix(prefix, stem, suffix))

    # Same CC+DT prefix, but now followed by a non-empty suffix.
    rejected = prefix_DT_no_suffix(
        Prefix(u'وال', classe=u'pN2', desc=u'CC+DT'),
        u'',
        Suffix("ي", desc="PRP|OBJP", classe="sC2"),
    )
    self.assertFalse(rejected)
def test_valid_len(self):
    '''
    has_valid_len must be falsy for an empty stem and for a one-letter
    stem, and truthy for the two longer stems checked below.
    '''
    # Stems expected to be rejected.
    too_short = (
        (Prefix(u'', classe=u'', desc=u''),
         u'',
         Suffix(u'', classe=u'', desc=u'')),
        (Prefix(u'', classe=u'', desc=u''),
         u'ا',
         Suffix(u'', classe=u'', desc=u'')),
    )
    for prefix, stem, suffix in too_short:
        self.assertFalse(has_valid_len(prefix, stem, suffix))

    # Stems expected to be accepted, with and without a prefix.
    long_enough = (
        (Prefix(u'', classe=u'', desc=u''),
         u'بحباني',
         Suffix(u'', classe=u'', desc=u'')),
        (Prefix(u'و', classe=u'pC2', desc=u'CC'),
         u'يعرب',
         Suffix(u'', classe=u'', desc=u'')),
    )
    for prefix, stem, suffix in long_enough:
        self.assertTrue(has_valid_len(prefix, stem, suffix))
def test_fixer(self):
    '''
    fix_stem must return the expected (prefix, stem, suffix) triple for
    each input triple below: some are returned unchanged, some get the
    stem's final letter rewritten, some get the prefix rewritten, and
    one gets both.
    '''
    # (triple passed to fix_stem, triple it must return)
    cases = (
        # Empty segmentation is returned as-is.
        ((Prefix(u''), u'', Suffix(u'')),
         (Prefix(u''), u'', Suffix(u''))),
        # Nothing to rewrite here.
        ((Prefix(u'ال', classe=u'pN1'), u'تلفزيون', Suffix(u'', classe=u'pC1')),
         (Prefix(u'ال', classe=u'pN1'), u'تلفزيون', Suffix(u'', classe=u'pC1'))),
        # Only the stem's last letter changes.
        ((Prefix(u'', classe=u'pC1'), u'لاجئ', Suffix(u'نا', classe=u'sC3')),
         (Prefix(u'', classe=u'pC1'), u'لاجء', Suffix(u'نا', classe=u'sC3'))),
        # Both the prefix and the stem's last letter change.
        ((Prefix(u'لل', classe=u'pC1'), u'لاجئ', Suffix(u'نا', classe=u'Sc3')),
         (Prefix(u'ل ال', classe=u'pC1'), u'لاجء', Suffix(u'نا', classe=u'Sc3'))),
        # Only the prefix changes.
        ((Prefix(u'لل', classe=u'pN13'), u'تلفزيون', Suffix(u'', classe=u'pC1')),
         (Prefix(u'ل ال', classe=u'pN13'), u'تلفزيون', Suffix(u'', classe=u'pC1'))),
    )
    for segmentation, expected in cases:
        self.assertEqual(fix_stem(segmentation), expected)
def test_segment_words(self):
    '''
    An empty Word keeps its empty string and has no stems; iterating a
    non-empty Word yields its (prefix, stem, suffix) candidates in the
    order asserted below.
    '''
    empty_word = Word(u"")
    self.assertEqual(empty_word.string, u"")
    self.assertEqual(empty_word.stems, [])

    expected_cases = [
        (Prefix(u'', classe=u'pC1'), u'الله', Suffix(u'', classe=u'sC1')),
        (Prefix(u'', classe=u'pC1'), u'الل', Suffix(u'ه', classe=u'sC10')),
        (Prefix(u'ال', classe=u'pN1'), u'له', Suffix(u'', classe=u'sC1')),
    ]
    # Iterate the Word object itself rather than reading .stems.
    self.assertEqual(list(Word(u"الله")), expected_cases)
def test_tag(self):
    '''
    tag() must return one entry per token: [token, label] pairs for the
    Latin word, the number, the punctuation marks and the emoticon, and
    a list of (prefix, stem, suffix) candidates for each Arabic word.
    '''
    # A single Latin-script word comes back as one [token, label] pair.
    latin_only = tag(u'install')
    self.assertEqual([[u'install', u'NN']], latin_only)

    # Mixed input: number, dash, two Arabic words, Arabic comma, emoticon.
    tokens = tag(u'1- بسمك اللهم، :)')

    first_word_candidates = [
        (Prefix(u'', classe=u'pC1'),
         u'بسمك',
         Suffix(u'', classe=u'sC1')),
        (Prefix(u'', classe=u'pC1'),
         u'بسم',
         Suffix(u'ك', classe=u'sC4', desc="PRP|OBJP")),
        (Prefix(u'ب', classe=u'pN25', desc="IN"),
         u'سمك',
         Suffix(u'', classe=u'sC1')),
        (Prefix(u'ب', classe=u'pN25', desc="IN"),
         u'سم',
         Suffix(u'ك', classe=u'sC4', desc="PRP|OBJP")),
    ]
    second_word_candidates = [
        (Prefix(u'', classe=u'pC1', desc=u''),
         u'اللهم',
         Suffix(u'', classe=u'sC1', desc=u'')),
        (Prefix(u'', classe=u'pC1', desc=u''),
         u'الل',
         Suffix(u'هم', classe=u'sC13', desc=u"PRP|OBJP")),
        (Prefix(u'ال', classe=u'pN1', desc=u"DT"),
         u'لهم',
         Suffix(u'', classe=u'sC1', desc=u'')),
    ]

    self.assertEqual(tokens[0], [u'1', u'CD'])
    self.assertEqual(tokens[1], [u'-', u'PUNC'])
    self.assertEqual(tokens[2], first_word_candidates)
    self.assertEqual(tokens[3], second_word_candidates)
    self.assertEqual(tokens[4], [u'،', u'PUNC'])
    self.assertEqual(tokens[5], [u':)', u'EMO'])
def test_hamza_fix(self):
    '''
    hamza_fix must return the expected triple for each input below: the
    stem ending in ئ comes back ending in ء, the other two inputs come
    back unchanged.
    '''
    # (triple passed to hamza_fix, triple it must return)
    cases = (
        ((Prefix(u''), u'', Suffix(u'')),
         (Prefix(u''), u'', Suffix(u''))),
        ((Prefix(u'', classe=u'pC1'), u'لاجئ', Suffix(u'نا', classe=u'sC3')),
         (Prefix(u'', classe=u'pC1'), u'لاجء', Suffix(u'نا', classe=u'sC3'))),
        ((Prefix(u'ال', classe=u'pN1'), u'تلفزيون', Suffix(u'', classe=u'pC1')),
         (Prefix(u'ال', classe=u'pN1'), u'تلفزيون', Suffix(u'', classe=u'pC1'))),
    )
    for segmentation, expected in cases:
        self.assertEqual(hamza_fix(segmentation), expected)
def test_lam_lam_fix(self):
    '''
    lam_lam_fix must return the expected triple for each input below:
    the لل prefix comes back as 'ل ال', the other two inputs come back
    unchanged.
    '''
    # (triple passed to lam_lam_fix, triple it must return)
    cases = (
        ((Prefix(u''), u'', Suffix(u'')),
         (Prefix(u''), u'', Suffix(u''))),
        ((Prefix(u'لل', classe=u'pN13'), u'تلفزيون', Suffix(u'', classe=u'pC1')),
         (Prefix(u'ل ال', classe=u'pN13'), u'تلفزيون', Suffix(u'', classe=u'pC1'))),
        ((Prefix(u'ال', classe=u'pN1'), u'تلفزيون', Suffix(u'', classe=u'pC1')),
         (Prefix(u'ال', classe=u'pN1'), u'تلفزيون', Suffix(u'', classe=u'pC1'))),
    )
    for segmentation, expected in cases:
        self.assertEqual(lam_lam_fix(segmentation), expected)
def test_variations(self):
    '''
    Word.stems for u'مدرستها' must list six candidates in this order:
    for each of the two prefixes (empty, then م) the whole remainder,
    the remainder split before the ها suffix, and that same split with
    the stem's final ت replaced by ة.
    '''
    word = Word(u'مدرستها')
    expected_stems = [
        # No prefix.
        (Prefix(u'', classe=u'', desc=u''),
         u'مدرستها',
         Suffix(u'', classe=u'', desc=u'')),
        (Prefix(u'', classe=u'', desc=u''),
         u'مدرست',
         Suffix(u'ها', classe=u'sC11', desc=u'PRP|OBJP')),
        # The desc here used to read u'PRP|O,BJP' (stray comma); every other
        # ها/sC11 suffix in this list uses u'PRP|OBJP'.
        (Prefix(u'', classe=u'', desc=u''),
         u'مدرسة',
         Suffix(u'ها', classe=u'sC11', desc=u'PRP|OBJP')),
        # Prefix م.
        (Prefix(u'م', classe=u'pV1', desc=u'NG'),
         u'درستها',
         Suffix(u'', classe=u'', desc=u'')),
        (Prefix(u'م', classe=u'pV1', desc=u'NG'),
         u'درست',
         Suffix(u'ها', classe=u'sC11', desc=u'PRP|OBJP')),
        (Prefix(u'م', classe=u'pV1', desc=u'NG'),
         u'درسة',
         Suffix(u'ها', classe=u'sC11', desc=u'PRP|OBJP')),
    ]
    self.assertEqual(word.stems, expected_stems)
":-?": "teasing/playful", ":-b": "teasing/playful", ":b": "teasing/playful", ";)": "wink", u"º)": "wink", ";-)": "wink", ";]": "wink", u"^Ü^": "happy", } special_tokens = EMOTICONS from DAPOS.data.variation import Prefix, Suffix EASY_WORDS = { u"ليا": [(Prefix(u"ل"), u"يا", Suffix(u""))], u"لي": [(Prefix(u"ل"), u"ي", Suffix(u""))], u"لكم": [(Prefix(u"ل"), u"كم", Suffix(u""))], u"لكما": [(Prefix(u"ل"), u"كما", Suffix(u""))], u"له": [(Prefix(u"ل"), u"ه", Suffix(u""))], u"لها": [(Prefix(u"ل"), u"ها", Suffix(u""))], u"لهم": [(Prefix(u"ل"), u"هم", Suffix(u""))], u"لهما": [(Prefix(u"ل"), u"هما", Suffix(u""))], u"لهن": [(Prefix(u"ل"), u"هم", Suffix(u""))], u"بيا": [(Prefix(u"ب"), u"يا", Suffix(u""))], u"بي": [(Prefix(u"ب"), u"ي", Suffix(u""))], u"بك": [(Prefix(u"ب"), u"ك", Suffix(u""))], u"بكم": [(Prefix(u"ب"), u"كم", Suffix(u""))], u"بكما": [(Prefix(u"ب"), u"كما", Suffix(u""))], u"به": [(Prefix(u"ب"), u"ه", Suffix(u""))], u"بها": [(Prefix(u"ب"), u"ها", Suffix(u""))],