def standard_harakat(word):
    """
    Adjust the harakat of a vocalized word before output.

    Every ALEF, WAW or YEH that is not the first character, is not preceded
    by a short haraka and is not followed by one (or ends the word) gets its
    matching short haraka inserted before it: FATHA before ALEF, DAMMA
    before WAW, KASRA before YEH. Every other character, including
    ALEF_MAKSURA and the first character, is copied unchanged.

    @param word: given vocalized word.
    @type word: unicode.
    @return: vocalized word with adjusted harakat; an empty word is
        returned unchanged.
    @rtype: unicode.
    """
    # An empty word has no first character to copy; without this guard the
    # word[0] access below raises IndexError.
    if not word:
        return word
    # short haraka to write before each long-vowel letter
    vowel_haraka = {ALEF: FATHA, WAW: DAMMA, YEH: KASRA}
    len_word = len(word)
    # the first character is never examined, only copied
    pieces = [word[0]]
    for k in range(1, len_word):
        char = word[k]
        # only weak letters with no haraka on either side need a mark
        if (char in vowel_haraka
                and not araby.is_shortharaka(word[k - 1])
                and (k + 1 >= len_word
                     or not araby.is_shortharaka(word[k + 1]))):
            pieces.append(vowel_haraka[char])
        pieces.append(char)
    return u"".join(pieces)
def standard_harakat(word):
    """
    Adjust the harakat of a vocalized word before output.

    Every ALEF, WAW or YEH that is not the first character, is not preceded
    by a short haraka and is not followed by one (or ends the word) gets its
    matching short haraka inserted before it: FATHA before ALEF, DAMMA
    before WAW, KASRA before YEH. Every other character, including
    ALEF_MAKSURA and the first character, is copied unchanged.

    @param word: given vocalized word.
    @type word: unicode.
    @return: vocalized word with adjusted harakat; an empty word is
        returned unchanged.
    @rtype: unicode.
    """
    # An empty word has no first character to copy; without this guard the
    # word[0] access below raises IndexError.
    if not word:
        return word
    # short haraka to write before each long-vowel letter
    vowel_haraka = {ALEF: FATHA, WAW: DAMMA, YEH: KASRA}
    len_word = len(word)
    # the first character is never examined, only copied
    pieces = [word[0]]
    for k in range(1, len_word):
        char = word[k]
        # only weak letters with no haraka on either side need a mark
        if (char in vowel_haraka
                and not araby.is_shortharaka(word[k - 1])
                and (k + 1 >= len_word
                     or not araby.is_shortharaka(word[k + 1]))):
            pieces.append(vowel_haraka[char])
        pieces.append(char)
    return u"".join(pieces)
def uniformate_suffix(word):
    """
    Separate the letters of a vocalized affix from its harakat.

    SHADDA is first expanded to SUKUN + SHADDA. The word is then scanned and
    one mark is recorded for every letter kept:
      - FATHA followed by ALEF, when that ALEF is not the last character,
        is recorded as vconst.ALEF_HARAKA and the ALEF is dropped;
      - DAMMA + WAW (resp. KASRA + YEH), when the weak letter ends the word
        or is not followed by a short haraka, is recorded as
        vconst.WAW_HARAKA (resp. vconst.YEH_HARAKA) and the weak letter is
        dropped;
      - any other haraka is recorded as it is;
      - a letter carrying no haraka gets vconst.NOT_DEF_HARAKA.
    Short harakat that do not follow a letter are skipped.

    @param word: given vocalized affix.
    @type word: unicode.
    @return: (letters, harakat); (u"", u"") when both strings do not have
        the same length.
    @rtype: tuple of unicode.
    """
    word = word.replace(SHADDA, SUKUN + SHADDA)
    len_word = len(word)
    letters = []
    marks = []
    i = 0
    while i < len_word:
        if araby.is_shortharaka(word[i]):
            # haraka not attached to a letter: ignore it
            i += 1
            continue
        letters.append(word[i])
        has_next = i + 1 < len_word
        if has_next and araby.is_shortharaka(word[i + 1]):
            mark = word[i + 1]
            follower = word[i + 2] if i + 2 < len_word else None
            if mark == FATHA:
                # a final ALEF is kept as a letter, hence the i + 3 bound
                if follower == ALEF and i + 3 < len_word:
                    marks.append(vconst.ALEF_HARAKA)
                    i += 3
                else:
                    marks.append(FATHA)
                    i += 2
            elif mark == DAMMA and follower == WAW:
                if (i + 3 >= len_word
                        or not araby.is_shortharaka(word[i + 3])):
                    marks.append(vconst.WAW_HARAKA)
                    i += 3
                else:
                    marks.append(DAMMA)
                    i += 2
            elif mark == KASRA and follower == YEH:
                if (i + 3 >= len_word
                        or not araby.is_shortharaka(word[i + 3])):
                    marks.append(vconst.YEH_HARAKA)
                    i += 3
                else:
                    marks.append(KASRA)
                    i += 2
            else:
                marks.append(mark)
                i += 2
        elif has_next and araby.is_haraka(word[i + 1]):
            # haraka that is not a short one: record it and step over both
            # the letter and the mark. Without this advance the index never
            # moved and the loop did not terminate.
            marks.append(word[i + 1])
            i += 2
        else:
            marks.append(vconst.NOT_DEF_HARAKA)
            i += 1
    word_nm = u"".join(letters)
    shakl = u"".join(marks)
    if len(word_nm) == len(shakl):
        return (word_nm, shakl)
    return (u"", u"")
def uniformate_suffix(word):
    """
    Separate the letters of a vocalized affix from its harakat.

    SHADDA is first expanded to SUKUN + SHADDA. The word is then scanned and
    one mark is recorded for every letter kept:
      - FATHA followed by ALEF, when that ALEF is not the last character,
        is recorded as vconst.ALEF_HARAKA and the ALEF is dropped;
      - DAMMA + WAW (resp. KASRA + YEH), when the weak letter ends the word
        or is not followed by a short haraka, is recorded as
        vconst.WAW_HARAKA (resp. vconst.YEH_HARAKA) and the weak letter is
        dropped;
      - any other haraka is recorded as it is;
      - a letter carrying no haraka gets vconst.NOT_DEF_HARAKA.
    Short harakat that do not follow a letter are skipped.

    @param word: given vocalized affix.
    @type word: unicode.
    @return: (letters, harakat); (u"", u"") when both strings do not have
        the same length.
    @rtype: tuple of unicode.
    """
    word = word.replace(SHADDA, SUKUN + SHADDA)
    len_word = len(word)
    letters = []
    marks = []
    i = 0
    while i < len_word:
        if araby.is_shortharaka(word[i]):
            # haraka not attached to a letter: ignore it
            i += 1
            continue
        letters.append(word[i])
        has_next = i + 1 < len_word
        if has_next and araby.is_shortharaka(word[i + 1]):
            mark = word[i + 1]
            follower = word[i + 2] if i + 2 < len_word else None
            if mark == FATHA:
                # a final ALEF is kept as a letter, hence the i + 3 bound
                if follower == ALEF and i + 3 < len_word:
                    marks.append(vconst.ALEF_HARAKA)
                    i += 3
                else:
                    marks.append(FATHA)
                    i += 2
            elif mark == DAMMA and follower == WAW:
                if (i + 3 >= len_word
                        or not araby.is_shortharaka(word[i + 3])):
                    marks.append(vconst.WAW_HARAKA)
                    i += 3
                else:
                    marks.append(DAMMA)
                    i += 2
            elif mark == KASRA and follower == YEH:
                if (i + 3 >= len_word
                        or not araby.is_shortharaka(word[i + 3])):
                    marks.append(vconst.YEH_HARAKA)
                    i += 3
                else:
                    marks.append(KASRA)
                    i += 2
            else:
                marks.append(mark)
                i += 2
        elif has_next and araby.is_haraka(word[i + 1]):
            # haraka that is not a short one: record it and step over both
            # the letter and the mark. Without this advance the index never
            # moved and the loop did not terminate.
            marks.append(word[i + 1])
            i += 2
        else:
            marks.append(vconst.NOT_DEF_HARAKA)
            i += 1
    word_nm = u"".join(letters)
    shakl = u"".join(marks)
    if len(word_nm) == len(shakl):
        return (word_nm, shakl)
    return (u"", u"")
def normalize(word, wordtype="affix"):
    """
    Normalize the word by unifying hamzat, alef madda and lam-alef
    ligatures.

    The word is brought from its orthographic form (as spelling rules write
    it) to a theoretical form in which every hamza shape is a plain hamza,
    so that it can be conjugated independently of spelling rules, which are
    re-applied after conjugation. Input words are assumed to be fully
    vocalized.

    @param word: given word.
    @type word: unicode.
    @param wordtype: "affix" when the word is an affix; tatweel is stripped
        for any other value.
    @type wordtype: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # Tatweel is kept in affixes only: there it carries a haraka that is
    # written separately.
    if wordtype != "affix":
        word = araby.strip_tatweel(word)
    # a leading alef madda is rewritten by the dedicated helper
    if word.startswith(ALEF_MADDA):
        word = normalize_alef_madda(word)
    # skip the short harakat found at the beginning of the word
    start = 0
    total = len(word)
    while start < total and araby.is_shortharaka(word[start]):
        start += 1
    # bring every hamza shape to a single form
    word = araby.normalize_hamza(word[start:])
    # split each lam-alef ligature into its separate letters
    ligature_expansions = (
        (LAM_ALEF, SIMPLE_LAM_ALEF),
        (LAM_ALEF_HAMZA_ABOVE, SIMPLE_LAM_ALEF_HAMZA_ABOVE),
        (LAM_ALEF_MADDA_ABOVE, SIMPLE_LAM_ALEF_MADDA_ABOVE),
    )
    for ligature, letters in ligature_expansions:
        word = word.replace(ligature, letters)
    return word
def normalize(word, wordtype="affix"):
    """
    Normalize the word by unifying hamzat, alef madda and lam-alef
    ligatures.

    The word is brought from its orthographic form (as spelling rules write
    it) to a theoretical form in which every hamza shape is a plain hamza,
    so that it can be conjugated independently of spelling rules, which are
    re-applied after conjugation. Input words are assumed to be fully
    vocalized.

    @param word: given word.
    @type word: unicode.
    @param wordtype: "affix" when the word is an affix; tatweel is stripped
        for any other value.
    @type wordtype: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # Tatweel is kept in affixes only: there it carries a haraka that is
    # written separately.
    if wordtype != "affix":
        word = araby.strip_tatweel(word)
    # a leading alef madda is rewritten by the dedicated helper
    if word.startswith(ALEF_MADDA):
        word = normalize_alef_madda(word)
    # skip the short harakat found at the beginning of the word
    start = 0
    total = len(word)
    while start < total and araby.is_shortharaka(word[start]):
        start += 1
    # bring every hamza shape to a single form
    word = araby.normalize_hamza(word[start:])
    # split each lam-alef ligature into its separate letters
    ligature_expansions = (
        (LAM_ALEF, SIMPLE_LAM_ALEF),
        (LAM_ALEF_HAMZA_ABOVE, SIMPLE_LAM_ALEF_HAMZA_ABOVE),
        (LAM_ALEF_MADDA_ABOVE, SIMPLE_LAM_ALEF_MADDA_ABOVE),
    )
    for ligature, letters in ligature_expansions:
        word = word.replace(ligature, letters)
    return word
def test_is_letter(self):
    """Check that every Araby character-class predicate accepts the
    members of its own character set."""
    self.assertTrue(Araby.is_sukun(Araby.SUKUN))
    self.assertTrue(Araby.is_shadda(Araby.SHADDA))
    self.assertTrue(Araby.is_tatweel(Araby.TATWEEL))
    for archar in Araby.TANWIN:
        self.assertTrue(Araby.is_tanwin(archar))
    for archar in Araby.TASHKEEL:
        self.assertTrue(Araby.is_tashkeel(archar))
    for haraka in Araby.HARAKAT:
        self.assertTrue(Araby.is_haraka(haraka))
    for short_haraka in Araby.SHORTHARAKAT:
        self.assertTrue(Araby.is_shortharaka(short_haraka))
    for liguature in Araby.LIGUATURES:
        self.assertTrue(Araby.is_ligature(liguature))
    for hamza in Araby.HAMZAT:
        self.assertTrue(Araby.is_hamza(hamza))
    for alef in Araby.ALEFAT:
        self.assertTrue(Araby.is_alef(alef))
    for yeh in Araby.YEHLIKE:
        self.assertTrue(Araby.is_yehlike(yeh))
    for waw in Araby.WAWLIKE:
        self.assertTrue(Araby.is_wawlike(waw))
    for teh in Araby.TEHLIKE:
        # Call the predicate: asserting on the bare function object was
        # always true and verified nothing.
        # NOTE(review): assumes is_teh accepts every TEHLIKE member -
        # confirm against the araby module.
        self.assertTrue(Araby.is_teh(teh))
    for small in Araby.SMALL:
        self.assertTrue(Araby.is_small(small))
    for weak in Araby.WEAK:
        self.assertTrue(Araby.is_weak(weak))
    for archar in Araby.MOON:
        self.assertTrue(Araby.is_moon(archar))
    for archar in Araby.SUN:
        self.assertTrue(Araby.is_sun(archar))
def get_haraka_by_name(haraka_name):
    """
    Convert the Arabic name of a haraka into the haraka itself.

    A value that already is a short haraka is returned unchanged.
    Recognized names:
        - Fatha:(فتحة)
        - Damma:(ضمة)
        - Kasra:(كسرة)
        - Sukun:(سكون)
    @param haraka_name: the arabic name of haraka, or a short haraka.
    @type haraka_name: unicode
    @return: the matching haraka, or False when the name is not recognized.
    @rtype: unicode char
    """
    if araby.is_shortharaka(haraka_name):
        return haraka_name
    # (arabic name, haraka) pairs, tried in order
    named_marks = (
        (u"فتحة", FATHA),
        (u"ضمة", DAMMA),
        (u"كسرة", KASRA),
        (u"سكون", SUKUN),
    )
    for name, mark in named_marks:
        if haraka_name == name:
            return mark
    return False
def get_haraka_by_name(haraka_name):
    """
    Convert the Arabic name of a haraka into the haraka itself.

    A value that already is a short haraka is returned unchanged.
    Recognized names:
        - Fatha:(فتحة)
        - Damma:(ضمة)
        - Kasra:(كسرة)
        - Sukun:(سكون)
    @param haraka_name: the arabic name of haraka, or a short haraka.
    @type haraka_name: unicode
    @return: the matching haraka, or False when the name is not recognized.
    @rtype: unicode char
    """
    if araby.is_shortharaka(haraka_name):
        return haraka_name
    # (arabic name, haraka) pairs, tried in order
    named_marks = (
        (u"فتحة", FATHA),
        (u"ضمة", DAMMA),
        (u"كسرة", KASRA),
        (u"سكون", SUKUN),
    )
    for name, mark in named_marks:
        if haraka_name == name:
            return mark
    return False
import sys sys.path.append("../") #~ import pyarabic.harf as harf #from pyarabic import harf import pyarabic.araby as araby for c in araby.arabicrange(): print(c, '\t', araby.name(c), end=" ") print('\t', end=" ") if araby.is_sukun(c): print("sukun", end=" ") if araby.is_haraka(c): print("haraka", end=" ") if araby.is_shadda(c): print("shadda", end=" ") if araby.is_tatweel(c): print("tatweel", end=" ") if araby.is_tashkeel(c): print("tashkeel", end=" ") if araby.is_tanwin(c): print("tanwin", end=" ") if araby.is_shortharaka(c): print("short haraka", end=" ") if araby.is_ligature(c): print(" ligature", end=" ") if araby.is_ligature(c): print('ligature', end=" ") if araby.is_hamza(c): print('hamza', end=" ") if araby.is_alef(c): print('alef', end=" ") if araby.is_yehlike(c): print('yeh', end=" ") if araby.is_wawlike(c): print('waw', end=" ") if araby.is_teh(c): print('teh', end=" ") if araby.is_small(c): print('small', end=" ") if araby.is_weak(c): print('weak', end=" ") if araby.is_moon(c): print('moon', end=" ") if araby.is_sun(c): print('sun', end=" ") print(araby.order(c), end=" ") print() word = u"الْعَرَيِيّةُ" word_list = [
# -*- coding: utf-8 -*- import sys sys.path.append('../') from pyarabic import araby for c in araby.arabicrange(): print (c,'\t', araby.name(c)) print ('\t') if araby.is_sukun(c): print ("sukun") if araby.is_haraka(c): print ("haraka") if araby.is_shadda(c): print ("shadda") if araby.is_tatweel(c): print ("tatweel") if araby.is_tashkeel(c): print ("tashkeel") if araby.is_tanwin(c): print ("tanwin") if araby.is_shortharaka(c): print ("short haraka"), if araby.is_ligature(c):print (" ligature"), if araby.is_ligature(c):print ('ligature'), if araby.is_hamza(c): print ('hamza'), if araby.is_alef(c): print ('alef'), if araby.is_yehlike(c): print ('yeh'), if araby.is_wawlike(c): print ('waw'), if araby.is_teh(c): print ('teh'), if araby.is_small(c): print ('small'), if araby.is_weak(c): print ('weak'), if araby.is_moon(c): print ('moon'), if araby.is_sun(c):print ('sun'), print (araby.order(c)), print (); word=u"الْعَرَيِيّةُ" word_list=[
def uniformate_verb(word):
    """
    Separate the harakat and the letters of the given verb.

    Returns two strings: the letters and the marks. An ALEF found between
    the first and the last letter is removed from the letters and recorded
    as vconst.ALEF_HARAKA on the preceding letter. Marks of 4, 5 and
    6-letter verbs come from the vconst.UNIFORMATE_MARKS_* constants; for a
    3-letter verb they are derived from the word itself; any other length
    gets FATHA on every letter.
    @param word: given word.
    @type word: unicode.
    @return: (letters, harakat).
    @rtype: tuple of unicode.
    """
    if word == "":
        return ("", "")
    # normalize ALEF MADDA: two hamzas when it starts the word,
    # hamza + alef anywhere else
    if word.startswith(ALEF_MADDA):
        word = word.replace(ALEF_MADDA, HAMZA+HAMZA)
    else:
        word = word.replace(ALEF_MADDA, HAMZA+ALEF)

    word_nm = araby.strip_harakat(word)
    length = len(word_nm)
    if len(word_nm) != 3:
        # Hamza shapes are used to guess the harakat of the trilateral
        # verb, so they are unified right away only for other lengths.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)
    # For a 3-letter verb the hamza shapes still serve to detect the
    # vocalization below; they are unified afterwards.
    if length == 3:
        if word_nm[1]in (ALEF, ALEF_HAMZA_ABOVE) or \
           word_nm[2] in (ALEF_MAKSURA, ALEF_HAMZA_ABOVE, ALEF):
            marks = FATHA+FATHA+FATHA
        # NOTE(review): this test indexes the vocalized `word`, not
        # `word_nm` - confirm that this is intended.
        elif word[1] == YEH_HAMZA or word[2] in (YEH, YEH_HAMZA):
            marks = FATHA+KASRA+FATHA
        else:
            # read the second haraka from the verb itself
            i = 0
            # ignore harakat at the beginning of the word
            while araby.is_shortharaka(word[i]):
                i += 1
            # first letter
            if not araby.is_shortharaka(word[i]):
                i += 1
            # first haraka
            while araby.is_shortharaka(word[i]):
                i += 1
            # second letter
            if not araby.is_shortharaka(word[i]):
                i += 1
            # second haraka
            if not araby.is_shortharaka(word[i]):
                # Conjugating the doubled verb in the past tense caused
                # problems: the second haraka is set to FATHA for now.
                #ToDo: review this case
                secondharaka = FATHA
            else:
                secondharaka = word[i]
            marks = u''.join([FATHA, secondharaka, FATHA])
        # The hamza shapes were needed above to detect the harakat of the
        # trilateral verb; unify them now.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)
    elif length == 4:
        marks = vconst.UNIFORMATE_MARKS_4
    elif length == 5:
        if word_nm.startswith(TEH):
            marks = vconst.UNIFORMATE_MARKS_5TEH
        else :
            marks = vconst.UNIFORMATE_MARKS_5
    elif length == 6:
        marks = vconst.UNIFORMATE_MARKS_6
    else:
        marks = FATHA*len(word_nm)

    i = 1
    # the first letter and its mark are added automatically
    # NOTE(review): an empty word_nm (word made of harakat only) fails on
    # word_nm[0] - confirm callers never pass such input.
    new_word = word_nm[0]
    new_harakat = marks[0]
    # letters between the first and the last
    while i < length-1:
        if word_nm[i] == ALEF:
            # the ALEF is dropped; the previous mark becomes a long haraka
            new_harakat = new_harakat[:-1]+vconst.ALEF_HARAKA
        else:
            new_harakat += marks[i]
            new_word += word_nm[i]
        i += 1
    # the last letter
    # Verbs such as عيا، أعيا، عيّا: the final ALEF turns into YEH rather
    # than WAW.
    # NOTE(review): with a one-letter word_nm, i is 1 here and word_nm[i]
    # is out of range - confirm callers never pass such input.
    if word_nm[i] == ALEF:
        if len(word_nm) == 3 and word_nm[1] != YEH:
            new_word += vconst.ALEF_MAMDUDA
        else:
            new_word += YEH
    else:
        new_word += word_nm[i]
    new_harakat += marks[i]
    return (new_word, new_harakat)
import sys sys.path.append("../") #~ import pyarabic.harf as harf #from pyarabic import harf import pyarabic.araby as araby for c in araby.arabicrange(): print c.encode('utf8'),'\t', araby.name(c).encode('utf8'), print '\t', if araby.is_sukun(c): print "sukun", if araby.is_haraka(c): print "haraka", if araby.is_shadda(c): print "shadda", if araby.is_tatweel(c): print "tatweel", if araby.is_tashkeel(c): print "tashkeel", if araby.is_tanwin(c): print "tanwin", if araby.is_shortharaka(c): print "short haraka", if araby.is_ligature(c):print " ligature", if araby.is_ligature(c):print 'ligature', if araby.is_hamza(c): print 'hamza', if araby.is_alef(c): print 'alef', if araby.is_yehlike(c): print 'yeh', if araby.is_wawlike(c): print 'waw', if araby.is_teh(c): print 'teh', if araby.is_small(c): print 'small', if araby.is_weak(c): print 'weak', if araby.is_moon(c): print 'moon', if araby.is_sun(c):print 'sun', print araby.order(c), print; word=u"الْعَرَيِيّةُ" word_list=[
def uniformate_verb(word):
    """
    Separate the harakat and the letters of the given verb.

    Returns two strings: the letters and the marks. An ALEF found between
    the first and the last letter is removed from the letters and recorded
    as vconst.ALEF_HARAKA on the preceding letter. Marks of 4, 5 and
    6-letter verbs come from the vconst.UNIFORMATE_MARKS_* constants; for a
    3-letter verb they are derived from the word itself; any other length
    gets FATHA on every letter.
    @param word: given word.
    @type word: unicode.
    @return: (letters, harakat).
    @rtype: tuple of unicode.
    """
    if word == "":
        return ("", "")
    # normalize ALEF MADDA: delegated to normalize_alef_madda when it
    # starts the word, hamza + alef anywhere else
    if word.startswith(ALEF_MADDA):
        word = normalize_alef_madda(word)
    else:
        word = word.replace(ALEF_MADDA, HAMZA + ALEF)

    word_nm = araby.strip_harakat(word)
    length = len(word_nm)
    if len(word_nm) != 3:
        # Hamza shapes are used to guess the harakat of the trilateral
        # verb, so they are unified right away only for other lengths.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)
    # For a 3-letter verb the hamza shapes still serve to detect the
    # vocalization below; they are unified afterwards.
    if length == 3:
        if word_nm[1]in (ALEF, ALEF_HAMZA_ABOVE) or \
           word_nm[2] in (ALEF_MAKSURA, ALEF_HAMZA_ABOVE, ALEF):
            marks = FATHA + FATHA + FATHA
        # NOTE(review): this test indexes the vocalized `word`, not
        # `word_nm` - confirm that this is intended.
        elif word[1] == YEH_HAMZA or word[2] in (YEH, YEH_HAMZA):
            marks = FATHA + KASRA + FATHA
        else:
            # read the second haraka from the verb itself
            i = 0
            # ignore harakat at the beginning of the word
            while araby.is_shortharaka(word[i]):
                i += 1
            # first letter
            if not araby.is_shortharaka(word[i]):
                i += 1
            # first haraka
            while araby.is_shortharaka(word[i]):
                i += 1
            # second letter
            if not araby.is_shortharaka(word[i]):
                i += 1
            # second haraka
            if not araby.is_shortharaka(word[i]):
                # Conjugating the doubled verb in the past tense caused
                # problems: the second haraka is set to FATHA for now.
                #ToDo: review this case
                secondharaka = FATHA
            else:
                secondharaka = word[i]
            marks = u''.join([FATHA, secondharaka, FATHA])
        # The hamza shapes were needed above to detect the harakat of the
        # trilateral verb; unify them now.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)
    elif length == 4:
        marks = vconst.UNIFORMATE_MARKS_4
    elif length == 5:
        if word_nm.startswith(TEH):
            marks = vconst.UNIFORMATE_MARKS_5TEH
        else:
            marks = vconst.UNIFORMATE_MARKS_5
    elif length == 6:
        marks = vconst.UNIFORMATE_MARKS_6
    else:
        marks = FATHA * len(word_nm)

    i = 1
    # the first letter and its mark are added automatically
    # NOTE(review): an empty word_nm (word made of harakat only) fails on
    # word_nm[0] - confirm callers never pass such input.
    new_word = word_nm[0]
    new_harakat = marks[0]
    # letters between the first and the last
    while i < length - 1:
        if word_nm[i] == ALEF:
            # the ALEF is dropped; the previous mark becomes a long haraka
            new_harakat = new_harakat[:-1] + vconst.ALEF_HARAKA
        else:
            new_harakat += marks[i]
            new_word += word_nm[i]
        i += 1
    # the last letter
    # Verbs such as عيا، أعيا، عيّا: the final ALEF turns into YEH rather
    # than WAW.
    # NOTE(review): with a one-letter word_nm, i is 1 here and word_nm[i]
    # is out of range - confirm callers never pass such input.
    if word_nm[i] == ALEF:
        if len(word_nm) == 3 and word_nm[1] != YEH:
            new_word += vconst.ALEF_MAMDUDA
        else:
            new_word += YEH
    else:
        new_word += word_nm[i]
    new_harakat += marks[i]
    return (new_word, new_harakat)
import sys sys.path.append("../") #~ import pyarabic.harf as harf #from pyarabic import harf import pyarabic.araby as araby for c in araby.arabicrange(): print c.encode('utf8'), '\t', araby.name(c).encode('utf8'), print '\t', if araby.is_sukun(c): print "sukun", if araby.is_haraka(c): print "haraka", if araby.is_shadda(c): print "shadda", if araby.is_tatweel(c): print "tatweel", if araby.is_tashkeel(c): print "tashkeel", if araby.is_tanwin(c): print "tanwin", if araby.is_shortharaka(c): print "short haraka", if araby.is_ligature(c): print " ligature", if araby.is_ligature(c): print 'ligature', if araby.is_hamza(c): print 'hamza', if araby.is_alef(c): print 'alef', if araby.is_yehlike(c): print 'yeh', if araby.is_wawlike(c): print 'waw', if araby.is_teh(c): print 'teh', if araby.is_small(c): print 'small', if araby.is_weak(c): print 'weak', if araby.is_moon(c): print 'moon', if araby.is_sun(c): print 'sun', print araby.order(c), print word = u"الْعَرَيِيّةُ" word_list = [