def check_shadda(word_vocalised, resulted_data, fully_vocalized_input=False):
    """
    Filter dictionary results by comparing their vocalized form to the input.

    Each item of C{resulted_data} is compared through the C{'vocalized'}
    entry of its C{__dict__} (an empty string when the entry is missing).

    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of objects.
    @param fully_vocalized_input: when True, both words are compared after
        C{araby.strip_harakat}; otherwise C{araby.shaddalike} decides.
    @type fully_vocalized_input: Boolean, default is False.
    @return: the items of C{resulted_data} that pass the comparison,
        in their original order.
    @rtype: list.
    """
    if fully_vocalized_input:
        def is_kept(item):
            return (araby.strip_harakat(word_vocalised)
                    == araby.strip_harakat(item.__dict__.get('vocalized', '')))
    else:
        def is_kept(item):
            return araby.shaddalike(word_vocalised,
                                    item.__dict__.get('vocalized', ''))

    kept = []
    for item in resulted_data:
        if is_kept(item):
            kept.append(item)
    return kept
def normalizeText(self, text):
    """
    Run C{text} through the araby normalization helpers, in this order:
    strip_tatweel, strip_tashkeel, strip_harakat, normalize_hamza.

    @param text: the text to normalize.
    @return: the result of the last helper.
    """
    pipeline = (
        araby.strip_tatweel,
        araby.strip_tashkeel,
        araby.strip_harakat,
        araby.normalize_hamza,
    )
    for step in pipeline:
        text = step(text)
    return text
def normalize_alef_madda(word):
    """
    Convert Alef madda into two letters.

    The word is first passed through C{araby.strip_harakat}; the returned
    value is always that stripped form (or a word built from it).

    For a word starting with ALEF_MADDA:
      - if the stripped word has 3 letters and is a key of
        C{vconst.ALEF_MADDA_VERB_TABLE}, the first entry of the table
        is returned;
      - otherwise every ALEF_MADDA is replaced by HAMZA + ALEF.

    A word that does not start with ALEF_MADDA is returned stripped but
    otherwise unchanged.  (The previous code returned an unbound local
    variable in that case and raised UnboundLocalError.)

    @param word: given word.
    @type word: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # Computed up front so that every branch, including the
    # "no alef madda" one, has a value to return.
    word_nm = araby.strip_harakat(word)
    if not word.startswith(ALEF_MADDA):
        return word_nm
    if len(word_nm) == 3 and word_nm in vconst.ALEF_MADDA_VERB_TABLE:
        # return the first one only
        return vconst.ALEF_MADDA_VERB_TABLE[word_nm][0]
    return word_nm.replace(ALEF_MADDA, HAMZA + ALEF)
def find_triliteral_verb(db_base_path, triliteralverb, givenharaka):
    """
    Find the triliteral verb in the sqlite dictionary and return the list
    of possible verb forms.

    The database file is C{data/verbdict.db} under C{db_base_path}.
    (Previously C{db_path} was never assigned, so every call raised
    NameError.)

    Each result is a dict with the keys C{"verb"}, C{"haraka"} and
    C{"transitive"}.  The C{transitive} column is converted to a boolean:
    True when it equals C{araby.KAF} or C{araby.MEEM}, False otherwise.
    A row that matches both the given vocalized verb and the given haraka
    is placed at the head of the list so that it is treated first.

    @param db_base_path: the database base path.
    @type db_base_path: path string.
    @param triliteralverb: given verb.
    @type triliteralverb: unicode.
    @param givenharaka: given haraka of the future type of the verb.
    @type givenharaka: unicode.
    @return: list of triliteral verbs, or None when the database cannot
        be read (IOError or any sqlite3 error).
    @rtype: list of dict, or None.
    """
    import os
    import sqlite3 as sqlite
    from contextlib import closing

    db_path = os.path.join(db_base_path, "data/verbdict.db")
    verb_nm = araby.strip_harakat(triliteralverb)
    liste = []
    try:
        # closing() guarantees the cursor and the connection are released
        # even when the query fails.
        with closing(sqlite.connect(db_path)) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute("""select verb_vocalised, haraka, transitive from verbdict where verb_unvocalised = ?""", (verb_nm, ))
                for verb_vocalised, haraka, transitive in cursor:
                    # MEEM is transitive, KAF is common (transitive and
                    # intransitive), LAM is intransitive
                    is_transitive = transitive in (araby.KAF, araby.MEEM)
                    entry = {"verb": verb_vocalised,
                             "haraka": haraka,
                             "transitive": is_transitive}
                    if (triliteralverb == verb_vocalised
                            and givenharaka == haraka):
                        # exact match: treated in priority
                        liste.insert(0, entry)
                    else:
                        liste.append(entry)
    except (IOError, sqlite.Error):
        return None
    return liste
def is_triliteral_verb(verb):
    """
    Tell whether the verb counts three letters once harakat are stripped
    and every ALEF_MADDA is expanded to HAMZA + ALEF.

    Used when selecting verbs from the triliteral verb dictionary.

    @param verb: given verb.
    @type verb: unicode.
    @return: True if the verb is triliteral.
    @rtype: Boolean.
    """
    letters = araby.strip_harakat(verb).replace(ALEF_MADDA, HAMZA+ALEF)
    return len(letters) == 3
def create_index_triverbtable():
    """
    Fill TRIVERBTABLE_INDEX from the verb dictionary, to accelerate the
    search for verbs.

    Every entry of C{triverbtable.TriVerbTable} is indexed under its verb
    after C{araby.strip_harakat} and C{araby.normalize_hamza}; the index
    maps that normalized form to the list of table keys sharing it.

    Uses C{dict.setdefault} instead of C{dict.has_key}, which no longer
    exists in Python 3.

    @return: None; TRIVERBTABLE_INDEX is updated in place.
    @rtype: None
    """
    for key in triverbtable.TriVerbTable:
        vocverb = triverbtable.TriVerbTable[key]['verb']
        normverb = araby.normalize_hamza(araby.strip_harakat(vocverb))
        TRIVERBTABLE_INDEX.setdefault(normverb, []).append(key)
def test_strip(self):
    """Check each Araby strip_* helper against one known input/output pair."""
    vocalized = u"الْعَرَبِيّةُ"
    cases = [
        # (helper, input, expected output)
        (Araby.strip_harakat, vocalized, u'العربيّة'),
        (Araby.strip_lastharaka, vocalized, u'الْعَرَبِيّة'),
        (Araby.strip_tashkeel, vocalized, u'العربية'),
        (Araby.strip_tatweel, u"العـــــربية", u'العربية'),
        (Araby.strip_shadda, u"الشّمسيّة", u'الشمسية'),
    ]
    for strip, given, expected in cases:
        assert strip(given) == expected
def find_alltriverb(triverb, givenharaka=araby.FATHA, vocalised_entree=False):
    """
    Find the triliteral verb in the dictionary (TriVerbTable) and return
    the list of possible verb forms.

    The lookup key is the verb (harakat stripped first when
    C{vocalised_entree} is True) passed through C{araby.normalize_hamza}.
    Entries whose C{'verb'} and C{'haraka'} both equal the given values
    are placed at the head of the list.  When the key is not indexed,
    the message "triverb has no verb" is printed and an empty list is
    returned.

    Membership is tested with C{in}; C{dict.has_key} does not exist in
    Python 3.

    @param triverb: given verb.
    @type triverb: unicode.
    @param givenharaka: given haraka of the future type of the verb,
        default(FATHA).
    @type givenharaka: unicode.
    @param vocalised_entree: True if the given verb is vocalized,
        default False.
    @type vocalised_entree: Boolean.
    @return: list of triliteral verbs (TriVerbTable entries).
    @rtype: list of dicts.
    """
    liste = []
    verb_nm = araby.strip_harakat(triverb) if vocalised_entree else triverb
    normalized = araby.normalize_hamza(verb_nm)
    if normalized not in TRIVERBTABLE_INDEX:
        print("triverb has no verb")
        return liste
    for verb_voc_id in TRIVERBTABLE_INDEX[normalized]:
        entry = triverbtable.TriVerbTable[verb_voc_id]
        if triverb == entry['verb'] and givenharaka == entry['haraka']:
            # exact match goes first
            liste.insert(0, entry)
        else:
            liste.append(entry)
    return liste
def find_alltriverb(triverb, givenharaka = araby.FATHA, vocalised_entree = False):
    """
    Find the triliteral verb in the dictionary (TriVerbTable) and return
    the list of possible verb forms.

    The lookup key is the verb (harakat stripped first when
    C{vocalised_entree} is True) passed through C{araby.normalize_hamza}.
    Entries whose C{'verb'} and C{'haraka'} both equal the given values
    are placed at the head of the list.  When the key is not indexed,
    the message "triverb has no verb" is printed and an empty list is
    returned.

    Written to run on both Python 2 and Python 3: membership is tested
    with C{in} instead of C{dict.has_key}, and the message is printed
    with the parenthesized single-argument form of print.

    @param triverb: given verb.
    @type triverb: unicode.
    @param givenharaka: given haraka of the future type of the verb,
        default(FATHA).
    @type givenharaka: unicode.
    @param vocalised_entree: True if the given verb is vocalized,
        default False.
    @type vocalised_entree: Boolean.
    @return: list of triliteral verbs (TriVerbTable entries).
    @rtype: list of dicts.
    """
    liste = []
    if vocalised_entree:
        verb_nm = araby.strip_harakat(triverb)
    else:
        verb_nm = triverb
    normalized = araby.normalize_hamza(verb_nm)
    if normalized in TRIVERBTABLE_INDEX:
        for verb_voc_id in TRIVERBTABLE_INDEX[normalized]:
            candidate = triverbtable.TriVerbTable[verb_voc_id]
            if triverb == candidate['verb'] and \
              givenharaka == candidate['haraka']:
                # exact match goes first
                liste.insert(0, candidate)
            else:
                liste.append(candidate)
    else:
        print("triverb has no verb")
    return liste
def preprocess(sentences, stopwords, isStopword = False):
    """
    Normalize and tokenize a sequence of complete Arabic sentences.

    Each sentence goes through, in this order:
      1. araby.strip_harakat
      2. araby.strip_tashkeel
      3. araby.strip_lastharaka
      4. araby.strip_tatweel
      5. araby.strip_shadda
      6. araby.normalize_ligature
      7. araby.normalize_hamza
      8. clean_str
    then the leading run matched by the pattern below is tokenized with
    araby.tokenize.  Unless ``isStopword`` is true, the tokens are
    filtered through ``remove_stopwords(stopwords, tokens)``.  Tokens
    equal to a newline are dropped.

    A sentence whose cleaned text has no match at its start is skipped.
    This used to be hidden behind a bare ``except: pass`` which also
    swallowed every other error (including KeyboardInterrupt); errors
    raised by the tokenizer or the stop-word filter now propagate.

    Returns a list of token lists, one per sentence that was kept.
    """
    # NOTE(review): in this raw string ``\\n``, ``\\s`` and ``\\p`` are an
    # escaped backslash followed by a plain letter, so the class excludes
    # the backslash and the literal characters n, s, p, {, L, a, t, i, } --
    # not newlines, whitespace or Latin script (``re`` has no ``\p{..}``
    # anyway).  Presumably unintended; kept byte-for-byte because fixing
    # it would change which part of each sentence is tokenized -- TODO
    # confirm the intent.
    leading_run = re.compile(r'[^\\n\\s\\p{Latin}]+')

    output = []
    for sentence in sentences:
        text = araby.strip_harakat(sentence)
        text = araby.strip_tashkeel(text)
        text = araby.strip_lastharaka(text)
        text = araby.strip_tatweel(text)
        text = araby.strip_shadda(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        text = clean_str(text)

        found = leading_run.match(text)
        if found is None:
            # nothing usable at the start of this sentence: skip it
            continue
        tokens = araby.tokenize(found.group())
        if not isStopword:
            tokens = remove_stopwords(stopwords, tokens)
        output.append([t for t in tokens if t != '\n'])
    return output
def display_rows(self, listtense):
    """
    Render the conjugation result for a list of tenses as text rows.

    One row is produced per (pronoun, tense) pair whose conjugation is
    not empty; pronouns follow C{vconst.PronounsTable} and tenses follow
    C{listtense}.  The tab-separated fields of a row are:
        - unvocalized conjugation,
        - vocalized conjugation,
        - pronoun (display form),
        - tense (display form),
        - transitive flag ('1' or "0"),
        - original verb,
        - tasrif bab.

    @param listtense: the given tenses list to display result
    @type listtense: list of unicode
    @return: the result as text in rows, each terminated by a newline.
    @rtype: unicode.
    """
    transitive = '1' if self.transitive else "0"
    rows = []
    for pronoun in vconst.PronounsTable:
        for tense in listtense:
            conjugated = self.tab_conjug[tense][pronoun]
            if conjugated == "":
                continue
            fields = [
                araby.strip_harakat(conjugated),
                conjugated,
                TAB_DISPLAY[pronoun],
                TAB_DISPLAY[tense],
                transitive,
                self.verb,
                self.bab,
            ]
            rows.append("\t".join(fields) + u"\n")
    return u"".join(rows)
def display_rows(self, listtense ):
    """
    Render the conjugation result for a list of tenses as text rows.

    One row is produced per (pronoun, tense) pair whose conjugation is
    not empty; pronouns follow C{vconst.PronounsTable} and tenses follow
    C{listtense}.  The tab-separated fields of a row are:
        - unvocalized conjugation,
        - vocalized conjugation,
        - pronoun (display form),
        - tense (display form),
        - transitive flag ('1' or "0"),
        - original verb,
        - tasrif bab.

    @param listtense: the given tenses list to display result
    @type listtense: list of unicode
    @return: the result as text in rows, each terminated by a newline.
    @rtype: unicode.
    """
    if self.transitive:
        transitive = '1'
    else:
        transitive = "0"

    lines = []
    for pronoun in vconst.PronounsTable:
        for tense in listtense:
            vocalized_form = self.tab_conjug[tense][pronoun]
            if vocalized_form != "":
                lines.append("\t".join([
                    araby.strip_harakat(vocalized_form),
                    vocalized_form,
                    TAB_DISPLAY[pronoun],
                    TAB_DISPLAY[tense],
                    transitive,
                    self.verb,
                    self.bab,
                ]) + u"\n")
    return u"".join(lines)
def normalize_alef_madda(word):
    """
    Convert Alef madda into two letters.

    The word is first passed through C{araby.strip_harakat}; the returned
    value is always that stripped form (or a word built from it).

    For a word starting with ALEF_MADDA:
      - if the stripped word has 3 letters and is a key of
        C{vconst.ALEF_MADDA_VERB_TABLE}, the first entry of the table
        is returned;
      - otherwise every ALEF_MADDA is replaced by HAMZA+ALEF.

    A word that does not start with ALEF_MADDA is returned stripped but
    otherwise unchanged.  (The previous code returned an unbound local
    variable in that case and raised UnboundLocalError; it also relied
    on C{dict.has_key}, which does not exist in Python 3.)

    @param word: given word.
    @type word: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # Computed before branching so the "no alef madda" path can return it.
    word_nm = araby.strip_harakat(word)
    if word.startswith(ALEF_MADDA):
        if len(word_nm) == 3 and word_nm in vconst.ALEF_MADDA_VERB_TABLE:
            # return the first one only
            return vconst.ALEF_MADDA_VERB_TABLE[word_nm][0]
        return word_nm.replace(ALEF_MADDA, HAMZA+ALEF)
    return word_nm
def Strip_tashkeel(self):
    """
    Return C{self.text} passed through C{strip_harakat}.

    NOTE(review): despite the method name, the helper called here is
    C{strip_harakat}, not C{strip_tashkeel} -- confirm this is intended.
    """
    stripped = strip_harakat(self.text)
    return stripped
def suggest_verb(verb):
    """
    Generate a list of valid infinitive verbs for an invalid infinitive form.

    The verb is first stripped of harakat and of the letters that cannot
    appear in an infinitive (TEH_MARBUTA and the three tanwin marks).
    Exactly one correction strategy is then tried, chosen by the first
    matching condition: the cleaned verb itself, a leading
    ALEF_HAMZA_BELOW, a leading ALEF, then the length of the verb
    (2, >= 6, 5, 4).  Every candidate is checked with
    C{is_valid_infinitive_verb} before being added.  When no condition
    matches, a fixed verb is suggested instead.

    @param verb: given verb, of invalid infinitive form.
    @type verb: unicode.
    @return: a list of suggested infinitive verb forms (possibly empty).
    @rtype: list of unicode.
    """
    # the verb is invalid
    list_suggest = []
    # first strip harakat; the shadda is not stripped
    verb = araby.strip_harakat(verb)
    # second, strip all the letters that are unacceptable in an
    # infinitive form
    verb = re.sub(u"[%s%s%s%s]"%( TEH_MARBUTA, DAMMATAN, KASRATAN, FATHATAN), \
        '', verb)
    # test whether the resulting verb is valid; if so,
    # add it to the suggestion list.
    if is_valid_infinitive_verb(verb):
        list_suggest.append(verb)
        return list_suggest
    # if the verb starts with ALEF_HAMZA_BELOW,
    # replace it by an ALEF, because it's a common error.
    # if the result is valid add it to the suggestions list
    elif verb.startswith(ALEF_HAMZA_BELOW):
        # note: every ALEF_HAMZA_BELOW in the verb is replaced (no count)
        verb = re.sub(ALEF_HAMZA_BELOW, ALEF, verb)
        if is_valid_infinitive_verb(verb):
            list_suggest.append(verb)
            return list_suggest
    # if the verb starts with ALEF,
    # replace it by ALEF_HAMZA_ABOVE + FATHA, because it's a common error.
    # if the result is valid add it to the suggestions list
    elif verb.startswith(ALEF):
        # only the first ALEF is replaced (count = 1)
        verb_one = re.sub(ALEF, ALEF_HAMZA_ABOVE+FATHA, verb, 1)
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
            return list_suggest
    # if the verb is 2 letters long,
    # suggest adding a third letter:
    # Shadda, Alef Maksura or Alef at the end, or Alef in the middle.
    # each valid result is added to the suggestions list
    elif len(verb) == 2:
        verb = re.sub(ALEF, ALEF_HAMZA_ABOVE, verb, 1)
        # suggest adding the third letter as: Shadda at the end
        verb_one = verb+SHADDA
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
        # suggest adding the third letter as: Alef Maksura
        verb_one = verb+ALEF_MAKSURA
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
        # suggest adding the third letter as: Alef at the end
        verb_one = verb+ALEF
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
        # suggest adding the third letter as: Alef in the middle
        verb_one = verb[0]+ALEF+verb[1]
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
        return list_suggest
    elif len(verb) >= 6:
        # if the verb is 6 letters long or more,
        # suggest ALEF followed by a 5-letter window of the verb.
        # if the result is valid add it to the suggestions list
        # NOTE(review): range(len(verb)-6) is empty for a 6-letter verb
        # and never reaches the last windows -- confirm the bound.
        for i in range(len(verb)-6):
            verb_one = ALEF+verb[i:i+5]
            if is_valid_infinitive_verb(verb_one):
                list_suggest.append(verb_one)
    elif len(verb) == 5:
        # if the verb is 5 letters long, suggest ALEF + a 4-letter window.
        # ToDo: review this part
        # NOTE(review): len(verb)-5 is 0 here, so this loop body never runs.
        for i in range(len(verb)-5):
            verb_one = ALEF+verb[i:i+4]
            if is_valid_infinitive_verb(verb_one):
                list_suggest.append(verb_one)
    elif len(verb) == 4:
        # if the verb is 4 letters long,
        # suggest swapping the second and third characters when the
        # third is an ALEF or the second is a SHADDA
        # (i.e. move the alef / shadda one position).
        # if the result is valid add it to the suggestions list
        if verb[2] == ALEF or verb[1] == SHADDA:
            verb_one = verb[0]+verb[2]+verb[1]+verb[3]
            if is_valid_infinitive_verb(verb_one):
                list_suggest.append(verb_one)
        if verb.endswith(SHADDA):
            # the verb ends with a shadda:
            # suggest swapping the last two characters so that the
            # shadda sits on the second letter.
            # if the result is valid add it to the suggestions list
            verb_one = verb[0]+verb[1]+verb[3]+verb[2]
            if is_valid_infinitive_verb(verb_one):
                list_suggest.append(verb_one)
        return list_suggest
    else:
        # otherwise suggest conjugating another (fixed) verb
        list_suggest.append(u"كتب")
        return list_suggest
    return list_suggest
def removeHarakat(self, t):
    """
    Return C{t} passed through C{strip_harakat}.

    @param t: the text to strip.
    @return: whatever C{strip_harakat} returns for C{t}.
    """
    without_harakat = strip_harakat(t)
    return without_harakat
def uniformate_verb(word):
    """
    Separate the harakat and the letters of the given word; it returns
    two strings (the word without harakat, and the harakat).
    An ALEF in the middle of the word is represented as a long haraka
    (C{vconst.ALEF_HARAKA}) and removed from the letters.

    The harakat are not read from the word, except for the second haraka
    of a 3-letter verb: they come from fixed templates selected by the
    number of letters.

    @param word: given word.
    @type word: unicode.
    @return: (letters, harakat).
    @rtype: tuple of unicode.
    """
    if word == "":
        return ("", "")
    # normalize ALEF MADDA: a leading one becomes HAMZA+HAMZA,
    # any other one becomes HAMZA+ALEF
    if word.startswith(ALEF_MADDA):
        word = word.replace(ALEF_MADDA, HAMZA+HAMZA)
    else:
        word = word.replace(ALEF_MADDA, HAMZA+ALEF)

    word_nm = araby.strip_harakat(word)
    length = len(word_nm)
    if len(word_nm) != 3:
        # The hamza forms are used to guess the harakat of a triliteral
        # verb, so they are normalized here only for the other lengths;
        # the triliteral case normalizes them after the marks are chosen.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)
    # `length` is the number of letters after stripping the harakat.
    # The hamza forms are used to detect the vocalization of the verb;
    # they are unified later.
    if length == 3:
        if word_nm[1]in (ALEF, ALEF_HAMZA_ABOVE) or \
          word_nm[2] in (ALEF_MAKSURA, ALEF_HAMZA_ABOVE, ALEF):
            marks = FATHA+FATHA+FATHA
        # NOTE(review): this test indexes `word` (still vocalized), not
        # `word_nm` -- confirm that is intended.
        elif word[1] == YEH_HAMZA or word[2] in (YEH, YEH_HAMZA):
            marks = FATHA+KASRA+FATHA
        else:
            # keep the haraka carried by the verb: walk the vocalized
            # word up to the position of the second haraka
            i = 0
            ## ignore harakat at the beginning of the word
            while araby.is_shortharaka(word[i]):# in HARAKAT:
                i += 1
            # the first letter
            if not araby.is_shortharaka(word[i]):#not in HARAKAT:
                i += 1
            # the first haraka
            while araby.is_shortharaka(word[i]):#word[i] in HARAKAT:
                i += 1
            # the second letter
            if not araby.is_shortharaka(word[i]):#word[i] not in HARAKAT:
                i += 1
            # the second haraka
            if not araby.is_shortharaka(word[i]):#word[i] not in HARAKAT:
                # We found problems conjugating the doubled verb in the
                # past tense; the second haraka is temporarily set to
                # FATHA.
                #ToDo: review this case
                secondharaka = FATHA
            else:
                secondharaka = word[i]
            marks = u''.join([FATHA, secondharaka, FATHA])
        # The hamza forms were needed above to guess the harakat of the
        # triliteral verb; normalize them now.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)
    elif length == 4:
        marks = vconst.UNIFORMATE_MARKS_4
    elif length == 5:
        if word_nm.startswith(TEH):
            marks = vconst.UNIFORMATE_MARKS_5TEH
        else :
            marks = vconst.UNIFORMATE_MARKS_5
    elif length == 6:
        marks = vconst.UNIFORMATE_MARKS_6
    else:
        marks = FATHA*len(word_nm)

    i = 1
    # the first letter and its mark are added automatically
    new_word = word_nm[0]
    new_harakat = marks[0]
    # letters between the first and the last
    while i < length-1:
        if word_nm[i] == ALEF:
            # a middle ALEF is dropped from the letters and the mark of
            # the previous letter is replaced by the long haraka
            new_harakat = new_harakat[:-1]+vconst.ALEF_HARAKA
        else:
            new_harakat += marks[i]
            new_word += word_nm[i]
        i += 1
    # the last letter
    ## Case of verbs whose second letter is YEH and which end with ALEF:
    ## the final ALEF becomes YEH instead of the value used otherwise.
    if word_nm[i] == ALEF:
        if len(word_nm) == 3 and word_nm[1] != YEH:
            new_word += vconst.ALEF_MAMDUDA
        else:
            new_word += YEH
    else:
        new_word += word_nm[i]
    new_harakat += marks[i]
    ## new_word += word_nm[i]
    return (new_word, new_harakat)
u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1=u"" for word in word_list: print word.encode('utf8'),'\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print; word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like", word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ]
def is_valid_infinitive_verb(word, vocalized = True):
    """
    Determine if the given word is a valid infinitive form of an arabic verb.

    The word is rejected by the first failing check of an ordered chain:
        - it is not an arabic word (C{araby.is_arabicword}),
        - its length (harakat stripped, ALEF_MADDA counted as two
          letters, shadda counted as a letter) is < 3 or >= 7,
        - it has a forbidden start for its length,
        - it contains ALEF_HAMZA_BELOW, TEH_MARBUTA or a tanwin,
        - it contains a forbidden shadda / alef sequence,
        - it contains a letter sequence considered an invalid root,
        - it does not fit one of the accepted patterns for its length.

    @param word: given word.
    @type word: unicode.
    @param vocalized: if the given word is vocalized; when True the
        harakat are stripped before the checks.
    @type vocalized: Boolean, default(True).
    @return: True if the word is a valid infinitive form of verb.
    @rtype: Boolean.
    """
    # test if the word is a valid arabic word,
    if not araby.is_arabicword(word):
        return False
    if vocalized :
        word_nm = araby.strip_harakat(word)
    else:
        word_nm = word
    # the alef_madda is considered as 2 letters
    word_nm = word_nm.replace(ALEF_MADDA, HAMZA+ALEF)
    length = len(word_nm)

    # length with shadda must be between 3 and 6
    if length < 3 or length >= 7:
        return False
    # a 3 length verb can't start by Alef or Shadda,
    # and the second letter can't be shadda
    elif length == 3 and (word_nm[0] == ALEF or word_nm[0] == SHADDA \
    or word_nm[1] == SHADDA):
        return False
    # a 5 length verb must start by ALEF or TEH
    elif length == 5 and word_nm[0] not in (TEH, ALEF):
        return False
    # a 6 length verb must start by ALEF
    elif length == 6 and word_nm[0] != ALEF:
        return False
    # contains some invalid letters in verb
    # (checked on the original word, not on word_nm)
    elif re.search(u"[%s%s%s%s%s]"%(ALEF_HAMZA_BELOW, TEH_MARBUTA,
     DAMMATAN, KASRATAN, FATHATAN), word):
        return False
    # contains some forbidden SHADDA / ALEF sequences:
    # SHADDA after ALEF, ALEF_MAKSURA or SHADDA,
    # start by SHADDA,
    # initial ALEF, two letters, then SHADDA,
    # SHADDA in second position,
    # ALEF followed by a letter and ALEF,
    # ALEF followed by ALEF,
    # end with ALEF followed by (ALEF_MAKSURA, YEH)
    elif re.search(u"([%s%s%s]%s|^%s|^%s..%s|^.%s|%s.%s|%s%s|%s[%s%s]$)"%(
    ALEF, ALEF_MAKSURA, SHADDA, SHADDA, SHADDA, ALEF, SHADDA, SHADDA, ALEF, ALEF, ALEF,
    ALEF, ALEF, ALEF_MAKSURA, YEH), word_nm):
        return False

    # Invalid root form, some letters:
    #~ # initial YEH followed by
    #~ ((THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    #~ TAH, ZAH, GHAIN, KAF, HEH, YEH))
    elif re.search(u"^%s[%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s]"%(
    YEH, THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    TAH, ZAH, GHAIN, KAF, HEH, YEH), word_nm):
        return False

    # TEH after (DAL, THAL, DAD, TAH, ZAH)
    elif re.search(u"[%s%s%s%s%s]%s"%(DAL, THAL, DAD, TAH, ZAH, TEH), word_nm):
        return False
    # Contains an invalid root sequence in arabic, letters near in
    # phonetic: LAM/REH, FEH/BEH, NOON then LAM, HEH/HAH
    elif re.search(u"%s%s|%s%s|%s%s|%s%s|%s%s|%s%s|%s%s"%(
    LAM, REH, REH, LAM, FEH, BEH, BEH, FEH, NOON, LAM, HEH, HAH, HAH, HEH), word_nm):
        return False

    # in non 5 letters verbs: initial TEH followed by
    # (TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH)
    elif length != 5 and word_nm.startswith(TEH) and word_nm[1] in (
    TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH):
        return False
    # if the word starts by the same letter doubled
    # (the TEH exemption reads the first character of the original word)
    elif word_nm[0] == word_nm[1] and word[0] != TEH:
        return False

    # verify the wazn (pattern) of the verb
    elif length == 3:
        if re.match("^[^%s][^%s].$"%(ALEF, SHADDA), word_nm):
            return True
        # accepted: first character is not ALEF and second is not SHADDA;
        # rejected: ALEF in first position, SHADDA in second position
        else:
            return False
    elif length == 4:
        # four alternatives are accepted:
        # 1- hamza (above or bare) followed by three characters, the
        #    next two not SHADDA,
        # 2- ALEF in second position,
        # 3- SHADDA in third position,
        # 4- four characters with neither ALEF nor SHADDA
        if re.match(\
        "^([%s%s][^%s]{2}.|[^%s%s]%s[^%s%s].|[^%s%s]{2}%s[^%s]|[^%s%s]{4})$"\
        %(ALEF_HAMZA_ABOVE, HAMZA, SHADDA, ALEF, SHADDA, ALEF, ALEF, SHADDA,
        ALEF, SHADDA, SHADDA, SHADDA, ALEF, SHADDA), word_nm):
            return True
        # rejected:
        # initial bare ALEF: the hamza must be written,
        # SHADDA in another position: the shadda has a specific position,
        # ALEF in another position: the alef has a specific position
        else:
            return False
    elif length == 5:
        if word_nm.startswith(ALEF):
            # ALEF, three characters, then a final SHADDA
            if re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, then TEH / THAL / TAH carrying a SHADDA, then two
            # characters
            if re.match(u"^%s[%s%s%s]%s..$"%(ALEF, TEH, THAL, TAH, SHADDA), \
            word_nm):
                return True
            # ALEF + NOON + three characters
            elif re.match(u"^ان...$", word_nm):
                return True
            # ALEF + one of three fixed two-letter sequences + two
            # characters
            elif re.match(u"^(ازد|اصط|اضط)..$", word_nm):
                return True
            # ALEF + a letter outside the listed set + TEH + two characters
            elif re.match(u"^ا[^صضطظد]ت..$", word_nm):
                return True
            # same pattern as the first test of this branch, so this
            # arm cannot be reached with a match
            elif re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF + any character carrying a SHADDA + two characters
            elif re.match(u"^ا.ّ..$", word_nm):
                return True
            # ALEF + three characters + final ALEF_MAKSURA
            elif re.match(u"^ا...ى$", word_nm):
                return True
            else:
                return False
        elif word_nm.startswith(TEH):
            return True
        else:
            return False
    # the shadda and the alef have specific positions in these patterns
    elif length == 6:
        # a 6-letter word not starting by ALEF was already rejected
        # above, so only the ALEF case can reach this point
        if not (word_nm.startswith(ALEF) or word_nm.startswith(TEH)):
            return False
        if VALID_INFINITIVE_VERB6_PATTERN.match(word_nm):
            return True
        # anything not matching the 6-letter pattern is rejected
        else:
            return False
    return True
if araby.is_weak(c): print ('weak'), if araby.is_moon(c): print ('moon'), if araby.is_sun(c):print ('sun'), print (araby.order(c)), print (); word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", u"سئل لأنه يؤم الإمام" ] word1=u"" for word in word_list: print (word) if araby.is_vocalized(word): print (' is vocalized') if araby.is_vocalizedtext(word): print (' is vocalized text') if araby.is_arabicword(word): print (' is valid word') else: print ("invalid arabic word") print (' strip harakat', araby.strip_harakat(word)) print (' strip tashkeel', araby.strip_tashkeel(word)) print (' strip tatweel',araby.strip_tatweel(word)) print (' normalize ligature ', araby.normalize_ligature(word)) print (' normalize hamza', araby.normalize_hamza(word)) if araby.vocalizedlike(word, word1): print ("vocalized_like") word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print ("vocalized_like")
def is_valid_infinitive_verb(word, vocalized=True):
    """
    Determine if the given word is a valid infinitive form of an arabic verb.

    The word is rejected by the first failing check of an ordered chain:
        - it is not an arabic word (C{araby.is_arabicword}),
        - its length (harakat stripped, ALEF_MADDA counted as two
          letters, shadda counted as a letter) is < 3 or >= 7,
        - it has a forbidden start for its length,
        - it contains ALEF_HAMZA_BELOW, TEH_MARBUTA or a tanwin,
        - it contains a forbidden shadda / alef sequence,
        - it contains a letter sequence considered an invalid root,
        - it does not fit one of the accepted patterns for its length.

    @param word: given word.
    @type word: unicode.
    @param vocalized: if the given word is vocalized; when True the
        harakat are stripped before the checks.
    @type vocalized: Boolean, default(True).
    @return: True if the word is a valid infinitive form of verb.
    @rtype: Boolean.
    """
    # test if the word is a valid arabic word,
    if not araby.is_arabicword(word):
        return False
    if vocalized:
        word_nm = araby.strip_harakat(word)
    else:
        word_nm = word
    # the alef_madda is considered as 2 letters
    word_nm = word_nm.replace(ALEF_MADDA, HAMZA + ALEF)
    length = len(word_nm)

    # length with shadda must be between 3 and 6
    if length < 3 or length >= 7:
        return False
    # a 3 length verb can't start by Alef or Shadda,
    # and the second letter can't be shadda
    elif length == 3 and (word_nm[0] == ALEF or word_nm[0] == SHADDA \
    or word_nm[1] == SHADDA):
        return False
    # a 5 length verb must start by ALEF or TEH
    elif length == 5 and word_nm[0] not in (TEH, ALEF):
        return False
    # a 6 length verb must start by ALEF
    elif length == 6 and word_nm[0] != ALEF:
        return False
    # contains some invalid letters in verb
    # (checked on the original word, not on word_nm)
    elif re.search(
            u"[%s%s%s%s%s]" % (ALEF_HAMZA_BELOW, TEH_MARBUTA, DAMMATAN,
                               KASRATAN, FATHATAN), word):
        return False
    # contains some forbidden SHADDA / ALEF sequences:
    # SHADDA after ALEF, ALEF_MAKSURA or SHADDA,
    # start by SHADDA,
    # initial ALEF, two letters, then SHADDA,
    # SHADDA in second position,
    # ALEF followed by a letter and ALEF,
    # ALEF followed by ALEF,
    # end with ALEF followed by (ALEF_MAKSURA, YEH)
    elif re.search(
            u"([%s%s%s]%s|^%s|^%s..%s|^.%s|%s.%s|%s%s|%s[%s%s]$)" %
        (ALEF, ALEF_MAKSURA, SHADDA, SHADDA, SHADDA, ALEF, SHADDA, SHADDA,
         ALEF, ALEF,
         ALEF, ALEF, ALEF, ALEF_MAKSURA, YEH), word_nm):
        return False

    # Invalid root form, some letters:
    #~ # initial YEH followed by
    #~ ((THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    #~ TAH, ZAH, GHAIN, KAF, HEH, YEH))
    elif re.search(
            u"^%s[%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s]" %
        (YEH, THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH,
         GHAIN, KAF, HEH, YEH), word_nm):
        return False

    # TEH after (DAL, THAL, DAD, TAH, ZAH)
    elif re.search(u"[%s%s%s%s%s]%s" % (DAL, THAL, DAD, TAH, ZAH, TEH),
                   word_nm):
        return False
    # Contains an invalid root sequence in arabic, letters near in
    # phonetic: LAM/REH, FEH/BEH, NOON then LAM, HEH/HAH
    elif re.search(
            u"%s%s|%s%s|%s%s|%s%s|%s%s|%s%s|%s%s" %
        (LAM, REH, REH, LAM, FEH, BEH, BEH, FEH, NOON, LAM, HEH, HAH, HAH,
         HEH), word_nm):
        return False

    # in non 5 letters verbs: initial TEH followed by
    # (TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH)
    elif length != 5 and word_nm.startswith(TEH) and word_nm[1] in (
            TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH):
        return False
    # if the word starts by the same letter doubled
    # (the TEH exemption reads the first character of the original word)
    elif word_nm[0] == word_nm[1] and word[0] != TEH:
        return False

    # verify the wazn (pattern) of the verb
    elif length == 3:
        if re.match("^[^%s][^%s].$" % (ALEF, SHADDA), word_nm):
            return True
        # accepted: first character is not ALEF and second is not SHADDA;
        # rejected: ALEF in first position, SHADDA in second position
        else:
            return False
    elif length == 4:
        # four alternatives are accepted:
        # 1- hamza (above or bare) followed by three characters, the
        #    next two not SHADDA,
        # 2- ALEF in second position,
        # 3- SHADDA in third position,
        # 4- four characters with neither ALEF nor SHADDA
        if re.match(\
        "^([%s%s][^%s]{2}.|[^%s%s]%s[^%s%s].|[^%s%s]{2}%s[^%s]|[^%s%s]{4})$"\
        %(ALEF_HAMZA_ABOVE, HAMZA, SHADDA, ALEF, SHADDA, ALEF, ALEF, SHADDA,
        ALEF, SHADDA, SHADDA, SHADDA, ALEF, SHADDA), word_nm):
            return True
        # rejected:
        # initial bare ALEF: the hamza must be written,
        # SHADDA in another position: the shadda has a specific position,
        # ALEF in another position: the alef has a specific position
        else:
            return False
    elif length == 5:
        if word_nm.startswith(ALEF):
            # ALEF, three characters, then a final SHADDA
            if re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, then TEH / THAL / TAH carrying a SHADDA, then two
            # characters
            if re.match(u"^%s[%s%s%s]%s..$"%(ALEF, TEH, THAL, TAH, SHADDA), \
            word_nm):
                return True
            # ALEF + NOON + three characters
            elif re.match(u"^ان...$", word_nm):
                return True
            # ALEF + one of three fixed two-letter sequences + two
            # characters
            elif re.match(u"^(ازد|اصط|اضط)..$", word_nm):
                return True
            # ALEF + a letter outside the listed set + TEH + two characters
            elif re.match(u"^ا[^صضطظد]ت..$", word_nm):
                return True
            # same pattern as the first test of this branch, so this
            # arm cannot be reached with a match
            elif re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF + any character carrying a SHADDA + two characters
            elif re.match(u"^ا.ّ..$", word_nm):
                return True
            # ALEF + three characters + final ALEF_MAKSURA
            elif re.match(u"^ا...ى$", word_nm):
                return True
            else:
                return False
        elif word_nm.startswith(TEH):
            return True
        else:
            return False
    # the shadda and the alef have specific positions in these patterns
    elif length == 6:
        # a 6-letter word not starting by ALEF was already rejected
        # above, so only the ALEF case can reach this point
        if not (word_nm.startswith(ALEF) or word_nm.startswith(TEH)):
            return False
        if VALID_INFINITIVE_VERB6_PATTERN.match(word_nm):
            return True
        # anything not matching the 6-letter pattern is rejected
        else:
            return False
    return True
def __init__(self, verb, transitive, future_type=FATHA):
    """
    Initialise the conjugator for one verb: normalize the verb, split it
    into letters and marks, prepare the conjugation stems and create the
    display object.

    @param verb: the given verb
    @type verb: unicode.
    @param transitive: the verb is transitive or not
    @type transitive: Boolean.
    @param future_type: The mark of the third radical letter in the verb,
    used for triliteral verbs only. Default value is Fatha;
    it is converted with C{ar_verb.get_future_type_by_name}.
    @type future_type: unicode; one arabic letter (Fatha, Damma, Kasra).
    """
    self.verb = verb
    # this cache is used to avoid duplicated operations in standardisation,
    # treat_sukun, and uniformate suffix
    # (cache_standard is a name from the enclosing module scope)
    self.cache_standard = cache_standard
    self.internal_verb = ar_verb.normalize(verb)
    self.future_type = ar_verb.get_future_type_by_name(future_type)
    (self.word_letters, self.word_marks) = ar_verb.uniformate_verb(verb)
    # the haraka before the last one in the past;
    # read before word_marks is rewritten on the next statement
    self.past_haraka = araby.secondlast_char(self.word_marks)
    self.word_marks = ar_verb.uniformate_alef_origin(self.word_marks,
     self.internal_verb, self.future_type)

    self.transitive = transitive
    self.hamza_zaida = False
    self.tab_conjug_stem = {}
    verb = self.verb
    # verb type label, indexed by the number of letters of the
    # unvocalized verb (indexes 0 to 2 are empty labels)
    tab_type = [u"", u"", u"", u"فعل ثلاثي", u"فعل رباعي",
     u"فعل خماسي", u"فعل سداسي", u"فعل سباعي",
      u"فعل ثماني", u"فعل تساعي"]
    verb = ar_verb.normalize(verb)

    self.unvocalized = araby.strip_harakat(verb)
    verb_nm = self.unvocalized
    self.vlength = len(verb_nm)
    # NOTE(review): raises IndexError when the unvocalized verb has more
    # than 9 characters -- confirm callers validate the verb first.
    self.vtype = tab_type[self.vlength]

    # the hamza is an added (augmentative) letter
    self.hamza_zaida = self._is_hamza_zaida(verb_nm)

    # the added teh
    #deprecated
    #self.teh_zaida=self.is_teh_zaida(verb_nm)

    # Handle the irregular verbs:
    # if the verb is irregular, its conjugation stems are taken from
    # its table, otherwise they are generated.
    # This applies to the future and the imperative only;
    # the past has no irregularity.
    self.past_stem = ""
    self._prepare_past_stem()
    self._prepare_passive_past_stem()
    if self._is_irregular_verb():
        self._prepare_irregular_future_imperative_stem()
    else:
        self._prepare_future_imperative_stem()

    # display object
    self.conj_display = conjugatedisplay.ConjugateDisplay(self.verb)
    if self.transitive :
        self.conj_display.add_attribut(u"اللزوم/التعدي", u"متعدي")
    else :
        self.conj_display.add_attribut(u"اللزوم/التعدي", u"لازم")
    self.conj_display.add_attribut(u"الفعل", self.verb)
    self.conj_display.add_attribut(u"نوع الفعل", self.vtype)
    # the future form is the future tense conjugated with PronounHuwa
    self.future_form = self.conjugate_tense_pronoun(vconst.TenseFuture,
     vconst.PronounHuwa)
    self.conj_display.set_future_form(self.future_form)
    if self.transitive :
        self.conj_display.settransitive()
    self.conj_display.setbab(self.future_type)
from pyarabic.araby import strip_tashkeel
from pyarabic.araby import strip_harakat

if __name__ == '__main__':
    # Print the same vocalized word after strip_harakat, then after
    # strip_tashkeel, one result per line.
    text = u"الْعَرَبِيّةُ"
    for strip in (strip_harakat, strip_tashkeel):
        print(strip(text))
def test_strip_harakat(self):
    """strip_harakat must return the expected form, which differs from the input."""
    vocalized = u"الْعَرَبِيَّةُ"
    expected = u'العربيّة'
    self.assertEqual(ar.strip_harakat(vocalized), expected)
    self.assertNotEqual(ar.strip_harakat(vocalized), vocalized)
from pyarabic.araby import strip_harakat

# Read the whole CSV as UTF-8 text, pass it through strip_harakat and
# write the result to nawar_stripped.txt (UTF-8, overwritten).
with open('./nawar-raw/nawar.csv', encoding='utf-8') as f, \
        open('nawar_stripped.txt', mode='w', encoding="utf-8") as fw:
    fw.write(strip_harakat(f.read()))
def get_all_place_pre(self):
    """
    Return the '@voweledform' values of the rows of ``self.voweled_df``
    whose '@type' equals '50', each passed through ``strip_harakat``.

    The values are computed directly from the selected column instead of
    being assigned back onto a filtered copy of the frame, which
    triggered pandas' SettingWithCopyWarning; ``self.voweled_df`` is not
    modified.

    Returns:
        list: the stripped forms, in row order.
    """
    is_place = self.voweled_df['@type'] == '50'
    voweled_forms = self.voweled_df.loc[is_place, '@voweledform']
    return voweled_forms.apply(strip_harakat).tolist()
def get_all_time_pre(self):
    """
    Return the '@voweledform' values of the rows whose '@type' is '49',
    with their harakat stripped.

    @return: stripped forms, in the row order of self.voweled_df.
    @rtype: list.
    """
    # Pick the column with a single .loc selection and transform that Series
    # directly. Assigning the stripped column back into a boolean-filtered
    # frame is chained assignment on a copy, which makes pandas emit
    # SettingWithCopyWarning without changing the returned values.
    is_time_pre = self.voweled_df['@type'] == '49'
    voweled_forms = self.voweled_df.loc[is_time_pre, '@voweledform']
    return voweled_forms.apply(strip_harakat).tolist()
u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print word.encode('utf8'), '\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel', araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode( 'utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print "vocalized_like", word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha",
def suggest_verb(verb):
    """
    Generate a list of valid infinitive verbs for an invalid infinitive form.

    The verb is first cleaned (harakat, teh marbuta and tanwin removed).
    If the cleaned form is valid it is the only suggestion; otherwise a few
    common-error corrections are tried according to the first letter and the
    length of the cleaned form, and only the candidates accepted by
    is_valid_infinitive_verb are returned.
    @param verb: given verb, of invalid infinitive form.
    @type verb: unicode.
    @return: a list of suggested infinitive verb forms
    @rtype: list of unicode.
    """
    def valid_only(candidates):
        """Keep, in order, the candidates that are valid infinitive verbs."""
        return [cand for cand in candidates if is_valid_infinitive_verb(cand)]

    # first strip harakat, shadda is not stripped
    verb = araby.strip_harakat(verb)
    # second strip all letters that are unacceptable in an infinitive form
    verb = re.sub(u"[%s%s%s%s]" % (TEH_MARBUTA, DAMMATAN, KASRATAN, FATHATAN),
                  '', verb)

    # The single-letter substitutions below use str.replace rather than
    # re.sub: the patterns are literal letters (assumes the letter constants
    # contain no regex metacharacters), and passing `count` positionally to
    # re.sub is deprecated since Python 3.13.

    # the cleaned verb is already valid: it is the only suggestion
    if is_valid_infinitive_verb(verb):
        return [verb]

    # the verb starts with ALEF_HAMZA_BELOW: replace it by a bare ALEF,
    # because it is a common spelling error
    if verb.startswith(ALEF_HAMZA_BELOW):
        return valid_only([verb.replace(ALEF_HAMZA_BELOW, ALEF)])

    # the verb starts with a bare ALEF: replace the first one by
    # ALEF_HAMZA_ABOVE + FATHA, because it is a common spelling error
    if verb.startswith(ALEF):
        return valid_only([verb.replace(ALEF, ALEF_HAMZA_ABOVE + FATHA, 1)])

    # two-letter verb: try to complete it with a third letter
    if len(verb) == 2:
        verb = verb.replace(ALEF, ALEF_HAMZA_ABOVE, 1)
        return valid_only([
            verb + SHADDA,             # shadda at the end
            verb + ALEF_MAKSURA,       # alef maksura at the end
            verb + ALEF,               # alef at the end
            verb[0] + ALEF + verb[1],  # alef in the middle
        ])

    # six letters or more: try ALEF followed by a five-letter window
    # NOTE(review): range(len(verb) - 6) is empty for exactly six letters,
    # so nothing is suggested in that case -- confirm the intended bound.
    if len(verb) >= 6:
        return valid_only(
            [ALEF + verb[i:i + 5] for i in range(len(verb) - 6)])

    # five letters: the previous loop iterated over range(len(verb) - 5),
    # i.e. range(0), so it never produced a suggestion.
    # ToDo: review this part
    if len(verb) == 5:
        return []

    # four letters: try to correct the position of the alef or the shadda
    if len(verb) == 4:
        candidates = []
        # swap the second and third letters when the third is an alef
        # or the second is a shadda
        if verb[2] == ALEF or verb[1] == SHADDA:
            candidates.append(verb[0] + verb[2] + verb[1] + verb[3])
        # move a trailing shadda back onto the second letter
        if verb.endswith(SHADDA):
            candidates.append(verb[0] + verb[1] + verb[3] + verb[2])
        return valid_only(candidates)

    # otherwise suggest conjugating another verb
    return [u"كتب"]
word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print(word, '\t', end=" ") if araby.is_vocalized(word): print(' is vocalized', end=" ") if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ") if araby.is_arabicword(word): print(' is valid word', end=" ") else: print("invalid arabic word", end=" ") print(' strip harakat', araby.strip_harakat(word), end=" ") print(' strip tashkeel', araby.strip_tashkeel(word), end=" ") print(' strip tatweel', araby.strip_tatweel(word), end=" ") print(' normalize ligature ', araby.normalize_ligature(word), end=" ") if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ") print() word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print("vocalized_like", end=" ") word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha",
def uniformate_verb(word):
    """
    Separate the harakat and the letters of the given word.

    It returns two strings: the letters of the word and its marks.
    An ALEF between the first and the last letter is not copied to the
    letters string: the mark that precedes it is replaced by
    vconst.ALEF_HARAKA instead. A final ALEF is written as
    vconst.ALEF_MAMDUDA or YEH.
    @param word: given word.
    @type word: unicode.
    @return: (letters, harakat).
    @rtype: tuple of unicode.
    """
    if word == "":
        return ("", "")
    # normalize ALEF MADDA
    if word.startswith(ALEF_MADDA):
        word = normalize_alef_madda(word)
    else:
        word = word.replace(ALEF_MADDA, HAMZA + ALEF)

    word_nm = araby.strip_harakat(word)
    # measured before any hamza normalization
    length = len(word_nm)
    if len(word_nm) != 3:
        # The hamza forms are used to guess the marks of a trilateral verb,
        # so only the other lengths are normalized here; the three-letter
        # case is normalized after its marks have been chosen.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)
    # the hamza forms are used to detect the vocalization of the verb;
    # they are unified later
    if length == 3:
        if word_nm[1]in (ALEF, ALEF_HAMZA_ABOVE) or \
         word_nm[2] in (ALEF_MAKSURA, ALEF_HAMZA_ABOVE, ALEF):
            marks = FATHA + FATHA + FATHA
        # NOTE(review): this test indexes `word` (harakat still present),
        # unlike the test above which indexes `word_nm` -- confirm intended.
        elif word[1] == YEH_HAMZA or word[2] in (YEH, YEH_HAMZA):
            marks = FATHA + KASRA + FATHA
        else:
            # read the second haraka from the vocalized word itself
            i = 0
            # ignore harakat at the beginning of the word
            while araby.is_shortharaka(word[i]):
                i += 1
            # the first letter
            if not araby.is_shortharaka(word[i]):
                i += 1
            # the first haraka
            while araby.is_shortharaka(word[i]):
                i += 1
            # the second letter
            if not araby.is_shortharaka(word[i]):
                i += 1
            # the second haraka
            if not araby.is_shortharaka(word[i]):
                # Problems were found when conjugating the doubled verb in
                # the past tense, so the second haraka is temporarily set
                # to fatha.
                #ToDo: review this case
                secondharaka = FATHA
            else:
                secondharaka = word[i]
            marks = u''.join([FATHA, secondharaka, FATHA])
        # The hamza forms were needed above to guess the marks of the
        # trilateral verb; normalize them now.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)

    elif length == 4:
        marks = vconst.UNIFORMATE_MARKS_4
    elif length == 5:
        if word_nm.startswith(TEH):
            marks = vconst.UNIFORMATE_MARKS_5TEH
        else:
            marks = vconst.UNIFORMATE_MARKS_5
    elif length == 6:
        marks = vconst.UNIFORMATE_MARKS_6
    else:
        marks = FATHA * len(word_nm)

    i = 1
    # the first letter and its mark are added automatically
    # NOTE(review): word_nm is empty when `word` holds only harakat, so
    # word_nm[0] would raise IndexError -- confirm callers never pass that.
    new_word = word_nm[0]
    new_harakat = marks[0]
    # between the first and the last letter
    while i < length - 1:
        if word_nm[i] == ALEF:
            # the alef is dropped from the letters and the previous mark
            # is replaced by the alef haraka
            new_harakat = new_harakat[:-1] + vconst.ALEF_HARAKA
        else:
            new_harakat += marks[i]
            new_word += word_nm[i]
        i += 1
    # the last letter
    # NOTE(review): for a one-letter word_nm, i is still 1 here and
    # word_nm[i] looks out of range -- confirm such input cannot occur.
    # A final alef becomes ALEF_MAMDUDA for a three-letter verb whose middle
    # letter is not YEH, and YEH in every other case.
    if word_nm[i] == ALEF:
        if len(word_nm) == 3 and word_nm[1] != YEH:
            new_word += vconst.ALEF_MAMDUDA
        else:
            new_word += YEH
    else:
        new_word += word_nm[i]
    new_harakat += marks[i]
    return (new_word, new_harakat)