Example #1
0
 def check_shadda(word_vocalised, resulted_data, fully_vocalized_input=False):
     """
     Keep only the dictionary results whose vocalized form matches the input.

     Each result is compared through its 'vocalized' attribute (read from the
     instance __dict__, '' when absent):
         - when fully_vocalized_input is true, the result is kept if both
           words are equal once araby.strip_harakat has been applied to each;
         - otherwise the result is kept if araby.shaddalike accepts the pair.
     @param word_vocalised: the input word.
     @type word_vocalised: unicode.
     @param resulted_data: the results found in the dictionary.
     @type resulted_data: list of objects carrying a 'vocalized' attribute.
     @param fully_vocalized_input: selects the strip_harakat comparison
     instead of the shaddalike one.
     @type fully_vocalized_input: Boolean, default is False.
     @return: the results that passed the comparison, in their input order.
     @rtype: list.
     """
     def _vocalized_of(entry):
         # entries expose their fields as plain instance attributes
         return entry.__dict__.get('vocalized', '')

     kept = []
     for entry in resulted_data:
         if fully_vocalized_input:
             accepted = (araby.strip_harakat(word_vocalised)
                         == araby.strip_harakat(_vocalized_of(entry)))
         else:
             accepted = araby.shaddalike(word_vocalised, _vocalized_of(entry))
         if accepted:
             kept.append(entry)
     return kept
    def normalizeText(self, text):
        """Return text after four araby clean-up passes applied in order:
        strip_tatweel, strip_tashkeel, strip_harakat, normalize_hamza.
        """
        without_tatweel = araby.strip_tatweel(text)
        without_marks = araby.strip_harakat(
            araby.strip_tashkeel(without_tatweel))
        return araby.normalize_hamza(without_marks)
Example #3
0
def normalize_alef_madda(word):
    """
    Convert a leading Alef madda into two letters.

    The harakat are stripped from the word first, so the returned value
    never carries harakat. A word that does not start with Alef madda is
    returned stripped but otherwise unchanged.
    @param word: given word.
    @type word: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # Strip before branching: the original only assigned word_nm inside the
    # Alef-madda branch, so the other path raised UnboundLocalError.
    word_nm = araby.strip_harakat(word)
    if not word.startswith(ALEF_MADDA):
        return word_nm
    # Three-letter forms may have a dedicated entry in the verb table;
    # only the first candidate of that entry is returned.
    if len(word_nm) == 3 and word_nm in vconst.ALEF_MADDA_VERB_TABLE:
        return vconst.ALEF_MADDA_VERB_TABLE[word_nm][0]
    # Every other length falls back to the plain two-letter expansion.
    return word_nm.replace(ALEF_MADDA, HAMZA + ALEF)
Example #4
0
def find_triliteral_verb(db_base_path, triliteralverb, givenharaka):
    """
    Find the triliteral verb in the dictionary,
    return a list of possible verb forms.

    The lookup runs against the SQLite file "data/verbdict.db" located
    under db_base_path, using the verb stripped of its harakat. A row whose
    vocalized verb and haraka both equal the given ones is placed first in
    the result so it is treated in priority.
    @param db_base_path: the database path
    @type db_base_path: path string.
    @param triliteralverb: given verb.
    @type triliteralverb: unicode.
    @param givenharaka: given haraka of future type of the verb.
    @type givenharaka: unicode.
    @return: list of dicts with the keys "verb", "haraka" and "transitive",
    or None when the database cannot be opened or queried.
    @rtype: list of dict, or None.
    """
    import os
    import sqlite3 as sqlite

    # The original left this assignment commented out, so db_path was an
    # undefined name and every call failed with NameError.
    db_path = os.path.join(db_base_path, "data/verbdict.db")
    verb_nm = araby.strip_harakat(triliteralverb)
    liste = []
    try:
        conn = sqlite.connect(db_path)
    except (IOError, sqlite.Error):
        # sqlite3 reports an unreadable database through sqlite3.Error,
        # which the original "except IOError" could not catch.
        return None
    try:
        cursor = conn.cursor()
        cursor.execute("""select verb_vocalised, haraka, transitive 
                    from verbdict
                    where verb_unvocalised = ?""", (verb_nm, ))
        for verb_vocalised, haraka, transitive_mark in cursor:
            # Transitivity mark stored in the table:
            #   MEEM is transitive,
            #   KAF is common (transitive and intransitive),
            #   LAM is intransitive.
            entry = {
                "verb": verb_vocalised,
                "haraka": haraka,
                "transitive": transitive_mark in (araby.KAF, araby.MEEM),
            }
            if triliteralverb == verb_vocalised and givenharaka == haraka:
                # exact match: put it at the top of the list
                liste.insert(0, entry)
            else:
                liste.append(entry)
        cursor.close()
    except (IOError, sqlite.Error):
        return None
    finally:
        # always release the connection, also on the error path
        conn.close()
    return liste
Example #5
0
def is_triliteral_verb(verb):
    """ Tell whether the verb counts three letters.

    The harakat are stripped and an Alef madda is counted as two letters
    (HAMZA + ALEF) before measuring the length.
    @param verb: given verb.
    @type verb: unicode.
    @return: True if the verb is triliteral.
    @rtype: Boolean.
    """
    letters = araby.strip_harakat(verb).replace(ALEF_MADDA, HAMZA+ALEF)
    return len(letters) == 3
Example #6
0
def is_triliteral_verb(verb):
    """ Check that the verb is made of exactly three letters.

    Length is measured after removing the harakat and after expanding
    every Alef madda into the two letters HAMZA + ALEF.
    @param verb: given verb.
    @type verb: unicode.
    @return: True if the verb is triliteral.
    @rtype: Boolean.
    """
    bare = araby.strip_harakat(verb)
    expanded = bare.replace(ALEF_MADDA, HAMZA+ALEF)
    return 3 == len(expanded)
Example #7
0
def create_index_triverbtable():
    """ Create an index from the verb dictionary
    to accelerate the search in the dictionary for verbs.

    Every key of triverbtable.TriVerbTable is filed in TRIVERBTABLE_INDEX
    under its verb with harakat stripped and hamza normalized.
    @return: nothing, TRIVERBTABLE_INDEX is filled in place.
    @rtype: None
    """
    # the key is the vocverb + the bab number
    for key, entry in triverbtable.TriVerbTable.items():
        unvverb = araby.strip_harakat(entry['verb'])
        normverb = araby.normalize_hamza(unvverb)
        # setdefault replaces the has_key() test, which no longer exists
        # on Python 3 dictionaries.
        TRIVERBTABLE_INDEX.setdefault(normverb, []).append(key)
Example #8
0
def create_index_triverbtable():
    """ Create an index from the verb dictionary
    to accelerate the search in the dictionary for verbs.

    TRIVERBTABLE_INDEX maps each verb, stripped of harakat and with hamza
    normalized, to the list of triverbtable.TriVerbTable keys sharing it.
    @return: nothing, TRIVERBTABLE_INDEX is filled in place.
    @rtype: None
    """
    # the key is the vocverb + the bab number
    for key in triverbtable.TriVerbTable:
        vocverb = triverbtable.TriVerbTable[key]['verb']
        normverb = araby.normalize_hamza(araby.strip_harakat(vocverb))
        # "in" works on Python 2 and 3, unlike dict.has_key().
        if normverb in TRIVERBTABLE_INDEX:
            TRIVERBTABLE_INDEX[normverb].append(key)
        else:
            TRIVERBTABLE_INDEX[normverb] = [key, ]
Example #9
0
    def test_strip(self):
        """Check each Araby strip_* helper against one known input/output."""
        vocalized = u"الْعَرَبِيّةُ"

        # strip_harakat: the expected value still carries the shadda
        without_harakat = Araby.strip_harakat(vocalized)
        assert without_harakat == u'العربيّة'

        # strip_lastharaka: only the final mark is expected to go
        without_last = Araby.strip_lastharaka(vocalized)
        assert without_last == u'الْعَرَبِيّة'

        # strip_tashkeel: the expected value carries no mark at all
        without_tashkeel = Araby.strip_tashkeel(vocalized)
        assert without_tashkeel == u'العربية'

        # strip_tatweel: the elongation characters are expected to go
        without_tatweel = Araby.strip_tatweel(u"العـــــربية")
        assert without_tatweel == u'العربية'

        # strip_shadda: both shadda marks are expected to go
        without_shadda = Araby.strip_shadda(u"الشّمسيّة")
        assert without_shadda == u'الشمسية'
Example #10
0
def find_alltriverb(triverb, givenharaka=araby.FATHA, vocalised_entree=False):
    """
    Find the triliteral verb in the dictionary (TriVerbTable)
    return a list of possible verb forms.

    Each item is an entry of triverbtable.TriVerbTable; the 'verb' and
    'haraka' fields of the entry are compared with the given values, and an
    entry matching both is placed at the head of the list.
    @param triverb: given verb.
    @type triverb: unicode.
    @param givenharaka: given haraka of future type of the verb,
    default(FATHA).
    @type givenharaka: unicode.
    @param vocalised_entree: True if the given verb is vocalized, in which
    case its harakat are stripped before the index lookup, default False.
    @type vocalised_entree: Boolean.
    @return: list of triliteral verbs, empty if the verb is not indexed.
    @rtype: list of dicts.
    """
    liste = []

    if vocalised_entree:
        verb_nm = araby.strip_harakat(triverb)
    else:
        verb_nm = triverb

    normalized = araby.normalize_hamza(verb_nm)
    # "in" replaces dict.has_key(), which was removed in Python 3.
    if normalized not in TRIVERBTABLE_INDEX:
        print("triverb has no verb")
        return liste

    for verb_voc_id in TRIVERBTABLE_INDEX[normalized]:
        entry = triverbtable.TriVerbTable[verb_voc_id]
        if triverb == entry['verb'] and givenharaka == entry['haraka']:
            # exact match on verb and haraka goes first
            liste.insert(0, entry)
        else:
            liste.append(entry)
    return liste
Example #11
0
def find_alltriverb(triverb, givenharaka = araby.FATHA, 
vocalised_entree = False):
    """
    Find the triliteral verb in the dictionary (TriVerbTable)
    return a list of possible verb forms.

    Each item is an entry of triverbtable.TriVerbTable; the 'verb' and
    'haraka' fields of the entry are compared with the given values, and an
    entry matching both is placed at the head of the list.
    @param triverb: given verb.
    @type triverb: unicode.
    @param givenharaka: given haraka of future type of the verb,
    default(FATHA).
    @type givenharaka: unicode.
    @param vocalised_entree: True if the given verb is vocalized, in which
    case its harakat are stripped before the index lookup, default False.
    @type vocalised_entree: Boolean.
    @return: list of triliteral verbs, empty if the verb is not indexed.
    @rtype: list of dicts.
    """
    liste = []

    if vocalised_entree:
        verb_nm = araby.strip_harakat(triverb)
    else:
        verb_nm = triverb

    normalized = araby.normalize_hamza(verb_nm)
    # "in" replaces dict.has_key(), which only exists on Python 2.
    if normalized in TRIVERBTABLE_INDEX:
        for verb_voc_id in TRIVERBTABLE_INDEX[normalized]:
            entry = triverbtable.TriVerbTable[verb_voc_id]
            if triverb == entry['verb'] and givenharaka == entry['haraka']:
                # exact match on verb and haraka goes first
                liste.insert(0, entry)
            else:
                liste.append(entry)
    else:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print("triverb has no verb")
    return liste
def preprocess(sentences, stopwords, isStopword = False):
  """
    Normalize and tokenize a sequence of complete Arabic sentences.

    Each sentence goes through the following steps, in this order:
        1.) strip harakat
        2.) strip tashkeel
        3.) strip lastharaka
        4.) strip tatweel
        5.) strip shadda
        6.) normalize lam alef ligatures
        7.) normalize hamza
        8.) clean_str
        9.) keep the leading run accepted by the filter pattern
        10.) tokenize, then remove the stopwords unless isStopword is true

    A sentence whose cleaned text does not match the filter pattern, or whose
    tokenization fails, is skipped and contributes no row.

    Returns a 2D matrix (list of lists), where each row holds the normalized
    tokens of one sentence.
  """
  output = []
  for sentence in sentences:
    text = araby.strip_harakat(sentence)
    text = araby.strip_tashkeel(text)
    text = araby.strip_lastharaka(text)
    text = araby.strip_tatweel(text)
    text = araby.strip_shadda(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_hamza(text)
    text = clean_str(text)
    try:
      # NOTE(review): in this raw string every "\\" is a literal backslash,
      # so the class excludes the characters \ n s p { L a t i } rather than
      # newlines, whitespace and Latin script (the re module has no \p{...}).
      # Left unchanged because callers may rely on the current output.
      matched = re.match(r'[^\\n\\s\\p{Latin}]+', text)
      if matched is None:
        # nothing usable at the start of the text: skip this sentence
        continue
      tokens = araby.tokenize(matched.group())
      if not isStopword:
        tokens = remove_stopwords(stopwords, tokens)
      output.append([t for t in tokens if t != '\n'])
    except Exception:
      # Best effort: a sentence that cannot be tokenized is dropped.
      # Unlike a bare "except:", KeyboardInterrupt and SystemExit still
      # propagate.
      continue

  return output
Example #13
0
    def display_rows(self, listtense):
        """
        Render the conjugation result for a list of tenses as text rows.

        One row is produced for every (pronoun, tense) pair whose conjugation
        is not the empty string; pronouns are iterated first, tenses second.
        The tab-separated columns of a row are:
            - conjugation with harakat stripped,
            - conjugation as stored,
            - pronoun label,
            - tense label,
            - transitive flag ("1" or "0"),
            - original verb
            - tasrif bab

        @param listtense: the given tenses list to display result
        @type listtense: list of unicode
        @return: the rows, each terminated by a newline.
        @rtype: unicode.
        """
        transitive = '1' if self.transitive else "0"
        text = u""
        for pronoun in vconst.PronounsTable:
            for tense in listtense:
                if self.tab_conjug[tense][pronoun] == "":
                    # no conjugation for this pair: no row
                    continue
                columns = [
                    araby.strip_harakat(self.tab_conjug[tense][pronoun]),
                    self.tab_conjug[tense][pronoun],
                    TAB_DISPLAY[pronoun],
                    TAB_DISPLAY[tense],
                    transitive,
                    self.verb,
                    self.bab,
                ]
                text += "\t".join(columns)
                text += u"\n"
        return text
Example #14
0
    def display_rows(self, listtense):
        """
        Build the conjugation table for the given tenses as tab-separated text.

        For each pronoun of vconst.PronounsTable and each requested tense, a
        row is emitted when the stored conjugation is not empty. Columns:
            - conjugation with harakat stripped,
            - conjugation as stored,
            - pronoun label,
            - tense label,
            - transitive flag ("1" or "0"),
            - original verb
            - tasrif bab

        @param listtense: the given tenses list to display result
        @type listtense: list of unicode
        @return: the rows, each terminated by a newline.
        @rtype: unicode.
        """
        if self.transitive:
            transitive = '1'
        else:
            transitive = "0"

        text = u""
        for pronoun in vconst.PronounsTable:
            for tense in listtense:
                if self.tab_conjug[tense][pronoun] != "":
                    row = "\t".join((
                        araby.strip_harakat(self.tab_conjug[tense][pronoun]),
                        self.tab_conjug[tense][pronoun],
                        TAB_DISPLAY[pronoun],
                        TAB_DISPLAY[tense],
                        transitive,
                        self.verb,
                        self.bab,
                    ))
                    text = text + row
                    text = text + u"\n"
        return text
Example #15
0
def  normalize_alef_madda(word):
    """
    Convert a leading Alef madda into two letters.

    The harakat are stripped from the word first, so the returned value
    never carries harakat. A word that does not start with Alef madda is
    returned stripped but otherwise unchanged.
    @param word: given word.
    @type word: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # Strip before branching: the original only assigned word_nm inside the
    # Alef-madda branch, so the other path raised UnboundLocalError.
    word_nm = araby.strip_harakat(word)
    if not word.startswith(ALEF_MADDA):
        return word_nm
    # Three-letter forms may have a dedicated entry in the verb table; only
    # the first candidate is returned. "in" replaces dict.has_key(), which
    # was removed in Python 3.
    if len(word_nm) == 3 and word_nm in vconst.ALEF_MADDA_VERB_TABLE:
        return vconst.ALEF_MADDA_VERB_TABLE[word_nm][0]
    # Every other length falls back to the plain two-letter expansion.
    return word_nm.replace(ALEF_MADDA, HAMZA+ALEF)
Example #16
0
 def Strip_tashkeel(self):
     """Return self.text passed through strip_harakat.

     NOTE(review): the method is named after tashkeel but delegates to
     strip_harakat - confirm this is intended.
     """
     stripped = strip_harakat(self.text)
     return stripped
Example #17
0
def suggest_verb(verb):
    """
    Generate a list of valid infinitive verb for an invalid infinitive form.

    The verb is first cleaned (harakat and the letters TEH_MARBUTA,
    DAMMATAN, KASRATAN, FATHATAN removed). Then exactly one branch of the
    chain below is taken, tested in this order: cleaned verb already valid,
    starts with ALEF_HAMZA_BELOW, starts with ALEF, length 2, length >= 6,
    length 5, length 4, anything else. Every candidate is checked with
    is_valid_infinitive_verb before being suggested.
    @param verb: given verb, of invalid infinitive form.
    @type verb: unicode.
    @return: a list of suggested infinitive verb forms, possibly empty.
    @rtype: list of unicode.
    """
    # the verb is invalid
    list_suggest = []
    # first strip harakat; per the original note the shadda is not stripped
    verb = araby.strip_harakat(verb)
    # second strip all unacceptable letters in an infinitive form
    verb = re.sub(u"[%s%s%s%s]"%( TEH_MARBUTA, DAMMATAN, KASRATAN, FATHATAN), \
     '', verb)
    # test the resulted verb if it's valid, if ok,
    # add it to the suggestion list.
    if is_valid_infinitive_verb(verb):
        list_suggest.append(verb)
        return list_suggest
    # if the verb starts by ALEF_HAMZA_BELOW like إستعمل,
    # replace it by an ALEF, because it's a common error.
    # if the result is valid add it to the suggestions list
    # NOTE(review): re.sub is called without a count here, so every
    # ALEF_HAMZA_BELOW in the verb is replaced, not only the leading one.
    elif verb.startswith(ALEF_HAMZA_BELOW):
        verb = re.sub(ALEF_HAMZA_BELOW, ALEF, verb)
        if is_valid_infinitive_verb(verb):
            list_suggest.append(verb)
            return list_suggest
    # if the verb starts by ALEF like اضرب,
    # replace it by an ALEF_HAMZA_ABOVE, because it's a common error.
    # if the result is valid add it to the suggestions list
    elif verb.startswith(ALEF):
        verb_one = re.sub(ALEF, ALEF_HAMZA_ABOVE+FATHA, verb, 1)
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
            return list_suggest
    # if the verb is 2 letters length,
    # suggest to add the third letter as :
    # Shadda, Alef, Alef Maksura at the end, or Alef in the middle
    # if the result is valid add it to the suggestions list
    # (this branch is only reached when the verb starts with neither
    # ALEF_HAMZA_BELOW nor ALEF, because of the two tests above)
    elif len(verb) == 2:
        verb = re.sub(ALEF, ALEF_HAMZA_ABOVE, verb, 1)
        #suggest to add the third letter as : Shadda at the end
        verb_one = verb+SHADDA
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
        #suggest to add the third letter as : Alef Maksura
        verb_one = verb+ALEF_MAKSURA
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
        #suggest to add the third letter as :Alef at the end
        verb_one = verb+ALEF
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
        #suggest to add the third letter as :Alef in middle
        verb_one = verb[0]+ALEF+verb[1]
        if is_valid_infinitive_verb(verb_one):
            list_suggest.append(verb_one)
        return list_suggest
    elif len(verb) >= 6:
        # if the verb is more than 6 letters length,
        # suggest ALEF followed by each 5-letter window of the verb
        # if the result is valid add it to the suggestions list
        # (for a verb of exactly 6 letters the range is empty)
        for i in range(len(verb)-6):
            verb_one = ALEF+verb[i:i+5]
            if is_valid_infinitive_verb(verb_one):
                list_suggest.append(verb_one)
    elif len(verb) == 5:
        # if the verb is 5 letters length, suggest
        # if the result is valid add it to the suggestions list
        # ToDo: review this part
        # NOTE(review): len(verb)-5 is always 0 in this branch, so the loop
        # body never runs and nothing is suggested for 5-letter verbs.
        for i in range(len(verb)-5):
            verb_one = ALEF+verb[i:i+4]
            if is_valid_infinitive_verb(verb_one):
                list_suggest.append(verb_one)
    elif len(verb) == 4:

        # if the verb is 4 letters length,
        # suggest to swap the second and third characters when the third is
        # an ALEF or the second is a SHADDA
        # if the result is valid add it to the suggestions list
        # فعال = > فاعل
        # فّعل = > فعّل
        if verb[2] == ALEF or verb[1] == SHADDA:
            verb_one = verb[0]+verb[2]+verb[1]+verb[3]
            if is_valid_infinitive_verb(verb_one):
                list_suggest.append(verb_one)
        if verb.endswith(SHADDA):
            # if the verb is 4 letters length and ends with SHADDA,
            # suggest to swap the last two characters
            # if the result is valid add it to the suggestions list
            # فعلّ = > فعّل
            verb_one = verb[0]+verb[1]+verb[3]+verb[2]
            if is_valid_infinitive_verb(verb_one):
                list_suggest.append(verb_one)
        return list_suggest
    else:
        # else suggest to conjugate another verb
        list_suggest.append(u"كتب")
        return list_suggest
    # reached by the branches above that do not return on their own
    return list_suggest
Example #18
0
 def removeHarakat(self, t):
     """Return t passed through strip_harakat."""
     without_harakat = strip_harakat(t)
     return without_harakat
Example #19
0
def uniformate_verb(word):
    """
    Separate the harakat and the letters of the given word,
    it return two strings ( the word without harakat and the harakat).
    An ALEF in the middle of the word is represented as a long haraka
    (vconst.ALEF_HARAKA) and stripped from the letters.

    The marks are read from the word itself only for some 3-letter words;
    for the lengths 4, 5 and 6 they come from the vconst.UNIFORMATE_MARKS_*
    constants, and for any other length they are FATHA repeated.
    @param word: given word.
    @type word: unicode.
    @return: (letters, harakat).
    @rtype: tuple of unicode.
    """
    if word == "":
        return ("", "")
    #normalize ALEF MADDA
    if word.startswith(ALEF_MADDA):
        word = word.replace(ALEF_MADDA, HAMZA+HAMZA)
    else:
        word = word.replace(ALEF_MADDA, HAMZA+ALEF)

    word_nm = araby.strip_harakat(word)
    # length is taken before any hamza normalization and drives every
    # length test below
    length = len(word_nm)
    if len(word_nm) != 3:
        # Hamzas are used to guess the harakat of the triliteral verb,
        # so for that length they are normalized later, after the marks
        # have been detected; every other length is normalized right here.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)

    # Hamzas are used to detect the vocalization of the verb;
    # they are unified afterwards
    if length == 3:
        if word_nm[1]in (ALEF, ALEF_HAMZA_ABOVE) or \
         word_nm[2] in (ALEF_MAKSURA, ALEF_HAMZA_ABOVE, ALEF):
            marks = FATHA+FATHA+FATHA
        # NOTE(review): this test indexes word (harakat still present)
        # while the test above indexes word_nm - confirm this is intended.
        elif word[1] == YEH_HAMZA or word[2] in (YEH, YEH_HAMZA):
            marks = FATHA+KASRA+FATHA
        else:
            # take the haraka from the verb itself
            i = 0
            # ignore harakat at the beginning of the word
            while araby.is_shortharaka(word[i]):
                i += 1
            # the first letter
            if not araby.is_shortharaka(word[i]):
                i += 1
            # the first haraka
            while araby.is_shortharaka(word[i]):
                i += 1
            # the second letter
            if not araby.is_shortharaka(word[i]):
                i += 1
            # the second haraka
            if not araby.is_shortharaka(word[i]):
                # We found problems conjugating the doubled verb in the
                # past tense, so the second haraka is temporarily set
                # to FATHA.
                #ToDo: review this case
                secondharaka = FATHA
            else:
                secondharaka = word[i]
            marks = u''.join([FATHA, secondharaka, FATHA])
        # Hamzas were needed above to guess the harakat of the triliteral
        # verb; normalize them now that the marks are known.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)

    elif length == 4:
        marks = vconst.UNIFORMATE_MARKS_4
    elif length == 5:
        if word_nm.startswith(TEH):
            marks = vconst.UNIFORMATE_MARKS_5TEH
        else :
            marks = vconst.UNIFORMATE_MARKS_5
    elif length == 6:
        marks = vconst.UNIFORMATE_MARKS_6
    else:
        marks = FATHA*len(word_nm)

    i = 1
    # the first letter and its mark are added automatically
    new_word = word_nm[0]
    new_harakat = marks[0]
    # between the first and the last
    while i < length-1:
        if word_nm[i] == ALEF:
            # the ALEF is dropped from the letters and the previous mark is
            # replaced by the long haraka
            new_harakat = new_harakat[:-1]+vconst.ALEF_HARAKA
        else:
            new_harakat += marks[i]
            new_word += word_nm[i]
        i += 1
    # the last letter
    # Case of the verbs عيا، أعيا، عيّا, where the final ALEF becomes YEH
    # and not the value used for the other 3-letter verbs.
    if word_nm[i] == ALEF:
        if len(word_nm) == 3 and word_nm[1] != YEH:
            new_word += vconst.ALEF_MAMDUDA
        else:
            new_word += YEH
    else:
        new_word += word_nm[i]
    new_harakat += marks[i]
    return (new_word, new_harakat)
Example #20
0
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
]
# Demo script (Python 2 print statements): for every sample word, print the
# word followed by the araby predicates it satisfies and the result of the
# araby strip/normalize helpers, all on one output line.
# NOTE(review): word_list is assigned below the loop in this listing;
# presumably the listing is out of order - confirm against the original file.
# word1 holds the word of the previous iteration
word1=u""
for word in word_list:
    # the trailing commas keep the following output on the same line
    print word.encode('utf8'),'\t',
    if araby.is_vocalized(word): print ' is vocalized',
##    if araby.isArabicstring(word): print ' iisArabicstring',
##    else:print ' invalid arabicstring',
    if araby.is_vocalizedtext(word): print ' is vocalized text',
    if araby.is_arabicword(word): print ' is valid word',
    else: print "invalid arabic word",
    print ' strip harakat', araby.strip_harakat(word).encode('utf8'),
    print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'),
    print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'),
    print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'),
    # compare the current word with the previous one
    if araby.vocalizedlike(word, word1): print "vocalized_like",
    # end the output line
    print;
    word1=word;
# one more comparison, between an unvocalized and a partly vocalized form
if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like",
# sample data: vocalized word, unvocalized word, two-word texts, Latin word
word=u"الْعَرَيِيّةُ"
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
]
Example #21
0
def is_valid_infinitive_verb(word, vocalized = True):
    """
    Determine if the given word is a valid infinitive form of an arabic verb.
    A word is not valid  infinitive if
        - length < 3 letters.
        - starts with : ALEF_MAKSURA, WAW_HAMZA, YEH_HAMZA, HARAKAT
        - contains TEH_MARBUTA, Tanwin
        - contains non arabic letters.
        - contains ALEF_MAKSURA not in the end.
        - contains double haraka : a warning

    The checks run as one if/elif chain: general rejections first, then a
    pattern (wazn) check that depends on the length of the word once harakat
    are stripped and ALEF_MADDA is expanded to two letters.
    @param word: given word.
    @type word: unicode.
    @param vocalized: if the given word is vocalized; when true the harakat
    are stripped before the checks.
    @type vocalized: Boolean, default(True).
    @return: True if the word is a valid infinitive form of verb.
    @rtype: Boolean.
    """
    # test if the word is an arabic valid word,
    if not  araby.is_arabicword(word):
        return False
    if vocalized :
        word_nm  =  araby.strip_harakat(word)
    else:
        word_nm = word
    # the alef_madda is  considered as 2 letters

    word_nm = word_nm.replace(ALEF_MADDA, HAMZA+ALEF)
    length = len(word_nm)

    # length with shadda must be between 3 and 6
    if length < 3  or length >= 7:
        return False
    # a 3 length verb can't start by Alef or Shadda,
    # and the second letter can't be shadda
    elif length == 3 and (word_nm[0] == ALEF or word_nm[0] == SHADDA \
    or word_nm[1] == SHADDA):
        return False

    # a 5 length verb must start by ALEF or TEH
    elif length == 5 and word_nm[0] not in (TEH, ALEF):
        return False
    # a 6 length verb must start by ALEF
    elif length == 6 and word_nm[0] !=  ALEF:
        return False

    # contains some invalid letters in verb
    # (searched in the given word, not in the stripped word_nm)
    elif re.search(u"[%s%s%s%s%s]"%(ALEF_HAMZA_BELOW, TEH_MARBUTA, 
    DAMMATAN, KASRATAN, FATHATAN), word):
        return False
    # contains some SHADDA sequence letters in verb
    # Like shadda shadda, shadda on alef, start
    # by shadda, shadda on alef_ maksura,
    # ALEF followed by (ALEF, ALEF_MAKSURA)
    # ALEF followed by a letter and ALEF
    # end with ALEF followed by (YEH, ALEF_MAKSURA)
    # first letter is alef followed by two letters and shadda
    elif re.search(u"([%s%s%s]%s|^%s|^%s..%s|^.%s|%s.%s|%s%s|%s[%s%s]$)"%(
    ALEF, ALEF_MAKSURA, SHADDA, SHADDA, SHADDA, ALEF, SHADDA, SHADDA, 
    ALEF, ALEF, ALEF, ALEF, ALEF, ALEF_MAKSURA, YEH), word_nm):
        return False


    # Invalid root form some letters :
    # initial YEH followed by one of
    # (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    #  TAH, ZAH, GHAIN, KAF, HEH, YEH)
    elif re.search(u"^%s[%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s]"%(
    YEH, THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, 
    TAH, ZAH, GHAIN, KAF, HEH, YEH), word_nm):
        return False


    # TEH After (DAL, THAL, TAH, ZAH, DAD)
    elif re.search(u"[%s%s%s%s%s]%s"%(DAL, THAL, DAD, TAH, ZAH, TEH), word_nm):
        return False
    # Contains invalid root sequence in arabic, near in phonetic
    # like BEH and FEH, LAM And REH
    elif re.search(u"%s%s|%s%s|%s%s|%s%s|%s%s|%s%s|%s%s"%(
    LAM, REH, REH, LAM, FEH, BEH, BEH, FEH, NOON,
     LAM, HEH, HAH, HAH, HEH), word_nm):
        return False


    # in non 5 letters verbs :initial TEH followed by
    # (TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH)
    elif length !=  5 and word_nm.startswith(TEH) and word_nm[1] in (
    TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH):
        return False
    # if word start by the same letter doubled
    # NOTE(review): the TEH exemption is tested on word[0] (given word)
    # while the doubling is tested on word_nm - confirm this is intended.
    elif word_nm[0] == word_nm[1] and word[0] !=  TEH:
        return False

    #verify the wazn of the verb
    elif length == 3:
        # Accepted patterns are: فعل، فعّ
        # Rejected patterns are: اعل، فّل
        if re.match("^[^%s][^%s].$"%(ALEF, SHADDA), word_nm):
            return True
        else: return False
    elif length == 4:
        # Accepted patterns: 1- أفعل، 2- فاعل، 3- فعّل، 4- فعلل
        # Rejected patterns:
        #   افعل: the hamza must be written on the alef
        #   فّعل، فعلّ: the shadda has a specific position
        #   فعال، فعلا: the alef has a specific position
        if re.match(\
        "^([%s%s][^%s]{2}.|[^%s%s]%s[^%s%s].|[^%s%s]{2}%s[^%s]|[^%s%s]{4})$"\
        %(ALEF_HAMZA_ABOVE, HAMZA, SHADDA, ALEF, SHADDA, ALEF, ALEF, SHADDA,
         ALEF, SHADDA, SHADDA, SHADDA, ALEF, SHADDA), word_nm):

            return True
        else: return False
    elif length == 5:

        if  word_nm.startswith(ALEF):
            # ALEF, three letters, then a final shadda
            if re.match(u"^ا...ّ$", word_nm):
                return True
            # case of اتخذ or اذّكر or اطّلع
            if re.match(u"^%s[%s%s%s]%s..$"%(ALEF, TEH, THAL, TAH, SHADDA), \
             word_nm):
                return True

            # pattern انفعل
            elif re.match(u"^ان...$", word_nm):
                return True
            # pattern افتعل
            elif re.match(u"^(ازد|اصط|اضط)..$", word_nm):
                return True
            elif re.match(u"^ا[^صضطظد]ت..$", word_nm):
                return True
            # same pattern as the first test of this branch, which has
            # already returned for it, so this test cannot succeed here
            elif re.match(u"^ا...ّ$", word_nm):
                return True
            # case of اتخذ or اذّكر or اطّلع, with any second letter
            elif re.match(u"^ا.ّ..$", word_nm):
                return True
            elif re.match(u"^ا...ى$", word_nm):
                return True
            else: return False
        elif word_nm.startswith(TEH):
            return True
        else:
            return False

    elif length == 6:
        # a 6 length word not starting with ALEF has already been rejected
        # earlier in the chain, so only the ALEF case reaches this test
        if not (word_nm.startswith(ALEF) or word_nm.startswith(TEH)):
            return False
        # The shadda and the alef have specific positions in the accepted
        # patterns; the pattern itself is defined outside this function.
        if VALID_INFINITIVE_VERB6_PATTERN.match(word_nm):
            return True
        else: return False
    # every length from 3 to 6 returns inside its own branch above, so this
    # line is not expected to be reached
    return True
    if araby.is_weak(c): print ('weak'),
    if araby.is_moon(c): print ('moon'),
    if araby.is_sun(c):print ('sun'),
    print (araby.order(c)),
    print ();
# Sample data: vocalized word, unvocalized word, multi-word texts and a
# Latin word.
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
    u"سئل لأنه يؤم الإمام",
]

# For every sample, print the araby predicates it satisfies and the result
# of the araby strip/normalize helpers; word1 is the previous sample.
word1 = u""
for word in word_list:
    print(word)
    if araby.is_vocalized(word):
        print(' is vocalized')
    if araby.is_vocalizedtext(word):
        print(' is vocalized text')
    if araby.is_arabicword(word):
        print(' is valid word')
    else:
        print("invalid arabic word")
    print(' strip harakat', araby.strip_harakat(word))
    print(' strip tashkeel', araby.strip_tashkeel(word))
    print(' strip tatweel', araby.strip_tatweel(word))
    print(' normalize ligature ', araby.normalize_ligature(word))
    print(' normalize hamza', araby.normalize_hamza(word))
    if araby.vocalizedlike(word, word1):
        print("vocalized_like")
    word1 = word

# One more comparison, between an unvocalized and a partly vocalized form.
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like")

Example #23
0
def is_valid_infinitive_verb(word, vocalized=True):
    """
    Determine if the given word is a valid infinitive form of an arabic verb.

    The word is rejected (False is returned) when any of these holds:
        - araby.is_arabicword(word) is false;
        - its length is below 3 or above 6, measured after the harakat are
          stripped (when vocalized is True) and after every ALEF_MADDA is
          expanded to HAMZA + ALEF;
        - the original word contains ALEF_HAMZA_BELOW, TEH_MARBUTA or a
          tanwin mark (DAMMATAN, KASRATAN, FATHATAN);
        - SHADDA or ALEF appear in one of the forbidden positions;
        - it contains one of the rejected letter sequences;
        - it does not match an accepted pattern (wazn) for its length.
    @param word: given word.
    @type word: unicode.
    @param vocalized: if True, the harakat are stripped from the word
        before the letter-based checks.
    @type vocalized: Boolean, default(True).
    @return: True if the word is a valid infinitive form of verb.
    @rtype: Boolean.
    """
    # test if the word is an arabic valid word,
    if not araby.is_arabicword(word):
        return False
    if vocalized:
        word_nm = araby.strip_harakat(word)
    else:
        word_nm = word
    # the alef_madda is considered as 2 letters: HAMZA followed by ALEF

    word_nm = word_nm.replace(ALEF_MADDA, HAMZA + ALEF)
    length = len(word_nm)

    # length (shadda included) must be between 3 and 6
    if length < 3 or length >= 7:
        return False
    # a 3 length verb can't start by Alef or Shadda,
    # and the second letter can't be shadda
    elif length == 3 and (word_nm[0] == ALEF or word_nm[0] == SHADDA \
    or word_nm[1] == SHADDA):
        return False

    # a 5 length verb must start by ALEF or TEH
    elif length == 5 and word_nm[0] not in (TEH, ALEF):
        return False
    # a 6 length verb must start by ALEF
    elif length == 6 and word_nm[0] != ALEF:
        return False

    # contains some invalid letters in verb; note that this search runs on
    # the original word, not on the stripped word_nm
    elif re.search(
            u"[%s%s%s%s%s]" %
        (ALEF_HAMZA_BELOW, TEH_MARBUTA, DAMMATAN, KASRATAN, FATHATAN), word):
        return False
    # contains some forbidden SHADDA / ALEF sequences; the alternatives are:
    #   - SHADDA right after ALEF, ALEF_MAKSURA or another SHADDA
    #   - SHADDA as the first character
    #   - initial ALEF, two characters, then SHADDA
    #   - SHADDA as the second character
    #   - ALEF, any character, ALEF
    #   - ALEF followed by ALEF
    #   - final ALEF_MAKSURA or YEH preceded by ALEF
    elif re.search(
            u"([%s%s%s]%s|^%s|^%s..%s|^.%s|%s.%s|%s%s|%s[%s%s]$)" %
        (ALEF, ALEF_MAKSURA, SHADDA, SHADDA, SHADDA, ALEF, SHADDA, SHADDA,
         ALEF, ALEF, ALEF, ALEF, ALEF, ALEF_MAKSURA, YEH), word_nm):
        return False

    # Invalid root form for some letters:
    # initial YEH followed by one of
    # (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD,
    #  TAH, ZAH, GHAIN, KAF, HEH, YEH)
    elif re.search(
            u"^%s[%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s]" %
        (YEH, THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH,
         GHAIN, KAF, HEH, YEH), word_nm):
        return False

    # TEH After (DAL, THAL, TAH, ZAH, DAD)
    elif re.search(u"[%s%s%s%s%s]%s" % (DAL, THAL, DAD, TAH, ZAH, TEH),
                   word_nm):
        return False
    # Contains invalid root sequence in arabic, near in phonetic:
    # LAM+REH, REH+LAM, FEH+BEH, BEH+FEH, NOON+LAM, HEH+HAH, HAH+HEH
    elif re.search(
            u"%s%s|%s%s|%s%s|%s%s|%s%s|%s%s|%s%s" %
        (LAM, REH, REH, LAM, FEH, BEH, BEH, FEH, NOON, LAM, HEH, HAH, HAH,
         HEH), word_nm):
        return False

    # in non 5 letters verbs: initial TEH followed by one of
    # (TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH)
    elif length != 5 and word_nm.startswith(TEH) and word_nm[1] in (
            TEH, THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH):
        return False
    # if word start by the same letter doubled (TEH excepted)
    # NOTE(review): the exception is tested on word[0], not word_nm[0];
    # presumably equivalent for normal input -- confirm.
    elif word_nm[0] == word_nm[1] and word[0] != TEH:
        return False

    # verify the wazn (morphological pattern) of the verb
    elif length == 3:
        # accepted: first character not ALEF, second character not SHADDA
        if re.match("^[^%s][^%s].$" % (ALEF, SHADDA), word_nm):
            return True
        # rejected: patterns starting with ALEF or carrying SHADDA in the
        # second position
        else:
            return False
    elif length == 4:
        # Four accepted shapes, one per regex alternative:
        # 1- initial ALEF_HAMZA_ABOVE or HAMZA, two non-SHADDA characters,
        #    then any character
        # 2- ALEF in second position
        # 3- SHADDA in third position
        # 4- four characters, none of them ALEF or SHADDA
        if re.match(\
        "^([%s%s][^%s]{2}.|[^%s%s]%s[^%s%s].|[^%s%s]{2}%s[^%s]|[^%s%s]{4})$"\
        %(ALEF_HAMZA_ABOVE, HAMZA, SHADDA, ALEF, SHADDA, ALEF, ALEF, SHADDA,
         ALEF, SHADDA, SHADDA, SHADDA, ALEF, SHADDA), word_nm):

            return True
        # rejected shapes:
        #   - initial plain ALEF: the hamza must be written explicitly
        #   - SHADDA outside its dedicated position
        #   - ALEF outside its dedicated position
        else:
            return False
    elif length == 5:

        if word_nm.startswith(ALEF):
            # ALEF, three characters, then a final SHADDA
            if re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, then TEH/THAL/TAH carrying a SHADDA, then two characters
            if re.match(u"^%s[%s%s%s]%s..$"%(ALEF, TEH, THAL, TAH, SHADDA), \
             word_nm):
                return True

            # "infa'ala" pattern: ALEF + NOON + three characters
            elif re.match(u"^ان...$", word_nm):
                return True
            # "ifta'ala" pattern with an assimilated TEH
            elif re.match(u"^(ازد|اصط|اضط)..$", word_nm):
                return True
            # "ifta'ala" pattern: TEH in third position
            elif re.match(u"^ا[^صضطظد]ت..$", word_nm):
                return True
            # NOTE(review): same regex as the first test of this branch,
            # so this test cannot add anything.
            elif re.match(u"^ا...ّ$", word_nm):
                return True
            # ALEF, any character carrying a SHADDA, then two characters
            # (a superset of the TEH/THAL/TAH test above)
            elif re.match(u"^ا.ّ..$", word_nm):
                return True
            # ALEF, three characters, then a final ALEF_MAKSURA
            elif re.match(u"^ا...ى$", word_nm):
                return True
            else:
                return False
        # any 5 length word starting with TEH that survived the checks
        # above is accepted
        elif word_nm.startswith(TEH):
            return True
        else:
            return False

    # 6 length verbs: the shape is delegated to
    # VALID_INFINITIVE_VERB6_PATTERN
    elif length == 6:
        # NOTE(review): words not starting with ALEF were already rejected
        # earlier in this chain, so the TEH alternative looks unreachable.
        if not (word_nm.startswith(ALEF) or word_nm.startswith(TEH)):
            return False
        if VALID_INFINITIVE_VERB6_PATTERN.match(word_nm):
            return True
        else:
            return False
    # NOTE(review): appears unreachable -- every length from 3 to 6 returns
    # inside the chain above.
    return True
Example #24
0
    def __init__(self, verb, transitive, future_type=FATHA):
        """
        Prepare everything needed to conjugate one verb.

        @param verb: the given verb
        @type verb: unicode.
        @param transitive: the verb is transitive or not
        @type transitive: Boolean.
        @param future_type: The mark of the third radical letter in the verb,
        used for triliteral verbs only. Default value is Fatha;
        @type future_type: unicode; one arabic letter (Fatha, Damma, Kasra).
        """
        self.verb = verb
        # Shared cache, kept to avoid repeating work in standardisation,
        # treat_sukun and suffix uniformation.
        self.cache_standard = cache_standard
        self.internal_verb = ar_verb.normalize(verb)
        self.future_type = ar_verb.get_future_type_by_name(future_type)

        letters, marks = ar_verb.uniformate_verb(verb)
        self.word_letters = letters
        self.word_marks = marks
        # Haraka before the last one in the past form; read before the
        # marks are rewritten by uniformate_alef_origin.
        self.past_haraka = araby.secondlast_char(self.word_marks)
        self.word_marks = ar_verb.uniformate_alef_origin(
            self.word_marks, self.internal_verb, self.future_type)

        self.transitive = transitive
        self.hamza_zaida = False
        self.tab_conjug_stem = {}

        # Label of the verb type, indexed by the unvocalized length.
        type_labels = (
            u"", u"", u"",
            u"فعل ثلاثي", u"فعل رباعي", u"فعل خماسي",
            u"فعل سداسي", u"فعل سباعي", u"فعل ثماني", u"فعل تساعي",
        )
        normalized = ar_verb.normalize(self.verb)
        self.unvocalized = araby.strip_harakat(normalized)
        self.vlength = len(self.unvocalized)
        self.vtype = type_labels[self.vlength]

        # Whether the leading hamza is an added (non-root) letter.
        self.hamza_zaida = self._is_hamza_zaida(self.unvocalized)

        # Irregular verbs take their future/imperative stems from a
        # dedicated table; all other verbs get generated stems. The past
        # stems are always generated.
        self.past_stem = ""
        self._prepare_past_stem()
        self._prepare_passive_past_stem()
        if self._is_irregular_verb():
            self._prepare_irregular_future_imperative_stem()
        else:
            self._prepare_future_imperative_stem()

        # Display object, filled with the verb description.
        self.conj_display = conjugatedisplay.ConjugateDisplay(self.verb)
        transitivity_label = u"متعدي" if self.transitive else u"لازم"
        self.conj_display.add_attribut(u"اللزوم/التعدي", transitivity_label)
        self.conj_display.add_attribut(u"الفعل", self.verb)
        self.conj_display.add_attribut(u"نوع الفعل", self.vtype)
        self.future_form = self.conjugate_tense_pronoun(
            vconst.TenseFuture, vconst.PronounHuwa)
        self.conj_display.set_future_form(self.future_form)
        if self.transitive:
            self.conj_display.settransitive()
        self.conj_display.setbab(self.future_type)
Example #25
0
from pyarabic.araby import strip_tashkeel
from pyarabic.araby import strip_harakat

if __name__ == '__main__':
    # Print the sample word once with strip_harakat applied ...
    text = u"الْعَرَبِيّةُ"
    harakat_free = strip_harakat(text)
    print(harakat_free)
    # ... and once with strip_tashkeel applied.
    text = u"الْعَرَبِيّةُ"
    tashkeel_free = strip_tashkeel(text)
    print(tashkeel_free)
Example #26
0
 def test_strip_harakat(self):
     """strip_harakat output must equal the expected form and differ from the input."""
     vocalized = u"الْعَرَبِيَّةُ"
     # Expected result: short vowels removed, the shadda still present.
     expected = u'العربيّة'
     self.assertEqual(ar.strip_harakat(vocalized), expected)
     self.assertNotEqual(ar.strip_harakat(vocalized), vocalized)
from pyarabic.araby import strip_harakat

# Read the whole CSV as text, strip its harakat, and write the result to a
# plain-text file in the current directory.
with open('./nawar-raw/nawar.csv', encoding='utf-8') as source, \
        open('nawar_stripped.txt', mode='w', encoding="utf-8") as target:
    raw_text = source.read()
    target.write(strip_harakat(raw_text))
 def get_all_place_pre(self):
     """
     Return the harakat-stripped '@voweledform' values of the rows whose
     '@type' equals '50' (presumably the place prepositions, judging by
     the method name).

     The values are read straight from the selected Series instead of being
     written back into a filtered slice of the DataFrame, so no chained
     assignment (SettingWithCopyWarning) is triggered and self.voweled_df
     is left untouched.
     @return: list of stripped forms, in row order.
     @rtype: list.
     """
     is_place = self.voweled_df['@type'] == '50'
     voweled_forms = self.voweled_df.loc[is_place, '@voweledform']
     return voweled_forms.apply(strip_harakat).tolist()
 def get_all_time_pre(self):
     """
     Return the harakat-stripped '@voweledform' values of the rows whose
     '@type' equals '49' (presumably the time prepositions, judging by
     the method name).

     The values are read straight from the selected Series instead of being
     written back into a filtered slice of the DataFrame, so no chained
     assignment (SettingWithCopyWarning) is triggered and self.voweled_df
     is left untouched.
     @return: list of stripped forms, in row order.
     @rtype: list.
     """
     is_time = self.voweled_df['@type'] == '49'
     voweled_forms = self.voweled_df.loc[is_time, '@voweledform']
     return voweled_forms.apply(strip_harakat).tolist()
Example #30
0
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
# Python 2 demo loop: prints, for every sample in word_list, which araby
# predicates hold and what each normalizer returns.
# word1 carries the previous sample into the next iteration's comparison.
word1 = u""
for word in word_list:
    # Print statements ending with a comma do not emit a newline, so all
    # the output for one sample stays on one line.
    print word.encode('utf8'), '\t',
    if araby.is_vocalized(word): print ' is vocalized',
    ##    if araby.isArabicstring(word): print ' iisArabicstring',
    ##    else:print ' invalid arabicstring',
    if araby.is_vocalizedtext(word): print ' is vocalized text',
    if araby.is_arabicword(word): print ' is valid word',
    else: print "invalid arabic word",
    print ' strip harakat', araby.strip_harakat(word).encode('utf8'),
    print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'),
    print ' strip tatweel', araby.strip_tatweel(word).encode('utf8'),
    print ' normalize ligature ', araby.normalize_ligature(word).encode(
        'utf8'),
    if araby.vocalizedlike(word, word1): print "vocalized_like",
    # Bare print: ends the line for this sample.
    print
    word1 = word
# Final stand-alone comparison of an unvocalized and a partly vocalized form.
if araby.vocalizedlike(u"العربية", u"العرَبية"): print "vocalized_like",
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
Example #31
0
 def test_strip_harakat(self):
     """strip_harakat output must equal the expected form and differ from the input."""
     vocalized = u"الْعَرَبِيَّةُ"
     # Expected result: short vowels removed, the shadda still present.
     expected = u'العربيّة'
     self.assertEqual(ar.strip_harakat(vocalized), expected)
     self.assertNotEqual(ar.strip_harakat(vocalized), vocalized)
Example #32
0
def suggest_verb(verb):
    """
    Propose valid infinitive forms for a verb that failed validation.

    The harakat and the letters that cannot appear in an infinitive are
    removed first; then the first correction rule that applies decides the
    suggestions. A rule that applies but yields no valid form returns an
    empty list; later rules are not tried.
    @param verb: given verb, of invalid infinitive form.
    @type verb: unicode.
    @return: a list of suggested infinitive verb forms
    @rtype: list of unicode.
    """
    # Strip the harakat, then every TEH_MARBUTA and tanwin mark.
    stripped = araby.strip_harakat(verb)
    forbidden = u"[%s%s%s%s]"%( TEH_MARBUTA, DAMMATAN, KASRATAN, FATHATAN)
    cleaned = re.sub(forbidden, '', stripped)

    # Rule 1: the cleaned verb is already valid.
    if is_valid_infinitive_verb(cleaned):
        return [cleaned]

    # Rule 2: leading ALEF_HAMZA_BELOW is a common spelling error;
    # replace it by a plain ALEF.
    if cleaned.startswith(ALEF_HAMZA_BELOW):
        candidate = re.sub(ALEF_HAMZA_BELOW, ALEF, cleaned)
        return [candidate] if is_valid_infinitive_verb(candidate) else []

    # Rule 3: leading plain ALEF is a common spelling error;
    # replace the first one by ALEF_HAMZA_ABOVE + FATHA.
    if cleaned.startswith(ALEF):
        candidate = re.sub(ALEF, ALEF_HAMZA_ABOVE + FATHA, cleaned, 1)
        return [candidate] if is_valid_infinitive_verb(candidate) else []

    size = len(cleaned)

    # Rule 4: two-letter input -- try to supply the missing third letter.
    if size == 2:
        base = re.sub(ALEF, ALEF_HAMZA_ABOVE, cleaned, 1)
        candidates = (
            base + SHADDA,               # shadda at the end
            base + ALEF_MAKSURA,         # alef maksura at the end
            base + ALEF,                 # alef at the end
            base[0] + ALEF + base[1],    # alef in the middle
        )
        return [form for form in candidates
                if is_valid_infinitive_verb(form)]

    # Rule 5: six characters or more -- try ALEF followed by each
    # five-character window (no window exists when size is exactly 6).
    if size >= 6:
        windows = (ALEF + cleaned[start:start + 5]
                   for start in range(size - 6))
        return [form for form in windows if is_valid_infinitive_verb(form)]

    # Rule 6: five characters.
    # NOTE(review): range(size - 5) is empty here, so this rule never
    # suggests anything; kept as in the original (marked "to review" there).
    if size == 5:
        windows = (ALEF + cleaned[start:start + 4]
                   for start in range(size - 5))
        return [form for form in windows if is_valid_infinitive_verb(form)]

    # Rule 7: four characters -- move a misplaced ALEF or SHADDA.
    if size == 4:
        suggestions = []
        # ALEF in third position or SHADDA in second position:
        # swap the second and third characters.
        if cleaned[2] == ALEF or cleaned[1] == SHADDA:
            swapped = cleaned[0] + cleaned[2] + cleaned[1] + cleaned[3]
            if is_valid_infinitive_verb(swapped):
                suggestions.append(swapped)
        # Final SHADDA: swap the last two characters.
        if cleaned.endswith(SHADDA):
            swapped = cleaned[0] + cleaned[1] + cleaned[3] + cleaned[2]
            if is_valid_infinitive_verb(swapped):
                suggestions.append(swapped)
        return suggestions

    # Fallback: suggest conjugating another verb.
    return [u"كتب"]
Example #33
0
# Demo: run the araby predicates and normalizers over a few sample strings,
# printing the whole report for one sample on a single line.
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
# word1 holds the previous sample, so every sample is compared with the
# one printed just before it (the first one is compared with "").
word1 = u""
for word in word_list:
    print(word, '\t', end=" ")
    if araby.is_vocalized(word):
        print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word):
        print(' is vocalized text', end=" ")
    print(' is valid word' if araby.is_arabicword(word)
          else "invalid arabic word", end=" ")
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1):
        print("vocalized_like", end=" ")
    # End the line for this sample.
    print()
    word1 = word
# Final stand-alone comparison of an unvocalized and a partly vocalized form.
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like", end=" ")
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
Example #34
0
def uniformate_verb(word):
    """
    Separate the harakat and the letters of the given word.

    Returns two strings: the letters and the marks assigned to them.
    An ALEF found between the first and the last letter is dropped from the
    letters and recorded by replacing the preceding mark with
    vconst.ALEF_HARAKA. A final ALEF is replaced by vconst.ALEF_MAMDUDA
    (3-letter word whose second letter is not YEH) or by YEH (otherwise).
    The marks of a 3-letter word are guessed from its letters or read from
    the word itself; longer words get a fixed template from vconst.
    @param word: given word.
    @type word: unicode.
    @return: (letters, harakat).
    @rtype: tuple of unicode.
    """
    if word == "":
        return ("", "")
    # normalize ALEF MADDA: a leading one goes through normalize_alef_madda,
    # any other is expanded to HAMZA + ALEF
    if word.startswith(ALEF_MADDA):
        word = normalize_alef_madda(word)
    else:
        word = word.replace(ALEF_MADDA, HAMZA + ALEF)

    word_nm = araby.strip_harakat(word)
    # length is taken before any hamza normalization
    length = len(word_nm)
    if len(word_nm) != 3:
        # The hamza forms are used to guess the harakat of a trilateral
        # verb, so hamza is normalized right away only for the other
        # lengths; 3-letter words are normalized after the marks are found.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)

    # The hamza forms help to detect the vocalization of the verb; they are
    # unified later.
    if length == 3:
        # ALEF / ALEF_HAMZA_ABOVE in the middle, or ALEF_MAKSURA /
        # ALEF_HAMZA_ABOVE / ALEF at the end: all marks are FATHA
        if word_nm[1]in (ALEF, ALEF_HAMZA_ABOVE) or \
         word_nm[2] in (ALEF_MAKSURA, ALEF_HAMZA_ABOVE, ALEF):
            marks = FATHA + FATHA + FATHA
        # NOTE(review): this test indexes `word` (harakat possibly still
        # present), not `word_nm` -- confirm this is intended.
        elif word[1] == YEH_HAMZA or word[2] in (YEH, YEH_HAMZA):
            marks = FATHA + KASRA + FATHA
        else:
            # read the second haraka from the verb itself
            i = 0
            ## ignore harakat at the beginning of the word
            while araby.is_shortharaka(word[i]):  # in HARAKAT:
                i += 1
            # the first letter
            if not araby.is_shortharaka(word[i]):  #not in HARAKAT:
                i += 1
            # the first haraka
            while araby.is_shortharaka(word[i]):  #word[i] in HARAKAT:
                i += 1
            # the second letter
            if not araby.is_shortharaka(word[i]):  #word[i] not in HARAKAT:
                i += 1
            # the second haraka
            if not araby.is_shortharaka(word[i]):  #word[i] not in HARAKAT:
                # Conjugating doubled verbs in the past tense caused
                # problems, so the second haraka is set to FATHA for now.
                #ToDo: review this case
                secondharaka = FATHA
            else:
                secondharaka = word[i]
            marks = u''.join([FATHA, secondharaka, FATHA])
        # The hamza forms were needed above to guess the harakat of the
        # trilateral verb; normalize them now.
        word_nm = vconst.HAMZAT_PATTERN.sub(HAMZA, word_nm)

    # longer words: fixed mark templates taken from vconst
    elif length == 4:
        marks = vconst.UNIFORMATE_MARKS_4
    elif length == 5:
        if word_nm.startswith(TEH):
            marks = vconst.UNIFORMATE_MARKS_5TEH
        else:
            marks = vconst.UNIFORMATE_MARKS_5
    elif length == 6:
        marks = vconst.UNIFORMATE_MARKS_6
    else:
        # any other length: one FATHA per character
        marks = FATHA * len(word_nm)

    i = 1
    # the first letter and its mark are added automatically
    # NOTE(review): looks like an input whose stripped form has fewer than
    # two characters raises IndexError here or at word_nm[i] below --
    # confirm callers never pass one.
    new_word = word_nm[0]
    new_harakat = marks[0]
    # between the first and the last
    while i < length - 1:
        if word_nm[i] == ALEF:
            # the ALEF is not kept as a letter: the mark of the previous
            # letter is replaced by vconst.ALEF_HARAKA
            new_harakat = new_harakat[:-1] + vconst.ALEF_HARAKA
        else:
            new_harakat += marks[i]
            new_word += word_nm[i]
        i += 1
    # the last letter
    ## case of verbs whose final alef is turned into YEH: every word that
    ## is not 3 letters long, and 3-letter words whose second letter is YEH
    if word_nm[i] == ALEF:
        if len(word_nm) == 3 and word_nm[1] != YEH:
            new_word += vconst.ALEF_MAMDUDA
        else:
            new_word += YEH
    else:
        new_word += word_nm[i]
    new_harakat += marks[i]
    ##    new_word += word_nm[i]
    return (new_word, new_harakat)