Example #1
 def test_normalize_hamza(self):
     """Test normalize_hamzafunction ?"""
     text1 = u"جاء سؤال الأئمة عن الإسلام آجلا"
     text2 = u'جاء سءال الءءمة عن الءسلام ءءجلا'
     self.assertEqual(ar.normalize_hamza(text1), text2)
     text1 = u"جاء سؤال الأئمة عن الإسلام آجلا"
     text3 = u"جاء سوال الايمة عن الاسلام اجلا"
     self.assertEqual(ar.normalize_hamza(text1, method="tasheel"), text3)
Example #2
    def test_normalization(self):

        # normalize_ligature(text):TODO: fixme gives 'لانها لالء الاسلام'
        # assert  Araby.normalize_ligature( u"لانها لالء الاسلام") == u'لانها لالئ الاسلام'

        # normalize_hamza(word)
        assert Araby.normalize_hamza(u"سئل أحد الأئمة") == u'سءل ءحد الءءمة'
Example #3
 def normalize_root(word):
     """ test if word is a root"""
     # change alef madda to hamza + ALEF
     word = word.replace(araby.ALEF_MADDA, araby.HAMZA + araby.ALEF)
     word = word.replace(araby.TEH_MARBUTA, '')
     word = word.replace(araby.ALEF_MAKSURA, araby.YEH)
     return araby.normalize_hamza(word)
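A quick usage sketch (an assumption: pyarabic is installed and normalize_root is defined as above); the expected outputs follow the hamza rules shown in Example #1:

import pyarabic.araby as araby

# alef madda becomes hamza + alef, teh marbuta is dropped
print(normalize_root(u"آمن"))    # -> u'ءامن'
print(normalize_root(u"مدرسة"))  # -> u'مدرس'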
Example #4
    def normalizeText(self, text):
        normalized_text = araby.strip_tatweel(text)
        normalized_text = araby.strip_tashkeel(normalized_text)
        normalized_text = araby.strip_harakat(normalized_text)
        normalized_text = araby.normalize_hamza(normalized_text)

        return normalized_text
Example #5
def normalize(word, wordtype="affix"):
    """
    Normalize the word by unifying hamza forms, alef madda, shadda, and lam-alef ligatures.
    @param word: given word.
    @type word: unicode.
    @param wordtype: "affix" if the word is an affix.
    @type wordtype: unicode.
    @return: converted word.
    @rtype: unicode.
    """
    # Convert the word to its theoretical form.
    # The orthographic form of a word is the way it is written according
    # to spelling rules; the theoretical form is the imagined form of the
    # word before spelling rules are applied. It mainly concerns the many
    # shapes of the hamza, which are all written as a hamza on the line.
    # Examples (orthographic -> theoretical):
    #   إِمْلَائِي -> ءِمْلَاءِي
    #   سَاَلَ -> سَءَلَ
    # Goal: convert the word to a theoretical form, so it can be
    # conjugated independently of spelling rules; after conjugation the
    # spelling rules are applied again.
    # Assumption: the input words are fully vocalized.
    # Method:
    # 1- convert all hamza variants to a hamza on the line
    # 2- undo gemination
    i = 0
    # strip tatweel:
    # tatweel is used to make the affix uniform
    # when the haraka is written separately
    if wordtype != "affix":
        word = araby.strip_tatweel(word)


## replace the alef madda at the start of the word
## with a hamza qat' followed by another hamza
    if word.startswith(ALEF_MADDA):
        word = normalize_alef_madda(word)

    # ignore harakat at the begin of the word
    len_word = len(word)
    while i < len_word and araby.is_shortharaka(word[i]):  # in HARAKAT:
        i += 1
    word = word[i:]
    # convert all hamza forms into a single form
    word = araby.normalize_hamza(word)
    # convert all lam-alef ligatures into separate letters
    word = word.replace(LAM_ALEF, SIMPLE_LAM_ALEF)
    word = word.replace(LAM_ALEF_HAMZA_ABOVE, SIMPLE_LAM_ALEF_HAMZA_ABOVE)
    word = word.replace(LAM_ALEF_MADDA_ABOVE, SIMPLE_LAM_ALEF_MADDA_ABOVE)
    return word
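A usage sketch for the function above. The LAM_ALEF* constants and normalize_alef_madda are assumed to come from the surrounding module; pyarabic's araby module provides equivalent constants:

import pyarabic.araby as araby

# hypothetical constant setup mirroring what the module is assumed to define
LAM_ALEF = araby.LAM_ALEF
SIMPLE_LAM_ALEF = araby.LAM + araby.ALEF
LAM_ALEF_HAMZA_ABOVE = araby.LAM_ALEF_HAMZA_ABOVE
SIMPLE_LAM_ALEF_HAMZA_ABOVE = araby.LAM + araby.ALEF_HAMZA_ABOVE
LAM_ALEF_MADDA_ABOVE = araby.LAM_ALEF_MADDA_ABOVE
SIMPLE_LAM_ALEF_MADDA_ABOVE = araby.LAM + araby.ALEF_MADDA

print(normalize(u"سَأَلَ", wordtype="verb"))  # -> u'سَءَلَ'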
Example #6
def create_index_triverbtable():
    """ Create index from the verb dictionary
    to accelerate the search in the dictionary for verbs
    @return: create the TRIVERBTABLE_INDEX
    @rtype: None
    """
    # the key is the vocverb + the bab number
    for key in triverbtable.TriVerbTable.keys():
        vocverb = triverbtable.TriVerbTable[key]['verb']
        unvverb = araby.strip_harakat(vocverb)
        normverb = araby.normalize_hamza(unvverb)
        if normverb in TRIVERBTABLE_INDEX:
            TRIVERBTABLE_INDEX[normverb].append(key)
        else:
            TRIVERBTABLE_INDEX[normverb] = [key, ]
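A usage sketch (an assumption: TRIVERBTABLE_INDEX is the module-level dict the function fills):

import pyarabic.araby as araby

create_index_triverbtable()  # fills the module-level TRIVERBTABLE_INDEX
key = araby.normalize_hamza(araby.strip_harakat(u"سَأَلَ"))  # u'سءل'
candidate_keys = TRIVERBTABLE_INDEX.get(key, [])  # keys into TriVerbTable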
Example #9
def find_alltriverb(triverb, givenharaka=araby.FATHA, vocalised_entree=False):
    """
    Find the triliteral verb in the dictionary (TriVerbTable)
    return a list of possible verb forms
    each item contains:
        - 'root':
        - 'haraka':
        - 'bab':
        - 'transitive':
    @param triverb: given verb.
    @type triverb: unicode.
    @param givenharaka: the haraka of the future form of the verb,
    default FATHA.
    @type givenharaka: unicode.
    @param vocalised_entree: True if the given verb is vocalized,
    default False.
    @type vocalised_entree: Boolean.
    @return: list of triliteral verbs.
    @rtype: list of dicts.
    """
    liste = []

    if vocalised_entree:
        verb_nm = araby.strip_harakat(triverb)
    else:
        verb_nm = triverb

    normalized = araby.normalize_hamza(verb_nm)
    if normalized in TRIVERBTABLE_INDEX:
        for verb_voc_id in TRIVERBTABLE_INDEX[normalized]:
            if triverb == triverbtable.TriVerbTable[verb_voc_id]['verb'] and \
             givenharaka == triverbtable.TriVerbTable[verb_voc_id]['haraka']:
                liste.insert(0, triverbtable.TriVerbTable[verb_voc_id])


#            if VocalisedEntree:
#                if verb_voc_id[:-1] == triverb:
#                    liste.append(TriVerbTable[verb_voc_id])
            else:
                liste.append(triverbtable.TriVerbTable[verb_voc_id])
    else:
        print("triverb has no verb")
    return liste
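A usage sketch, assuming the triverb table and its index above are loaded; the item fields come from the docstring:

import pyarabic.araby as araby

for item in find_alltriverb(u"سأل", givenharaka=araby.FATHA):
    print(item['root'], item['haraka'], item['bab'], item['transitive'])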
Example #10
def preprocess(sentences, stopwords, isStopword=False):
  """
    This takes in a list of complete Arabic sentences and performs the
    following operations on all of them:
        1.) strips tashkeel
        2.) strips harakat
        3.) strips lastharaka
        4.) strips tatweel
        5.) strips shadda
        6.) normalizes lam alef ligatures
        7.) normalizes hamza
        8.) tokenizes

    Returns a 2D matrix where each row holds the normalized tokens of one sentence.
  """
  #print("SENTENCE INDEX!!!", sentences[0])
  output = []
  for sentence in sentences:
    #print("Before Preprocessing:"+ sentence)
    #print(sentence)
    text = araby.strip_harakat(sentence)
    #print("TEXT!!!!", text)
    text = araby.strip_tashkeel(text)
    text = araby.strip_lastharaka(text)
    text = araby.strip_tatweel(text)
    text = araby.strip_shadda(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_hamza(text)
    text = clean_str(text)
    #print("After Preprocessing:"+ text)
    #print("----")
    #print(text)
    try:
      # keep only runs of non-whitespace, non-Latin characters;
      # note: \p{Latin} requires the third-party `regex` module
      # (import regex), as the stdlib `re` rejects Unicode property classes
      text = regex.match(r'[^\n\s\p{Latin}]+', text).group()
      tokens = araby.tokenize(text)
      if not isStopword:
        tokens = remove_stopwords(stopwords, tokens)
      tokens = [t for t in tokens if t != '\n']
      output.append(tokens)
    except Exception:
      pass
  
  return output
Example #12
    def lookup(self, normalized):
        """
        Look up all word forms in the dictionary.
        @param normalized: the normalized word.
        @type normalized: unicode.
        @return: list of dictionary entries.
        @rtype: list.

        Example:
            >>> import arramooz.arabicdictionary 
            >>> mydict = arramooz.arabicdictionary.ArabicDictionary('verbs')
            >>> wordlist = [u"استقلّ", u'استقل', u"كذب"]
            >>> tmp_list = []
            >>> for word in wordlist:
            ...     foundlist = mydict.lookup(word)
            ...     for word_tuple in foundlist:
            ...         word_tuple = dict(word_tuple)
            ...         vocalized = word_tuple['vocalized']
            ...         tmp_list.append(dict(word_tuple))
            >>> print(tmp_list)
            [{'think_trans': 1, 'passive': 0, 'confirmed': 0, 'vocalized': u'اِسْتَقَلَّ', 'stamped': u'ستقل', 'future_moode': 0, 'triliteral': 0, 'future': 0, 'unthink_trans': 0, 'past': 0, 'unvocalized': u'استقل', 'future_type': u'َ', 'double_trans': 0, 'normalized': u'استقل', 'reflexive_trans': 0, 'imperative': 0, 'transitive': 1, 'root': u'قلل', 'id': 7495},
            {'think_trans': 1, 'passive': 0, 'confirmed': 0, 'vocalized': u'كَذَبَ', 'stamped': u'كذب', 'future_moode': 0, 'triliteral': 1, 'future': 0, 'unthink_trans': 0, 'past': 0, 'unvocalized': u'كذب', 'future_type': u'كسرة', 'double_trans': 0, 'normalized': u'كذب', 'reflexive_trans': 0, 'imperative': 0, 'transitive': 1, 'root': u'كذب', 'id': 1072},
            {'think_trans': 1, 'passive': 0, 'confirmed': 0, 'vocalized': u'كَذَّبَ', 'stamped': u'كذب', 'future_moode': 0, 'triliteral': 0, 'future': 0, 'unthink_trans': 0, 'past': 0, 'unvocalized': u'كذب', 'future_type': u'َ', 'double_trans': 0, 'normalized': u'كذب', 'reflexive_trans': 0, 'imperative': 0, 'transitive': 1, 'root': u'كذب', 'id': 2869}]

        """
        idlist = []
        normword = araby.normalize_hamza(normalized)
        #print "###", normword.encode('utf8')

        sql = u"select * FROM %s WHERE normalized='%s'" % (self.table_name,
         normword)
        try:
            self.cursor.execute(sql)
            if self.cursor:
                # return self.curser.fetchall()
                for row in self.cursor:
                    idlist.append(row)
            return idlist
        except  sqlite.OperationalError:
            return []
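The interpolated SQL above breaks on words containing a quote character; a safer sketch binds the value with a placeholder (the table name still has to be interpolated, since SQL parameters cannot name tables):

        sql = u"SELECT * FROM %s WHERE normalized = ?" % self.table_name
        try:
            self.cursor.execute(sql, (normword,))
            return list(self.cursor.fetchall())
        except sqlite.OperationalError:
            return []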
Example #13
 def verb_stamp(self, word):
     """
     generate a stamp for a verb;
     the verb stamp differs from the word stamp by hamza normalization.
     It removes all letters that can change form in the word:
     - ALEF,
     - YEH,
     - WAW,
     - ALEF_MAKSURA
     - SHADDA
     @return: stamped word
     """
     word = ar.strip_tashkeel(word)
     #The vowels are striped in stamp function
     word = ar.normalize_hamza(word)
     if word.startswith(ar.HAMZA):
         #strip The first hamza
         word = word[1:]
     # strip the last letter if is doubled
     if word[-1:] == word[-2:-1]:
         word = word[:-1]
     return self.verb_stamp_pat.sub('', word)
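A sketch of the stamp pattern the method relies on (an assumption: the containing class compiles verb_stamp_pat from the letters listed in the docstring), with a worked example:

import re
import pyarabic.araby as ar

verb_stamp_pat = re.compile(u"|".join([ar.ALEF, ar.YEH, ar.WAW,
                                       ar.ALEF_MAKSURA, ar.SHADDA]))
# e.g. u"سَأَلَ" -> strip tashkeel u"سأل" -> normalize hamza u"سءل"
# -> no leading hamza, no doubled final letter -> stamp u"سءل"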
Example #17
from nltk.tokenize import wordpunct_tokenize
from pyarabic.araby import strip_tashkeel, normalize_hamza, normalize_ligature


def ar_tokenizer(t):
    return [
        wordpunct_tokenize(
            normalize_ligature(normalize_hamza(strip_tashkeel(k)))) for k in t
    ]
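A usage sketch; the expected tokens follow the hamza normalization shown in the tests above:

print(ar_tokenizer([u"سُئِلَ أحدُ الأئمةِ"]))
# -> [['سءل', 'ءحد', 'الءءمة']]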
Example #18
    def treat_tuple(self, tuple_noun):
        """ convert row data to specific fields
        return a dict of fields"""
        #~ self.id+=1;
        #extract fields from the noun tuple
        fields = {}
        for key in self.field_id.keys():
            try:
                fields[key] = tuple_noun[self.field_id[key]].strip()
            except IndexError:
                print("#" * 5, "key error [%s]," % key,
                      self.field_id[key], len(tuple_noun))
                print(tuple_noun)
                sys.exit()

        # treat specific fields
        fields['note'] = ""
        #if fields['root'] == "":
        if fields['number'] == u"جمع":
            fields['number'] = u"جمع تكسير"
        elif fields['number'] == u"مثنى":
            fields['number'] = u"مثنى"
        else:
            fields['number'] = u"مفرد"
        # make note  if definition is not given
        if not fields['definition']:
            fields['note'] = u":".join([fields['note'],u"لا شرح"]);

        #الممنوع من الصرف
        if not fields['tanwin_nasb']:
            fields['mamnou3_sarf'] = u"ممنوع من الصرف";
        elif fields['tanwin_nasb'] in ("Non","N"):
            fields['mamnou3_sarf'] = u"ممنوع من الصرف";            
        elif fields['tanwin_nasb'] in ("Tn",):
            fields['mamnou3_sarf'] = u"";            
        else:
            fields['mamnou3_sarf'] = u"";
        
        # get unvocalized fields
        fields['unvocalized'] = araby.strip_tashkeel(fields['vocalized']);
        # word type, must be defined for every file 
        # not god idea    
        #~ fields['wordtype']   = self.wordtype;
        fields['wordtype']   = araby.strip_tashkeel(fields['category'])+u":%s"%self.wordtype;

        # extarct broken plurals
        # extract plural from the plural field
        # the field can have +ون;+ات
        items = fields['plural'].split(";")
        if u'+ون' in items:items.remove(u'+ون')
        if u'+ات' in items:items.remove(u'+ات')
        if u'ون' in items:items.remove(u'ون')
        if u'ات' in items:items.remove(u'ات')
        if items:
            fields['broken_plural'] = u";".join(items); 
        else:
            fields['broken_plural'] = "";
        #display order
        fields['normalized'] = araby.normalize_hamza(fields['unvocalized'])
        fields['stamped'] = ndf.word_stamp(fields['unvocalized'])
        
        # special changes in some fields

        # some fields are not fully defined:
        # if k_prefix is empty, it means True;
        # if it is N or n, it is False
        if fields['k_prefix'] in ('n', 'N'):
            fields['k_prefix'] = 0
        else:
            fields['k_prefix'] = 1
        # if kal_prefix is empty, it means True;
        # if it is N or n, it is False
        if fields['kal_prefix'] in ('n', 'N'):
            fields['kal_prefix'] = 0
        else:
            fields['kal_prefix'] = 1
        # if ha_suffix is empty, it means True;
        # if it is N or n, it is False
        if fields['ha_suffix'] in ('n', 'N'):
            fields['ha_suffix'] = 0
        else:
            fields['ha_suffix'] = 1
        # if hm_suffix is empty, it means True;
        # if it is N or n, it is False
        if fields['hm_suffix'] in ('n', 'N'):
            fields['hm_suffix'] = 0
        else:
            fields['hm_suffix'] = 1

        # convert boolean fields to 0/1
        for key in self.boolean_fields:
            if not fields[key]:
                fields[key] = 0
            elif fields[key] in ('n', "N", "Non"):
                fields[key] = 0
            elif fields[key] in ('o', "O"):
                fields[key] = 1
            else:
                fields[key] = 1
        return fields
Example #19
 def test_normalize_hamza(self):
     """Test normalize_hamzafunction ?"""
     text1 = u"سئل أحد الأئمة"
     text2 = u"سءل ءحد الءءمة"
     self.assertEqual(ar.normalize_hamza(text1), text2)
Example #20
    def stemming_verb(self, verb_in):
        """
        Stemming verb
        @param verb_in: given verb
        @type verb_in: unicode
        @return: stemmed words
        @rtype: list of WordCase
        """
        if not verb_in:
            return None
        #~ list_found = []
        detailed_result = []
        verb_list = [
            verb_in,
        ] + self.get_verb_variants(verb_in)
        verb_list = list(set(verb_list))
        debug = self.debug
        #list of segmented words
        word_segmented_list = []
        for verb in verb_list:

            list_seg_comp = self.comp_stemmer.segment(verb)
            for seg in list_seg_comp:
                proclitic = verb[:seg[0]]
                stem = verb[seg[0]:seg[1]]
                enclitic = verb[seg[1]:]
                #~ print "stem_verb affix 93", "-".join([proclitic, stem, enclitic]).encode('utf8')
                #~secondsuffix = u''
                # the case of a doubly transitive verb (two objects)
                if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                    firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                    enclitic = firstsuffix

                list_stem = [stem] + self.get_in_stem_variants(stem, enclitic)
                #if enclitic, then transitive is ok
                transitive_comp = bool(enclitic)
                for stm in list_stem:
                    word_seg = {
                        "verb": verb,
                        "pro": proclitic,
                        "enc": enclitic,
                        'stem_comp': stm,
                        'trans_comp': transitive_comp,
                    }
                    word_segmented_list.append(word_seg)
        if debug: print("after first level")
        if debug:
            #~ print(repr(word_segmented_list).replace(
            #~ '},', '},\n').decode("unicode-escape"))
            print(arepr(verb_in))
            print(print_table(word_segmented_list))
        # second level for segmented word
        tmp_list = []
        #~ print 'first level', verb_in, len(word_segmented_list)
        for word_seg in word_segmented_list:
            verb2 = word_seg['stem_comp']
            # stem reduced verb : level two
            #segment the conjugated verb
            list_seg_conj = self.conj_stemmer.segment(verb2)

            # verify affix compatibility
            list_seg_conj = self.verify_affix(verb2, list_seg_conj,
                                              SVC.VERBAL_CONJUGATION_AFFIX)
            # verify proclitics and enclitics
            # verify length of stem
            for seg_conj in list_seg_conj:
                if (seg_conj[1] - seg_conj[0]) <= 6:

                    #word seg in level 2
                    word_seg_l2 = word_seg.copy()
                    word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                    word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                    word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                    tmp_list.append(word_seg_l2)

        # verify compatibility between proclitic and affixes
        word_segmented_list = tmp_list
        #~ print 'compatibility', verb_in, len(tmp_list)
        tmp_list = []
        for word_seg in word_segmented_list:
            # verify compatibility between proclitics and affixes
            proclitic = word_seg['pro']
            enclitic = word_seg['enc']
            affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
            if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
                tmp_list.append(word_seg.copy())

        #~ print 'stamp', verb_in, len(tmp_list)
        # verify existence of candidate verbs by stamp
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # verify existence of candidate verb by stamp
            if self.exists_as_stamp(word_seg['stem_conj']):
                tmp_list.append(word_seg.copy())

        if debug: print("after second level")
        if debug:
            print(arepr(verb_in))
            print(print_table(tmp_list))
        #~ print 'infinitive', verb_in, len(tmp_list)
        # get the infinitive of candidate verbs
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # get the infinitive of the candidate verb by stamp

            # search the verb in the dictionary by stamp;
            # if the verb exists in the dictionary,
            # its transitivity is considered.
            # If it is triliteral, return its forms with tashkeel;
            # if not, return the forms without tashkeel,
            # because the conjugator can vocalize it.
            # We could return the tashkeel if we did not need the
            # conjugation step.
            infverb_dict = self.__get_infinitive_verb_by_stem(
                word_seg['stem_conj'], word_seg['trans_comp'])
            if debug: print("infinitive candidat verbs")
            if debug:
                print(arepr(verb_in))
                print(print_table(infverb_dict))
            #~ print "list possible verbs", len(infverb_dict)
            #~ for item in infverb_dict:
            #~ print item['verb']
            # filter verbs
            infverb_dict = self.__verify_infinitive_verbs(
                word_seg['stem_conj'], infverb_dict)

            if debug: print("valid infinitive candidat verbs")
            if debug:
                print(arepr(verb_in))
                print(print_table(infverb_dict))
            for item in infverb_dict:
                #The haraka from is given from the dict
                word_seg_l3 = word_seg.copy()
                word_seg_l3['inf'] = item['verb']
                word_seg_l3['haraka'] = item['haraka']
                word_seg_l3['root'] = item.get('root', '')
                word_seg_l3['transitive'] = bool(item['transitive'] in ('y',
                                                                        1))
                tmp_list.append(word_seg_l3)
                # conjugation step
        if debug: print("after lookup dict")
        if debug:
            print(arepr(verb_in))
            print(print_table(tmp_list))
        #~ print repr(tmp_list).replace('},','},\n').decode("unicode-escape")
        #~ print 'conj', verb_in, len(tmp_list)
        # get conjugation for every infinitive verb
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # ToDo: conjugate the verb with the affixes;
            # if the conjugation matches the resulting word,
            # the result is returned

            one_correct_conj = self.__generate_possible_conjug(
                word_seg['inf'], word_seg['stem_comp'],
                word_seg['prefix'] + '-' + word_seg['suffix'],
                word_seg['haraka'], word_seg['pro'], word_seg['enc'],
                word_seg['transitive'])

            #~ print "len correct_conj", len(one_correct_conj)
            for conj in one_correct_conj:
                word_seg_l4 = word_seg.copy()
                word_seg_l4['conj'] = conj.copy()
                tmp_list.append(word_seg_l4)
        if debug: print("after generating conjugation")
        if debug:
            print(arepr(verb_in))
            conjs = [item['conj'] for item in tmp_list]
            print(print_table(conjs))
        #~ print 'result', verb_in, len(tmp_list)
        # generate all resulted data
        word_segmented_list = tmp_list

        #~ tmp_list = []
        for word_seg in word_segmented_list:
            conj = word_seg['conj']
            #~ vocalized, semivocalized = self.vocalize(
            vocal_tuple_list = self.vocalize(conj['vocalized'],
                                             word_seg['pro'], word_seg['enc'])
            tag_type = 'Verb'
            original_tags = "y" if conj['transitive'] else "n"
            # ~ print("stem_verb", vocal_tuple_list)
            for vocalized, semivocalized, __ in vocal_tuple_list:
                # ~ for XXX in vocal_tuple_list:
                # prepare tags
                tags = self.prepare_tags(conj, proclitic, enclitic)

                detailed_result.append(
                    wordcase.WordCase({
                        'word': word_seg['verb'],
                        'affix': (word_seg['pro'], word_seg['prefix'],
                                  word_seg['suffix'], word_seg['enc']),
                        'stem': word_seg['stem_conj'],
                        'root': ar.normalize_hamza(word_seg.get('root', '')),
                        'original': conj['verb'],
                        'vocalized': vocalized,
                        'semivocalized': semivocalized,
                        'tags': tags,
                        'type': tag_type,
                        'number': conj['pronoun_tags'].get('number', ''),
                        'gender': conj['pronoun_tags'].get('gender', ''),
                        'person': conj['pronoun_tags'].get('person', ''),
                        'tense2': conj['tense_tags'].get('tense', ''),
                        'voice': conj['tense_tags'].get('voice', ''),
                        'mood': conj['tense_tags'].get('mood', ''),
                        'confirmed': conj['tense_tags'].get('confirmed', ''),
                        'transitive': conj['transitive'],
                        'tense': conj['tense'],
                        'pronoun': conj['pronoun'],
                        'freq': 'freqverb',
                        'originaltags': original_tags,
                        'syntax': '',
                    }))

        return detailed_result
Example #21
# (the snippet starts mid-loop; the loop header over the word's
# characters is reconstructed here)
for c in u"العربية":
    if araby.is_weak(c): print('weak', end=' ')
    if araby.is_moon(c): print('moon', end=' ')
    if araby.is_sun(c): print('sun', end=' ')
    print(araby.order(c), end=' ')
    print()
word=u"الْعَرَيِيّةُ"
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
u"سئل لأنه يؤم الإمام"
]
word1=u""
for word in word_list:
    print (word)
    if araby.is_vocalized(word): print (' is vocalized')
    if araby.is_vocalizedtext(word): print (' is vocalized text')
    if araby.is_arabicword(word): print (' is valid word')
    else: print ("invalid arabic word")
    print (' strip harakat', araby.strip_harakat(word))
    print (' strip tashkeel', araby.strip_tashkeel(word))
    print (' strip tatweel',araby.strip_tatweel(word))
    print (' normalize ligature ', araby.normalize_ligature(word))
    print (' normalize hamza', araby.normalize_hamza(word))
    if araby.vocalizedlike(word, word1): print ("vocalized_like")
    word1=word;
if araby.vocalizedlike(u"العربية",u"العرَبية"): print ("vocalized_like")

Example #22
def normalize(word):
    """ normalize root"""

    nrm = araby.normalize_hamza(word)
    nrm = nrm.replace(araby.ALEF, araby.WAW)
    return nrm
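A usage sketch: after hamza normalization, every plain alef is mapped to waw, which collapses weak-letter variants of a root:

import pyarabic.araby as araby

print(normalize(u"قال"))  # -> u'قول'
print(normalize(u"أكل"))  # -> u'ءكل'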
Example #23
    def treat_tuple(self, tuple_verb):
        """ convert row data to specific fields
        return a dict of fields"""
        #~ self.id+=1;
        v = {
            "id": self.id,
        }  # verb dict of fields

        #extract field from the verb tuple
        for key in self.field_id.keys():
            try:
                v[key] = tuple_verb[self.field_id[key]].strip()
            except IndexError:
                print "#" * 5, "key error [%s]," % key, self.field_id[
                    key], len(tuple_verb)
                print tuple_verb
                sys.exit()
        v["unvocalized"] = araby.strip_tashkeel(v['vocalized'])
        v['normalized'] = araby.normalize_hamza(v['unvocalized'])
        v['stamped'] = vdf.stamp(v['unvocalized'])

        # Adopt fields to the actual program
        #word;
        if v['tri'] == u"ثلاثي":
            v['triliteral'] = True
        else:
            v['triliteral'] = False
        #root
        #future_type
        if v['transitive'] != u"متعد":
            v['transitive'] = False
            v['unthink_trans'] = False
            # متعدي لغير العاقل
            v['think_trans'] = False
            # متعدي للعاقل، تلقائيا اﻷفعال تقبل العاقل
            v['reflexive_trans'] = False
            #فعل قلوب
            v['double_trans'] = False
            #متعدي لمفعولين
        else:
            v['transitive'] = True
            ##
            if v['nb_trans'] == "2":
                v['double_trans'] = True
            else:
                v['double_trans'] = False
            # type of the object: rational or not
            if v['object_type'] == u"عاقل":
                v['think_trans'] = True
                v['unthink_trans'] = False
            elif v['object_type'] == u"غيرع":
                v['think_trans'] = False
                v['unthink_trans'] = True
            else:
                v['think_trans'] = False
                v['unthink_trans'] = False
            # reflexive object (the transitive verb of perception, e.g. أظنني)
        if v['reflexive_type'] == u"قلبي":
            v['reflexive_trans'] = True
        else:
            v['reflexive_trans'] = False
        # decode tenses
        (v['all'], v['past'], v['future'], v['passive'], v['imperative'],
         v['future_moode'], v['confirmed']) = vdf.decode_tenses(v['tenses'])
        if v['all']:
            v['tenses'] = u"يعملان"
        else:
            v['tenses'] = u""
            if v['past']: v['tenses'] += u"ي"
            else: v['tenses'] += "-"
            if v['future']: v['tenses'] += u"ع"
            else: v['tenses'] += "-"
            if v['imperative']: v['tenses'] += u"م"
            else: v['tenses'] += "-"
            if v['passive']: v['tenses'] += u"ل"
            else: v['tenses'] += u"-"
            if v['future_moode']: v['tenses'] += u"ا"
            else: v['tenses'] += u"-"
            if v['confirmed']: v['tenses'] += u"ن"
            else: v['tenses'] += u"-"
            # convert True/false to 0/1

        v['triliteral'] = vdf.yes(v['triliteral'])
        v['transitive'] = vdf.yes(v['transitive'])
        v['double_trans'] = vdf.yes(v['double_trans'])
        v['think_trans'] = vdf.yes(v['think_trans'])
        v['unthink_trans'] = vdf.yes(v['unthink_trans'])
        v['reflexive_trans'] = vdf.yes(v['reflexive_trans'])
        v['past'] = vdf.yes(v['past'])
        v['future'] = vdf.yes(v['future'])
        v['imperative'] = vdf.yes(v['imperative'])
        v['passive'] = vdf.yes(v['passive'])
        v['future_moode'] = vdf.yes(v['future_moode'])
        v['confirmed'] = vdf.yes(v['confirmed'])

        return v
Example #24
    def steming_second_level(self, noun, stem_comp, procletic_nm, encletic_nm):
        """
        Analyze a word morphologically by stemming the conjugation affixes.
        @param noun: the input noun.
        @type noun: unicode.
        @param stem_comp: the noun stemmed from syntactic affixes.
        @type stem_comp: unicode.
        @param procletic_nm: the syntactic prefix extracted in the first stage.
        @type procletic_nm: unicode.
        @param encletic_nm: the syntactic suffix extracted in the
        first stage (not vocalized).
        @type encletic_nm: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # segment the conjugated noun
        list_seg_conj = self.conj_stemmer.segment(stem_comp)
        # verify affix compatibility
        list_seg_conj = verify_affix(stem_comp, list_seg_conj,
                                     snconst.NOMINAL_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        # and create the real affixes from the word
        #~list_seg_conj_voc = []
        for seg_conj in list_seg_conj:
            stem_conj = stem_comp[seg_conj[0]:seg_conj[1]]
            suffix_conj_nm = stem_comp[seg_conj[1]:]

            # normalize hamza before guessing different origins
            stem_conj = ar.normalize_hamza(stem_conj)

            # generate possible stems
            # add stripped letters to the stem to constitute possible noun list
            possible_noun_list = get_stem_variants(stem_conj, suffix_conj_nm)

            # search the noun in the dictionary
            # we can return the tashkeel
            infnoun_form_list = []
            for infnoun in set(possible_noun_list):
                # get the noun and get all its forms from the dict
                # if the noun has plural suffix, don't look up in
                #broken plural dictionary
                if infnoun not in self.cache_dict_search:
                    infnoun_foundlist = self.noun_dictionary.lookup(infnoun)
                    self.cache_dict_search[infnoun] = infnoun_foundlist
                else:
                    infnoun_foundlist = self.cache_dict_search[infnoun]
                infnoun_form_list.extend(infnoun_foundlist)

            for noun_tuple in infnoun_form_list:
                infnoun = noun_tuple['vocalized']
                # affixes tags contains prefixes and suffixes tags
                affix_tags = list(
                    set(snconst.COMP_PREFIX_LIST_TAGS[procletic_nm]['tags'] +
                        snconst.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags'] +
                        snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']))
                #test if the given word from the dictionary accepts the
                # tags given by the affixes
                # (study the compatibility of the affixes with the noun's
                # features, e.g. does the noun accept the feminine form?)
                if validate_tags(noun_tuple, affix_tags, procletic_nm,
                                 encletic_nm, suffix_conj_nm):
                    ## get all vocalized form of suffixes
                    for vocalized_encletic in snconst.COMP_SUFFIX_LIST_TAGS[
                            encletic_nm]['vocalized']:
                        for vocalized_suffix in snconst.CONJ_SUFFIX_LIST_TAGS[
                                suffix_conj_nm]['vocalized']:

                            ## verify compatibility between proclitics and affix
                            if self.is_compatible_proaffix_affix(
                                    noun_tuple, procletic_nm,
                                    vocalized_encletic, vocalized_suffix):
                                vocalized, semi_vocalized, _ = vocalize(
                                    infnoun, procletic_nm, vocalized_suffix,
                                    vocalized_encletic)

                                #add some tags from dictionary entry as
                                #mamnou3 min sarf and broken plural
                                original_tags = []
                                if noun_tuple['mankous'] == u"Tk":
                                    original_tags.append(u"منقوص")
                                # get affix tags
                                vocalized_affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic_nm]['tags']\
                                  +snconst.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags']\
                                  +snconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                                # if there are many cases, like feminine plural with mansub and majrur
                                if 'cases' in snconst.CONJ_SUFFIX_LIST_TAGS[
                                        vocalized_suffix]:
                                    list_cases = snconst.CONJ_SUFFIX_LIST_TAGS[
                                        vocalized_suffix]['cases']
                                else:
                                    list_cases = ('', )
                                for case in list_cases:
                                    voc_affix_case = vocalized_affix_tags + (
                                        case, )
                                    detailed_result.append(
                                        wordcase.WordCase({
                                            'word': noun,
                                            'affix': (procletic_nm, '',
                                                      vocalized_suffix,
                                                      vocalized_encletic),
                                            'stem': stem_conj,
                                            'original': infnoun,  # original
                                            'vocalized': vocalized,
                                            'semivocalized': semi_vocalized,
                                            'tags': u':'.join(voc_affix_case),
                                            'type': u':'.join(['Noun', noun_tuple['wordtype']]),
                                            'number': noun_tuple['number'],
                                            'gender': noun_tuple['gender'],
                                            'freq': 'freqnoun',  # frequency type
                                            'originaltags': u':'.join(original_tags),
                                            'syntax': '',
                                        }))
        return detailed_result
Example #25
import ast

import pandas as pd
from tqdm import tqdm
from pyarabic.araby import normalize_hamza, tokenize, is_arabicrange

# movie_name_link and genre_and_plot_data are assumed to be built in
# earlier, unshown steps
movie_data = pd.concat([movie_name_link, genre_and_plot_data], axis=1)


for i in tqdm(range(0, len(movie_data))):
    try:
        movie_data['Plot'][int(i)] = "".join(movie_data['Plot'][int(i)])
        
    except TypeError:
        pass


data = pd.read_excel("movies_data.xlsx")
data = data.dropna()
data = data.reset_index()


tokenized_plot = []

for i in tqdm(range(0,len(data['Plot']))):
    sentence = normalize_hamza(data['Plot'][i])
    tokenized_plot.append(','.join(tokenize(sentence, conditions=is_arabicrange)))


data['Tokenized Plot'] = tokenized_plot


cleaned = []

for i in tqdm(range(0,len(data['Genre']))):
    data.iloc[i,4] = ', '.join(ast.literal_eval(data.iloc[i,4]))
Example #27
# In[7]:

# retrieve the data into lists
list_enonce = list(corpus["#2 tweet_content"])
list_pays = list(corpus["#3 country_label"])
list_province = list(corpus["#4 province_label"])

# In[13]:

# clean the tweets (URLs, hashtags, emoticons/emojis, punctuation, Arabizi transliteration, noise)
list_enonce_clean = []
for ligne in list_enonce:
    ligne = araby.strip_tashkeel(
        ligne)  # strip diacritical marks (short vowels)
    ligne = araby.normalize_hamza(
        ligne)  # normalize the string (hamza and the various alef/hamza forms)
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
Example #28
    def stemming_noun(self, noun_in):
        """
        Analyze word morphologically as noun
        @param noun_in: the input noun.
        @type noun_in: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        debug = False
        #~list_found = []
        detailed_result = []
        noun_list = [
            noun_in,
        ] + get_noun_variants(noun_in)
        word_segmented_list = []
        for noun in noun_list:
            list_seg_comp = self.comp_stemmer.segment(noun)
            # filter
            list_seg_comp = verify_affix(noun, list_seg_comp,
                                         SNC.COMP_NOUN_AFFIXES)
            # treat multi vocalization enclitic
            for seg in list_seg_comp:
                proclitic_nm = noun[:seg[0]]
                stem = noun[seg[0]:seg[1]]
                enclitic_nm = noun[seg[1]:]
                # adjusting noun variants
                list_stem = [
                    stem,
                ] + get_in_stem_variants(stem, enclitic_nm)

                # stem reduced noun : level two
                for stem in list_stem:
                    word_seg = {
                        'noun': noun,
                        'stem_comp': stem,
                        'pro': proclitic_nm,
                        'enc': enclitic_nm,
                    }
                    word_segmented_list.append(word_seg)
        # level two
        tmp_list = []
        if debug: print("after first level")
        if debug:
            print(repr(word_segmented_list).replace('},', '},\n'))

        for word_seg in word_segmented_list:

            #~ detailed_result.extend(
            #~ self.steming_second_level(word_seg['noun'], word_seg['stem_comp'],
            #~ word_seg['pro'], word_seg['enc']))
            #~ detailed_result_one = []
            # segment the conjugated noun
            list_seg_conj = self.conj_stemmer.segment(word_seg['stem_comp'])
            # verify affix compatibility
            # filter
            list_seg_conj = verify_affix(word_seg['stem_comp'], list_seg_conj,
                                         SNC.NOMINAL_CONJUGATION_AFFIX)
            # add vocalized forms of suffixes
            # and create the real affixes from the word
            for seg_conj in list_seg_conj:
                stem_conj = word_seg['stem_comp'][:seg_conj[1]]
                suffix = word_seg['stem_comp'][seg_conj[1]:]
                stem_conj = ar.normalize_hamza(stem_conj)
                stem_conj_list = get_stem_variants(stem_conj, suffix)

                # generate possible stems
                # add stripped letters to the stem to constitute possible noun list
                for stem in stem_conj_list:
                    word_seg_l2 = word_seg.copy()
                    # normalize hamza before guessing different origins
                    word_seg_l2['stem_conj'] = stem
                    word_seg_l2['suffix'] = suffix
                    #affixes tags contains prefixes and suffixes tags
                    word_seg_l2['affix_tags'] = list(
                        set(SNC.COMP_PREFIX_LIST_TAGS[word_seg_l2['pro']]
                            ['tags'] + SNC.COMP_SUFFIX_LIST_TAGS[
                                word_seg_l2['enc']]['tags'] +
                            SNC.CONJ_SUFFIX_LIST_TAGS[
                                word_seg_l2['suffix']]['tags']))
                    tmp_list.append(word_seg_l2)

        if debug: print("after second level")
        if debug:
            print(repr(tmp_list).replace('},', '},\n'))
        # lookup in dictionary
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # search the noun in the dictionary
            # we can return the tashkeel
            inf_noun = word_seg['stem_conj']
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in
            #broken plural dictionary
            if inf_noun in self.cache_dict_search:
                infnoun_foundlist = self.cache_dict_search[inf_noun]
            else:
                infnoun_foundlist = self.noun_dictionary.lookup(inf_noun)
                self.cache_dict_search[inf_noun] = infnoun_foundlist

            for noun_tuple in infnoun_foundlist:
                word_seg_l3 = word_seg.copy()
                word_seg_l3["original"] = noun_tuple['vocalized']
                word_seg_l3["noun_tuple"] = dict(noun_tuple)
                tmp_list.append(word_seg_l3)

        if debug: print("after lookup dict")
        if debug:
            print(repr(tmp_list).replace('},', '},\n'))
        # test compatiblity noun_tuple with affixes and proaffixes
        # and generate vocalized affixes and suffixes
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            #test if the given word from the dictionary accepts the
            # tags given by the affixes
            # (study the compatibility of the affixes with the noun's
            # features, e.g. does the noun accept the feminine form?)
            if validate_tags(word_seg['noun_tuple'], word_seg['affix_tags'],
                             word_seg['pro'], word_seg['enc'],
                             word_seg['suffix']):
                ## get all vocalized form of suffixes
                for enc_voc in SNC.COMP_SUFFIX_LIST_TAGS[
                        word_seg['enc']]['vocalized']:
                    for suf_voc in SNC.CONJ_SUFFIX_LIST_TAGS[
                            word_seg['suffix']]['vocalized']:
                        ## verify compatibility between proclitics and affix
                        if self.__check_clitic_affix(word_seg['noun_tuple'],
                                                     word_seg['pro'], enc_voc,
                                                     suf_voc):
                            # get affix tags
                            affix_tags_voc = SNC.COMP_PREFIX_LIST_TAGS[word_seg['pro']]['tags']\
                              +SNC.COMP_SUFFIX_LIST_TAGS[enc_voc]['tags']\
                              +SNC.CONJ_SUFFIX_LIST_TAGS[suf_voc]['tags']
                            word_seg_l4 = word_seg.copy()
                            word_seg_l4['suf_voc'] = suf_voc
                            word_seg_l4['enc_voc'] = enc_voc
                            word_seg_l4['affix_tags'] = affix_tags_voc
                            tmp_list.append(word_seg_l4)

        if debug: print("after check compatibility")
        if debug:
            print(repr(tmp_list).replace('},', '},\n'))
        # Generate results
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # get voalized and vocalized without inflection
            vocalized, semi_vocalized, _ = vocalize(
                word_seg['noun_tuple']['vocalized'], word_seg['pro'],
                word_seg['suf_voc'], word_seg['enc_voc'])

            #add some tags from dictionary entry as
            #mamnou3 min sarf and broken plural
            original_tags = []
            if word_seg['noun_tuple']['mankous'] == u"Tk":
                original_tags.append(u"منقوص")
            # if there are many cases, like feminine plural with mansub and majrur
            if 'cases' in SNC.CONJ_SUFFIX_LIST_TAGS[word_seg['suf_voc']]:
                list_cases = SNC.CONJ_SUFFIX_LIST_TAGS[
                    word_seg['suf_voc']]['cases']
            else:
                list_cases = ('', )
            for case in list_cases:
                voc_affix_case = word_seg['affix_tags'] + (case, )
                detailed_result.append(
                    wordcase.WordCase({
                        'word': noun_in,
                        'affix': (word_seg['pro'], '', word_seg['suf_voc'],
                                  word_seg['enc_voc']),
                        'stem': word_seg['stem_conj'],
                        'original': word_seg['noun_tuple']['vocalized'],  # original
                        'vocalized': vocalized,
                        'semivocalized': semi_vocalized,
                        'tags': u':'.join(voc_affix_case),
                        'type': u':'.join(['Noun', word_seg['noun_tuple']['wordtype']]),
                        'number': word_seg['noun_tuple']['number'],
                        'gender': word_seg['noun_tuple']['gender'],
                        'freq': 'freqnoun',  # frequency type
                        'originaltags': u':'.join(original_tags),
                        'syntax': '',
                    }))
        if debug: print("after generate result")
        if debug: print(len(detailed_result))
        #~ if debug: print repr(detailed_result).replace('},','},\n').decode("unicode-escape")
        return detailed_result
Example #29
    def steming_second_level(self, noun, noun2, procletic, encletic_nm):
        """
        Analyze a word morphologically by stemming the conjugation affixes.
        @param noun: the input noun.
        @type noun: unicode.
        @param noun2: the noun stemmed from syntactic affixes.
        @type noun2: unicode.
        @param procletic: the syntactic prefix extracted in the first stage.
        @type procletic: unicode.
        @param encletic_nm: the syntactic suffix extracted in the
        first stage (not vocalized).
        @type encletic_nm: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # segment the conjugated noun
        list_seg_conj = self.conj_stemmer.segment(noun2)
        # verify affix compatibility
        list_seg_conj = verify_affix(noun2, list_seg_conj, 
        snconst.NOMINAL_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        # and create the real affixes from the word
        #~list_seg_conj_voc = []
        for seg_conj in list_seg_conj:
            stem_conj = noun2[seg_conj[0]:seg_conj[1]]
            suffix_conj_nm = noun2[seg_conj[1]:]

            # normalize hamza before guessing different origins
            stem_conj = araby.normalize_hamza(stem_conj)

            # generate possible stems
            # add stripped letters to the stem to constitute possible noun list
            possible_noun_list = get_stem_variants(stem_conj, 
            suffix_conj_nm)

            # search the noun in the dictionary
            # we can return the tashkeel
            infnoun_form_list = []
            for infnoun in set(possible_noun_list):
                # get the noun and get all its forms from the dict
                # if the noun has plural suffix, don't look up in 
                #broken plural dictionary
                if infnoun not in self.cache_dict_search:
                    infnoun_foundlist = self.noun_dictionary.lookup(infnoun)
                    self.cache_dict_search[infnoun] = create_dict_word(
                        infnoun_foundlist)
                else:
                    infnoun_foundlist = self.cache_dict_search[infnoun]
                infnoun_form_list.extend(infnoun_foundlist)
            for noun_tuple in infnoun_form_list:
                infnoun = noun_tuple['vocalized']
                # affixes tags contains prefixes and suffixes tags
                affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                          +snconst.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags'] \
                          +snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']
                #test if the given word from the dictionary accepts the
                # tags given by the affixes
                # (study the compatibility of the affixes with the noun's
                # features, e.g. does the noun accept the feminine form?)
                if validate_tags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_conj_nm):
                    ## get all vocalized form of suffixes
                    for vocalized_encletic in snconst.COMP_SUFFIX_LIST_TAGS[encletic_nm]['vocalized']:
                        for vocalized_suffix in snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['vocalized']:
                            ## verify compatibility between proclitics and affix
                            if vocalized_suffix == araby.FATHATAN and not (noun.endswith(araby.TEH_MARBUTA) or noun.endswith(araby.ALEF+araby.HAMZA) ):
                                continue
                            if u'جمع مذكر سالم' in snconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']\
                              and not noun_tuple['masculin_plural']:
                                continue
                            if self.is_compatible_proaffix_affix(noun_tuple, procletic, vocalized_encletic, vocalized_suffix):
                                vocalized, semi_vocalized = vocalize(infnoun,procletic,  vocalized_suffix, vocalized_encletic)

                                #add some tags from dictionary entry as 
                                #mamnou3 min sarf and broken plural
                                original_tags = []
                                if noun_tuple['mamnou3_sarf'] == \
                                u"ممنوع من الصرف":
                                    original_tags.append(u"ممنوع من الصرف")
                                if noun_tuple['number'] == u"جمع تكسير":
                                    original_tags.append(u"جمع تكسير")
                                if noun_tuple['feminable']:
                                    original_tags.append(u"يؤنث")                                    
                                # get affix tags
                                vocalized_affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags']\
                                  +snconst.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags']\
                                  +snconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags'] 
                                # if there are many cases like feminin plural with mansoub and majrour
                                if 'cases' in snconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]:
                                    list_cases = snconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['cases']
                                else:
                                    list_cases = ('',)
                                for case in list_cases:
                                    voc_affix_case = vocalized_affix_tags + (case,)
                                    detailed_result.append(
                                        wordcase.WordCase({
                                            'word': noun,
                                            'affix': (procletic, '', vocalized_suffix,
                                                      vocalized_encletic),
                                            'stem': stem_conj,
                                            'original': infnoun,  # original
                                            'vocalized': vocalized,
                                            'semivocalized': semi_vocalized,
                                            'tags': u':'.join(voc_affix_case),
                                            'type': u':'.join(['Noun', noun_tuple['wordtype']]),
                                            'freq': 'freqnoun',  # frequency type
                                            'originaltags': u':'.join(original_tags),
                                            'syntax': '',
                                        }))
        return detailed_result
Example #30
import sys
import pyarabic.araby as araby

ayah = sys.argv[1]
ayah = araby.strip_tatweel(ayah)
ayah = araby.strip_tashkeel(ayah)
ayah = araby.normalize_ligature(ayah)
ayah = araby.normalize_hamza(ayah)
ayah = araby.normalize_alef(ayah)
ayah = araby.normalize_teh(ayah)
ayah = ayah.replace("ے", "ى")

print(ayah)
sys.stdout.flush()