def test_normalize_hamza(self):
    """Test the normalize_hamza function"""
    text1 = u"جاء سؤال الأئمة عن الإسلام آجلا"
    text2 = u"جاء سءال الءءمة عن الءسلام ءءجلا"
    self.assertEqual(ar.normalize_hamza(text1), text2)
    text1 = u"جاء سؤال الأئمة عن الإسلام آجلا"
    text3 = u"جاء سوال الايمة عن الاسلام اجلا"
    self.assertEqual(ar.normalize_hamza(text1, method="tasheel"), text3)

def test_normalization(self):
    # normalize_ligature(text): TODO: fix me, gives 'لانها لالء الاسلام'
    # assert Araby.normalize_ligature(u"لانها لالء الاسلام") == u'لانها لالئ الاسلام'
    # normalize_hamza(word)
    assert Araby.normalize_hamza(u"سئل أحد الأئمة") == u'سءل ءحد الءءمة'

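# The two tests above exercise a single call; a minimal standalone sketch of
# the same behaviour (assuming only that pyarabic is installed): every hamza
# carrier (أ إ آ ؤ ئ) collapses to a bare HAMZA (ء).
from pyarabic import araby

print(araby.normalize_hamza(u"سئل أحد الأئمة"))  # سءل ءحد الءءمة
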
def normalize_root(word):
    """ normalize a root word """
    # change alef madda to hamza + ALEF
    word = word.replace(araby.ALEF_MADDA, araby.HAMZA + araby.ALEF)
    word = word.replace(araby.TEH_MARBUTA, '')
    word = word.replace(araby.ALEF_MAKSURA, araby.YEH)
    return araby.normalize_hamza(word)

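# Hypothetical usage of normalize_root above (the input word is illustrative,
# and araby is assumed to be pyarabic.araby as the snippet implies):
# ALEF MADDA is split into HAMZA + ALEF, TEH MARBUTA is dropped, then the
# remaining hamza forms are unified.
import pyarabic.araby as araby

print(normalize_root(u"آية"))  # ءاي
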
def normalizeText(self, text):
    normalized_text = araby.strip_tatweel(text)
    normalized_text = araby.strip_tashkeel(normalized_text)
    # strip_harakat is redundant after strip_tashkeel (tashkeel already
    # includes the harakat), but it is harmless
    normalized_text = araby.strip_harakat(normalized_text)
    normalized_text = araby.normalize_hamza(normalized_text)
    return normalized_text

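# A quick check of the same chain outside the class (a sketch, assuming
# pyarabic is installed; the sample word is illustrative):
import pyarabic.araby as araby

s = araby.strip_tatweel(u"العَرَبِيَّةُ")
s = araby.strip_tashkeel(s)
print(araby.normalize_hamza(s))  # العربية
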
def normalize(word, wordtype="affix"): """ Normalize the word, by unifoming hamzat, Alef madda, shadda, and lamalefs. @param word: given word. @type word: unicode. @param type: if the word is an affix @type type: unicode. @return: converted word. @rtype: unicode. """ # تحويل الكلمة إلى شكلها النظري. # الشكل اللإملائي للكلمة هو طريقة كتابتها حسب قواعد الإملاء # الشكل النظري هو الشكل المتخيل للكلمة دون تطبيق قواعد اللغة # ويخص عادة الأشكال المتعددة للهمزة، و التي تكتب همزة على السطر # أمثلة # إملائي نظري #إِمْلَائِي ءِمْلَاءِي #سَاَلَ سَءَلَ # الهدف : تحويل الكلمة إلى شكل نظري، #ومن ثم إمكانية تصريفها بعيدا عن قواعد الإملاء، #وبعد التصريف يتم تطبيق قواعد الإملاء من جديد. #الفرضية: الكلمات المدخلة مشكولة شكلا تاما. #الطريقة: # 1-تحويل جميع أنواع الهمزات إلى همزة على السطر # 1-فك الإدغام i = 0 # strip tatweel # the tatweel is used to uniformate the affix # when the Haraka is used separetely if wordtype != "affix": word = araby.strip_tatweel(word) ## تستبدل الألف الممدودة في , ل الكلمة بهمزة قطع بعدها همزة أخرى if word.startswith(ALEF_MADDA): word = normalize_alef_madda(word) # ignore harakat at the begin of the word len_word = len(word) while i < len_word and araby.is_shortharaka(word[i]): # in HARAKAT: i += 1 word = word[i:] # convert all Hamza from into one form word = araby.normalize_hamza(word) #Convert All LAM ALEF Ligature into separate letters word = word.replace(LAM_ALEF, SIMPLE_LAM_ALEF) word = word.replace(LAM_ALEF_HAMZA_ABOVE, SIMPLE_LAM_ALEF_HAMZA_ABOVE) word = word.replace(LAM_ALEF_MADDA_ABOVE, SIMPLE_LAM_ALEF_MADDA_ABOVE) return word
def create_index_triverbtable():
    """ Create an index over the verb dictionary to accelerate
    searching it for verbs
    @return: create the TRIVERBTABLE_INDEX
    @rtype: None
    """
    # the key is the vocalized verb + the bab number
    for key in triverbtable.TriVerbTable.keys():
        vocverb = triverbtable.TriVerbTable[key]['verb']
        unvverb = araby.strip_harakat(vocverb)
        normverb = araby.normalize_hamza(unvverb)
        if normverb in TRIVERBTABLE_INDEX:  # has_key() is Python 2 only
            TRIVERBTABLE_INDEX[normverb].append(key)
        else:
            TRIVERBTABLE_INDEX[normverb] = [key, ]

def normalize(word, wordtype = "affix"): """ Normalize the word, by unifoming hamzat, Alef madda, shadda, and lamalefs. @param word: given word. @type word: unicode. @param type: if the word is an affix @type type: unicode. @return: converted word. @rtype: unicode. """ # تحويل الكلمة إلى شكلها النظري. # الشكل اللإملائي للكلمة هو طريقة كتابتها حسب قواعد الإملاء # الشكل النظري هو الشكل المتخيل للكلمة دون تطبيق قواعد اللغة # ويخص عادة الأشكال المتعددة للهمزة، و التي تكتب همزة على السطر # أمثلة # إملائي نظري #إِمْلَائِي ءِمْلَاءِي #سَاَلَ سَءَلَ # الهدف : تحويل الكلمة إلى شكل نظري، #ومن ثم إمكانية تصريفها بعيدا عن قواعد الإملاء، #وبعد التصريف يتم تطبيق قواعد الإملاء من جديد. #الفرضية: الكلمات المدخلة مشكولة شكلا تاما. #الطريقة: # 1-تحويل جميع أنواع الهمزات إلى همزة على السطر # 1-فك الإدغام i = 0 # strip tatweel # the tatweel is used to uniformate the affix # when the Haraka is used separetely if wordtype != "affix": word = araby.strip_tatweel(word) ## تستبدل الألف الممدودة في , ل الكلمة بهمزة قطع بعدها همزة أخرى if word.startswith(ALEF_MADDA): word = normalize_alef_madda(word) # ignore harakat at the begin of the word len_word = len(word) while i < len_word and araby.is_shortharaka(word[i]): # in HARAKAT: i += 1 word = word[i:] # convert all Hamza from into one form word = araby.normalize_hamza(word) #Convert All LAM ALEF Ligature into separate letters word = word.replace(LAM_ALEF, SIMPLE_LAM_ALEF) word = word.replace(LAM_ALEF_HAMZA_ABOVE, SIMPLE_LAM_ALEF_HAMZA_ABOVE) word = word.replace(LAM_ALEF_MADDA_ABOVE, SIMPLE_LAM_ALEF_MADDA_ABOVE) return word
def find_alltriverb(triverb, givenharaka=araby.FATHA, vocalised_entree=False):
    """ Find the triliteral verb in the dictionary (TriVerbTable)
    return a list of possible verb forms
    each item contains:
    - 'root':
    - 'haraka':
    - 'bab':
    - 'transitive':
    @param triverb: given verb.
    @type triverb: unicode.
    @param givenharaka: given haraka of the future type of the verb,
    default (FATHA).
    @type givenharaka: unicode.
    @param vocalised_entree: True if the given verb is vocalized,
    default False.
    @type vocalised_entree: Boolean.
    @return: list of triliteral verbs.
    @rtype: list of dicts.
    """
    liste = []
    if vocalised_entree:
        verb_nm = araby.strip_harakat(triverb)
    else:
        verb_nm = triverb
    normalized = araby.normalize_hamza(verb_nm)
    if normalized in TRIVERBTABLE_INDEX:  # has_key() is Python 2 only
        for verb_voc_id in TRIVERBTABLE_INDEX[normalized]:
            if triverb == triverbtable.TriVerbTable[verb_voc_id]['verb'] and \
               givenharaka == triverbtable.TriVerbTable[verb_voc_id]['haraka']:
                liste.insert(0, triverbtable.TriVerbTable[verb_voc_id])
                # if vocalised_entree:
                #     if verb_voc_id[:-1] == triverb:
                #         liste.append(TriVerbTable[verb_voc_id])
            else:
                liste.append(triverbtable.TriVerbTable[verb_voc_id])
    else:
        print("triverb has no verb")
    return liste

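# A hypothetical call, assuming the libqutrub-style triverbtable module is
# importable and the index has been built first; the verb and haraka are
# illustrative, not taken from the project's tests.
create_index_triverbtable()
for form in find_alltriverb(u"كتب", givenharaka=araby.DAMMA):
    print(form['verb'], form['haraka'])
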
def preprocess(sentences, stopwords, isStopword=False):
    """
    This takes in an array of complete Arabic sentences, and performs
    the following operations on all of them:
    1.) strips tashkeel
    2.) strips harakat
    3.) strips lastharaka
    4.) strips tatweel
    5.) strips shadda
    6.) normalizes lam-alef ligatures
    7.) normalizes hamza
    8.) tokenizes
    Returns a 2D matrix, where each row represents the normalized
    tokens of one sentence
    """
    output = []
    for sentence in sentences:
        text = araby.strip_harakat(sentence)
        text = araby.strip_tashkeel(text)
        text = araby.strip_lastharaka(text)
        text = araby.strip_tatweel(text)
        text = araby.strip_shadda(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        text = clean_str(text)
        try:
            # NOTE: \p{Latin} needs the third-party `regex` module; the
            # stdlib re module does not support it, and the doubled
            # backslashes in the original look like an escaping mistake
            text = re.match(r'[^\n\s\p{Latin}]+', text).group()
            tokens = araby.tokenize(text)
            if not isStopword:
                tokens = remove_stopwords(stopwords, tokens)
            tokens = [t for t in tokens if t != '\n']
            output.append(tokens)
        except Exception:  # no match, or pattern unsupported by stdlib re
            pass
    return output

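# A minimal, self-contained sketch of the normalization chain used in
# preprocess() (steps 1-7), without the project-specific clean_str and
# remove_stopwords helpers; the sample sentence is illustrative.
import pyarabic.araby as araby

text = u"جاء سُؤالُ الأئِمّةِ"
for step in (araby.strip_harakat, araby.strip_tashkeel,
             araby.strip_lastharaka, araby.strip_tatweel,
             araby.strip_shadda, araby.normalize_ligature,
             araby.normalize_hamza):
    text = step(text)
print(araby.tokenize(text))  # ['جاء', 'سءال', 'الءءمة']
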
def lookup(self, normalized):
    """
    look up all word forms in the dictionary
    @param normalized: the normalized word.
    @type normalized: unicode.
    @return: list of dictionary entries.
    @rtype: list.
    Example:
        >>> import arramooz.arabicdictionary
        >>> mydict = arramooz.arabicdictionary.ArabicDictionary('verbs')
        >>> wordlist = [u"استقلّ", u'استقل', u"كذب"]
        >>> tmp_list = []
        >>> for word in wordlist:
        >>>     foundlist = mydict.lookup(word)
        >>>     for word_tuple in foundlist:
        >>>         word_tuple = dict(word_tuple)
        >>>         vocalized = word_tuple['vocalized']
        >>>         tmp_list.append(dict(word_tuple))
        >>> print(tmp_list)
        [{'think_trans': 1, 'passive': 0, 'confirmed': 0,
        'vocalized': u'اِسْتَقَلَّ', 'stamped': u'ستقل', 'future_moode': 0,
        'triliteral': 0, 'future': 0, 'unthink_trans': 0, 'past': 0,
        'unvocalized': u'استقل', 'future_type': u'َ', 'double_trans': 0,
        'normalized': u'استقل', 'reflexive_trans': 0, 'imperative': 0,
        'transitive': 1, 'root': u'قلل', 'id': 7495},
        {'think_trans': 1, 'passive': 0, 'confirmed': 0,
        'vocalized': u'كَذَبَ', 'stamped': u'كذب', 'future_moode': 0,
        'triliteral': 1, 'future': 0, 'unthink_trans': 0, 'past': 0,
        'unvocalized': u'كذب', 'future_type': u'كسرة', 'double_trans': 0,
        'normalized': u'كذب', 'reflexive_trans': 0, 'imperative': 0,
        'transitive': 1, 'root': u'كذب', 'id': 1072},
        {'think_trans': 1, 'passive': 0, 'confirmed': 0,
        'vocalized': u'كَذَّبَ', 'stamped': u'كذب', 'future_moode': 0,
        'triliteral': 0, 'future': 0, 'unthink_trans': 0, 'past': 0,
        'unvocalized': u'كذب', 'future_type': u'َ', 'double_trans': 0,
        'normalized': u'كذب', 'reflexive_trans': 0, 'imperative': 0,
        'transitive': 1, 'root': u'كذب', 'id': 2869}]
    """
    idlist = []
    normword = araby.normalize_hamza(normalized)
    # parameterize the value; only the table name is interpolated
    # (interpolating user input into SQL invites injection)
    sql = u"SELECT * FROM %s WHERE normalized = ?" % self.table_name
    try:
        self.cursor.execute(sql, (normword,))
        if self.cursor:
            for row in self.cursor:
                idlist.append(row)
        return idlist
    except sqlite.OperationalError:
        return []

def verb_stamp(self, word):
    """ generate a stamp for a verb;
    the verb stamp differs from the word stamp by hamza normalization:
    remove all letters which can change form in the word:
    - ALEF,
    - YEH,
    - WAW,
    - ALEF_MAKSURA
    - SHADDA
    @return: stamped word
    """
    word = ar.strip_tashkeel(word)
    # the vowels are stripped in the stamp function
    word = ar.normalize_hamza(word)
    if word.startswith(ar.HAMZA):
        # strip the first hamza
        word = word[1:]
    # strip the last letter if it is doubled
    if word[-1:] == word[-2:-1]:
        word = word[:-1]
    return self.verb_stamp_pat.sub('', word)

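# A self-contained sketch of the same stamping idea. verb_stamp_pat is an
# attribute of the class above, so a plausible equivalent is rebuilt here
# from the letters listed in the docstring (an assumption, not the project's
# exact pattern). It reproduces the 'stamped' value u'ستقل' seen in the
# lookup docstring above.
import re
import pyarabic.araby as ar

stamp_pat = re.compile(u"|".join([ar.ALEF, ar.YEH, ar.WAW,
                                  ar.ALEF_MAKSURA, ar.SHADDA]))

def stamp(word):
    word = ar.strip_tashkeel(word)
    word = ar.normalize_hamza(word)
    if word.startswith(ar.HAMZA):
        word = word[1:]
    if word[-1:] == word[-2:-1]:
        word = word[:-1]
    return stamp_pat.sub('', word)

print(stamp(u"اِسْتَقَلَّ"))  # ستقل
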
def ar_tokenizer(t):
    return [
        wordpunct_tokenize(
            normalize_ligature(normalize_hamza(strip_tashkeel(k))))
        for k in t
    ]

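# Usage sketch for ar_tokenizer (assumes nltk and pyarabic are installed,
# imported as the snippet implies; the sample text is illustrative):
from nltk.tokenize import wordpunct_tokenize
from pyarabic.araby import strip_tashkeel, normalize_hamza, normalize_ligature

print(ar_tokenizer([u"سُئِلَ أحد الأئمة"]))  # [['سءل', 'ءحد', 'الءءمة']]
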
def treat_tuple(self, tuple_noun):
    """ convert row data to specific fields
    return a dict of fields"""
    # extract fields from the noun tuple
    fields = {}
    for key in self.field_id.keys():
        try:
            fields[key] = tuple_noun[self.field_id[key]].strip()
        except IndexError:
            print("#" * 5, "key error [%s]," % key,
                  self.field_id[key], len(tuple_noun))
            print(tuple_noun)
            sys.exit()
    # treat specific fields
    fields['note'] = ""
    if fields['number'] == u"جمع":
        fields['number'] = u"جمع تكسير"
    elif fields['number'] == u"مثنى":
        fields['number'] = u"مثنى"
    else:
        fields['number'] = u"مفرد"
    # make a note if the definition is not given
    if not fields['definition']:
        fields['note'] = u":".join([fields['note'], u"لا شرح"])
    # diptotes (الممنوع من الصرف)
    if not fields['tanwin_nasb']:
        fields['mamnou3_sarf'] = u"ممنوع من الصرف"
    elif fields['tanwin_nasb'] in ("Non", "N"):
        fields['mamnou3_sarf'] = u"ممنوع من الصرف"
    elif fields['tanwin_nasb'] in ("Tn",):
        fields['mamnou3_sarf'] = u""
    else:
        fields['mamnou3_sarf'] = u""
    # get unvocalized fields
    fields['unvocalized'] = araby.strip_tashkeel(fields['vocalized'])
    # the word type must be defined for every file
    fields['wordtype'] = (araby.strip_tashkeel(fields['category'])
                          + u":%s" % self.wordtype)
    # extract broken plurals from the plural field;
    # the field can hold items like +ون;+ات
    items = fields['plural'].split(";")
    for sound_suffix in (u'+ون', u'+ات', u'ون', u'ات'):
        if sound_suffix in items:
            items.remove(sound_suffix)
    if items:
        fields['broken_plural'] = u";".join(items)
    else:
        fields['broken_plural'] = ""
    fields['normalized'] = araby.normalize_hamza(fields['unvocalized'])
    fields['stamped'] = ndf.word_stamp(fields['unvocalized'])
    # some fields are not fully defined:
    # a Null prefix/suffix flag means True; 'N' or 'n' means False
    fields['k_prefix'] = 0 if fields['k_prefix'] in ('n', 'N') else 1
    fields['kal_prefix'] = 0 if fields['kal_prefix'] in ('n', 'N') else 1
    fields['ha_suffix'] = 0 if fields['ha_suffix'] in ('n', 'N') else 1
    fields['hm_suffix'] = 0 if fields['hm_suffix'] in ('n', 'N') else 1
    # convert boolean fields to 0/1
    for key in self.boolean_fields:
        if not fields[key] or fields[key] in ('n', 'N', 'Non'):
            fields[key] = 0
        else:
            fields[key] = 1
    return fields

def test_normalize_hamza(self):
    """Test the normalize_hamza function"""
    text1 = u"سئل أحد الأئمة"
    text2 = u"سءل ءحد الءءمة"
    self.assertEqual(ar.normalize_hamza(text1), text2)

def stemming_verb(self, verb_in):
    """
    Stemming verb
    @param verb_in: given verb
    @type verb_in: unicode
    @return: stemmed words
    @rtype:
    """
    if not verb_in:
        return None
    detailed_result = []
    verb_list = [verb_in, ] + self.get_verb_variants(verb_in)
    verb_list = list(set(verb_list))
    debug = self.debug
    # list of segmented words
    word_segmented_list = []
    for verb in verb_list:
        list_seg_comp = self.comp_stemmer.segment(verb)
        for seg in list_seg_comp:
            proclitic = verb[:seg[0]]
            stem = verb[seg[0]:seg[1]]
            enclitic = verb[seg[1]:]
            # the case of a doubly transitive verb (two objects)
            if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                enclitic = firstsuffix
            list_stem = [stem] + self.get_in_stem_variants(stem, enclitic)
            # if there is an enclitic, then transitivity is implied
            transitive_comp = bool(enclitic)
            for stm in list_stem:
                word_seg = {
                    "verb": verb,
                    "pro": proclitic,
                    "enc": enclitic,
                    'stem_comp': stm,
                    'trans_comp': transitive_comp,
                }
                word_segmented_list.append(word_seg)
    if debug:
        print("after first level")
        print(arepr(verb_in))
        print(print_table(word_segmented_list))
    # second level for the segmented word
    tmp_list = []
    for word_seg in word_segmented_list:
        verb2 = word_seg['stem_comp']
        # stem the reduced verb: level two,
        # segment the conjugated verb
        list_seg_conj = self.conj_stemmer.segment(verb2)
        # verify affix compatibility
        list_seg_conj = self.verify_affix(verb2, list_seg_conj,
                                          SVC.VERBAL_CONJUGATION_AFFIX)
        # verify proclitics and enclitics
        # verify the length of the stem
        for seg_conj in list_seg_conj:
            if (seg_conj[1] - seg_conj[0]) <= 6:
                # word segmentation at level 2
                word_seg_l2 = word_seg.copy()
                word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                tmp_list.append(word_seg_l2)
    # verify compatibility between proclitics and affixes
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        proclitic = word_seg['pro']
        enclitic = word_seg['enc']
        affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
        if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
            tmp_list.append(word_seg.copy())
    # verify the existence of the candidate verb by stamp
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        if self.exists_as_stamp(word_seg['stem_conj']):
            tmp_list.append(word_seg.copy())
    if debug:
        print("after second level")
        print(arepr(verb_in))
        print(print_table(tmp_list))
    # get the infinitive of the candidate verbs
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # get the infinitive of the candidate verb by stamp:
        # search the verb in the dictionary by stamp;
        # if the verb exists in the dictionary,
        # the transitivity is considered;
        # if it is triliteral, return its forms and tashkeel,
        # otherwise return forms without tashkeel,
        # because the conjugator can vocalize them;
        # we could return the tashkeel if we did not need the
        # conjugation step
        infverb_dict = self.__get_infinitive_verb_by_stem(
            word_seg['stem_conj'], word_seg['trans_comp'])
        if debug:
            print("infinitive candidate verbs")
            print(arepr(verb_in))
            print(print_table(infverb_dict))
        # filter verbs
        infverb_dict = self.__verify_infinitive_verbs(
            word_seg['stem_conj'], infverb_dict)
        if debug:
            print("valid infinitive candidate verbs")
            print(arepr(verb_in))
            print(print_table(infverb_dict))
        for item in infverb_dict:
            # the haraka form is given by the dict
            word_seg_l3 = word_seg.copy()
            word_seg_l3['inf'] = item['verb']
            word_seg_l3['haraka'] = item['haraka']
            word_seg_l3['root'] = item.get('root', '')
            word_seg_l3['transitive'] = bool(item['transitive'] in ('y', 1))
            tmp_list.append(word_seg_l3)
    # conjugation step
    if debug:
        print("after lookup dict")
        print(arepr(verb_in))
        print(print_table(tmp_list))
    # get the conjugation for every infinitive verb
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # ToDo: conjugate the verb with the affixes;
        # if the conjugation matches the resulting word,
        # report the result
        one_correct_conj = self.__generate_possible_conjug(
            word_seg['inf'], word_seg['stem_comp'],
            word_seg['prefix'] + '-' + word_seg['suffix'],
            word_seg['haraka'], word_seg['pro'], word_seg['enc'],
            word_seg['transitive'])
        for conj in one_correct_conj:
            word_seg_l4 = word_seg.copy()
            word_seg_l4['conj'] = conj.copy()
            tmp_list.append(word_seg_l4)
    if debug:
        print("after generating conjugation")
        print(arepr(verb_in))
        conjs = [item['conj'] for item in tmp_list]
        print(print_table(conjs))
    # generate all resulting data
    word_segmented_list = tmp_list
    for word_seg in word_segmented_list:
        conj = word_seg['conj']
        vocal_tuple_list = self.vocalize(conj['vocalized'], word_seg['pro'],
                                         word_seg['enc'])
        tag_type = 'Verb'
        original_tags = "y" if conj['transitive'] else "n"
        for vocalized, semivocalized, __ in vocal_tuple_list:
            # prepare tags
            # NOTE: proclitic/enclitic leak here from the first-level loop;
            # word_seg['pro'] / word_seg['enc'] are probably intended
            tags = self.prepare_tags(conj, proclitic, enclitic)
            detailed_result.append(
                wordcase.WordCase({
                    'word': word_seg['verb'],
                    'affix': (word_seg['pro'], word_seg['prefix'],
                              word_seg['suffix'], word_seg['enc']),
                    'stem': word_seg['stem_conj'],
                    'root': ar.normalize_hamza(word_seg.get('root', '')),
                    'original': conj['verb'],
                    'vocalized': vocalized,
                    'semivocalized': semivocalized,
                    'tags': tags,
                    'type': tag_type,
                    'number': conj['pronoun_tags'].get('number', ''),
                    'gender': conj['pronoun_tags'].get('gender', ''),
                    'person': conj['pronoun_tags'].get('person', ''),
                    'tense2': conj['tense_tags'].get('tense', ''),
                    'voice': conj['tense_tags'].get('voice', ''),
                    'mood': conj['tense_tags'].get('mood', ''),
                    'confirmed': conj['tense_tags'].get('confirmed', ''),
                    'transitive': conj['transitive'],
                    'tense': conj['tense'],
                    'pronoun': conj['pronoun'],
                    'freq': 'freqverb',
                    'originaltags': original_tags,
                    'syntax': '',
                }))
    return detailed_result

if araby.is_weak(c):
    print('weak', end=' ')
if araby.is_moon(c):
    print('moon', end=' ')
if araby.is_sun(c):
    print('sun', end=' ')
print(araby.order(c), end=' ')
print()

word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
    u"سئل لأنه يؤم الإمام",
]
word1 = u""
for word in word_list:
    print(word)
    if araby.is_vocalized(word):
        print(' is vocalized')
    if araby.is_vocalizedtext(word):
        print(' is vocalized text')
    if araby.is_arabicword(word):
        print(' is a valid word')
    else:
        print("invalid arabic word")
    print(' strip harakat', araby.strip_harakat(word))
    print(' strip tashkeel', araby.strip_tashkeel(word))
    print(' strip tatweel', araby.strip_tatweel(word))
    print(' normalize ligature ', araby.normalize_ligature(word))
    print(' normalize hamza', araby.normalize_hamza(word))
    if araby.vocalizedlike(word, word1):
        print("vocalized_like")
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like")

def normalize(word):
    """ normalize root"""
    nrm = araby.normalize_hamza(word)
    nrm = nrm.replace(araby.ALEF, araby.WAW)
    return nrm

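# Hypothetical examples for the root normalizer above: hamza forms unify
# first, then any remaining ALEF becomes WAW, so hollow-root spellings
# deliberately collapse together.
import pyarabic.araby as araby

print(normalize(u"قال"))  # قول
print(normalize(u"سأل"))  # سءل
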
def treat_tuple(self, tuple_verb):
    """ convert row data to specific fields
    return a dict of fields"""
    v = {
        "id": self.id,
    }  # verb dict of fields
    # extract fields from the verb tuple
    for key in self.field_id.keys():
        try:
            v[key] = tuple_verb[self.field_id[key]].strip()
        except IndexError:
            print("#" * 5, "key error [%s]," % key,
                  self.field_id[key], len(tuple_verb))
            print(tuple_verb)
            sys.exit()
    v["unvocalized"] = araby.strip_tashkeel(v['vocalized'])
    v['normalized'] = araby.normalize_hamza(v['unvocalized'])
    v['stamped'] = vdf.stamp(v['unvocalized'])
    # adapt fields to the actual program
    v['triliteral'] = (v['tri'] == u"ثلاثي")
    # transitivity
    if v['transitive'] != u"متعد":
        v['transitive'] = False
        v['unthink_trans'] = False    # transitive to non-rational objects
        v['think_trans'] = False      # transitive to rational objects
        v['reflexive_trans'] = False  # verbs of the heart (فعل قلوب)
        v['double_trans'] = False     # doubly transitive (متعدي لمفعولين)
    else:
        v['transitive'] = True
        v['double_trans'] = (v['nb_trans'] == "2")
    # type of the object: rational or not
    if v['object_type'] == u"عاقل":
        v['think_trans'] = True
        v['unthink_trans'] = False
    elif v['object_type'] == u"غيرع":
        v['think_trans'] = False
        v['unthink_trans'] = True
    else:
        v['think_trans'] = False
        v['unthink_trans'] = False
    # reflexive object (فعل القلوب المتعدي، مثل أظنني)
    v['reflexive_trans'] = (v['reflexive_type'] == u"قلبي")
    # decode tenses
    (v['all'], v['past'], v['future'], v['passive'], v['imperative'],
     v['future_moode'], v['confirmed']) = vdf.decode_tenses(v['tenses'])
    v['tenses'] = u"يعملان" if v['all'] else u""
    v['tenses'] += u"ي" if v['past'] else u"-"
    v['tenses'] += u"ع" if v['future'] else u"-"
    v['tenses'] += u"م" if v['imperative'] else u"-"
    v['tenses'] += u"ل" if v['passive'] else u"-"
    v['tenses'] += u"ا" if v['future_moode'] else u"-"
    v['tenses'] += u"ن" if v['confirmed'] else u"-"
    # convert True/False to 0/1
    for key in ('triliteral', 'transitive', 'double_trans', 'think_trans',
                'unthink_trans', 'reflexive_trans', 'past', 'future',
                'imperative', 'passive', 'future_moode', 'confirmed'):
        v[key] = vdf.yes(v[key])
    return v

def steming_second_level(self, noun, stem_comp, procletic_nm, encletic_nm):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param stem_comp: the noun stemmed from syntactic affixes.
    @type stem_comp: unicode.
    @param procletic_nm: the syntactic prefix extracted in the first stage.
    @type procletic_nm: unicode.
    @param encletic_nm: the syntactic suffix extracted in the first stage
    (not vocalized).
    @type encletic_nm: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated word
    list_seg_conj = self.conj_stemmer.segment(stem_comp)
    # verify affix compatibility
    list_seg_conj = verify_affix(stem_comp, list_seg_conj,
                                 snconst.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    # and create the real affixes from the word
    for seg_conj in list_seg_conj:
        stem_conj = stem_comp[seg_conj[0]:seg_conj[1]]
        suffix_conj_nm = stem_comp[seg_conj[1]:]
        # normalize hamza before guessing the different origins
        stem_conj = ar.normalize_hamza(stem_conj)
        # generate possible stems:
        # add stripped letters to the stem to build the possible noun list
        possible_noun_list = get_stem_variants(stem_conj, suffix_conj_nm)
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list = []
        for infnoun in set(possible_noun_list):
            # get the noun and all its forms from the dict;
            # if the noun has a plural suffix, don't look it up in the
            # broken plural dictionary
            if infnoun not in self.cache_dict_search:
                infnoun_foundlist = self.noun_dictionary.lookup(infnoun)
                self.cache_dict_search[infnoun] = infnoun_foundlist
            else:
                infnoun_foundlist = self.cache_dict_search[infnoun]
            infnoun_form_list.extend(infnoun_foundlist)
        for noun_tuple in infnoun_form_list:
            infnoun = noun_tuple['vocalized']
            # affix tags contain the prefix and suffix tags
            affix_tags = list(
                set(snconst.COMP_PREFIX_LIST_TAGS[procletic_nm]['tags']
                    + snconst.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags']
                    + snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']))
            # test if the word from the dictionary accepts the tags
            # given by the affixes,
            # e.g. does the noun accept the feminine mark?
            if validate_tags(noun_tuple, affix_tags, procletic_nm,
                             encletic_nm, suffix_conj_nm):
                # get all vocalized forms of suffixes
                for vocalized_encletic in snconst.COMP_SUFFIX_LIST_TAGS[
                        encletic_nm]['vocalized']:
                    for vocalized_suffix in snconst.CONJ_SUFFIX_LIST_TAGS[
                            suffix_conj_nm]['vocalized']:
                        # verify compatibility between proclitics and affix
                        if self.is_compatible_proaffix_affix(
                                noun_tuple, procletic_nm,
                                vocalized_encletic, vocalized_suffix):
                            vocalized, semi_vocalized, _ = vocalize(
                                infnoun, procletic_nm, vocalized_suffix,
                                vocalized_encletic)
                            # add some tags from the dictionary entry,
                            # such as mamnou3 min sarf and broken plural
                            original_tags = []
                            if noun_tuple['mankous'] == u"Tk":
                                original_tags.append(u"منقوص")
                            # get affix tags
                            vocalized_affix_tags = \
                                snconst.COMP_PREFIX_LIST_TAGS[procletic_nm]['tags'] \
                                + snconst.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
                                + snconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                            # there can be several cases, e.g. feminine
                            # plural with mansoub and majrour
                            if 'cases' in snconst.CONJ_SUFFIX_LIST_TAGS[
                                    vocalized_suffix]:
                                list_cases = snconst.CONJ_SUFFIX_LIST_TAGS[
                                    vocalized_suffix]['cases']
                            else:
                                list_cases = ('', )
                            for case in list_cases:
                                voc_affix_case = vocalized_affix_tags + (case, )
                                detailed_result.append(
                                    wordcase.WordCase({
                                        'word': noun,
                                        'affix': (procletic_nm, '',
                                                  vocalized_suffix,
                                                  vocalized_encletic),
                                        'stem': stem_conj,
                                        'original': infnoun,
                                        'vocalized': vocalized,
                                        'semivocalized': semi_vocalized,
                                        'tags': u':'.join(voc_affix_case),
                                        'type': u':'.join(
                                            ['Noun',
                                             noun_tuple['wordtype']]),
                                        'number': noun_tuple['number'],
                                        'gender': noun_tuple['gender'],
                                        'freq': 'freqnoun',
                                        'originaltags': u':'.join(original_tags),
                                        'syntax': '',
                                    }))
    return detailed_result

movie_data = pd.concat([movie_name_link, genre_and_plot_data], axis=1)
for i in tqdm(range(0, len(movie_data))):
    try:
        # join the plot fragments into one string
        # (.loc would be safer here than chained indexing)
        movie_data['Plot'][int(i)] = "".join(movie_data['Plot'][int(i)])
    except TypeError:
        pass

data = pd.read_excel("movies_data.xlsx")
data = data.dropna()
data = data.reset_index()

tokenized_plot = []
for i in tqdm(range(0, len(data['Plot']))):
    sentence = normalize_hamza(data['Plot'][i])
    tokenized_plot.append(
        ','.join(tokenize(sentence, conditions=is_arabicrange)))
data['Tokenized Plot'] = tokenized_plot

cleaned = []
for i in tqdm(range(0, len(data['Genre']))):
    data.iloc[i, 4] = ', '.join(ast.literal_eval(data.iloc[i, 4]))

# In[7]:
# retrieve the data into lists
list_enonce = list(corpus["#2 tweet_content"])
list_pays = list(corpus["#3 country_label"])
list_province = list(corpus["#4 province_label"])

# In[13]:
# clean the tweets (URLs, hashtags, emoticons/emojis, punctuation,
# arabizi transliteration, noise)
list_enonce_clean = []
for ligne in list_enonce:
    # remove the diacritical marks (vowels)
    ligne = araby.strip_tashkeel(ligne)
    # normalize the string (hamza and the different alef forms in Arabic)
    ligne = araby.normalize_hamza(ligne)
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"

def stemming_noun(self, noun_in):
    """
    Analyze word morphologically as a noun
    @param noun_in: the input noun.
    @type noun_in: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    debug = False
    detailed_result = []
    noun_list = [noun_in, ] + get_noun_variants(noun_in)
    word_segmented_list = []
    for noun in noun_list:
        list_seg_comp = self.comp_stemmer.segment(noun)
        # filter
        list_seg_comp = verify_affix(noun, list_seg_comp,
                                     SNC.COMP_NOUN_AFFIXES)
        # treat multiple vocalizations of the enclitic
        for seg in list_seg_comp:
            proclitic_nm = noun[:seg[0]]
            stem = noun[seg[0]:seg[1]]
            enclitic_nm = noun[seg[1]:]
            # adjusting noun variants
            list_stem = [stem, ] + get_in_stem_variants(stem, enclitic_nm)
            # stem the reduced noun: level two
            for stem in list_stem:
                word_seg = {
                    'noun': noun,
                    'stem_comp': stem,
                    'pro': proclitic_nm,
                    'enc': enclitic_nm,
                }
                word_segmented_list.append(word_seg)
    # level two
    tmp_list = []
    if debug:
        print("after first level")
        print(repr(word_segmented_list).replace('},', '},\n'))
    for word_seg in word_segmented_list:
        # segment the conjugated noun
        list_seg_conj = self.conj_stemmer.segment(word_seg['stem_comp'])
        # verify affix compatibility, filter
        list_seg_conj = verify_affix(word_seg['stem_comp'], list_seg_conj,
                                     SNC.NOMINAL_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        # and create the real affixes from the word
        for seg_conj in list_seg_conj:
            stem_conj = word_seg['stem_comp'][:seg_conj[1]]
            suffix = word_seg['stem_comp'][seg_conj[1]:]
            # normalize hamza before guessing the different origins
            stem_conj = ar.normalize_hamza(stem_conj)
            stem_conj_list = get_stem_variants(stem_conj, suffix)
            # generate possible stems:
            # add stripped letters to the stem to build the
            # possible noun list
            for stem in stem_conj_list:
                word_seg_l2 = word_seg.copy()
                word_seg_l2['stem_conj'] = stem
                word_seg_l2['suffix'] = suffix
                # affix tags contain the prefix and suffix tags
                word_seg_l2['affix_tags'] = list(
                    set(SNC.COMP_PREFIX_LIST_TAGS[word_seg_l2['pro']]['tags']
                        + SNC.COMP_SUFFIX_LIST_TAGS[word_seg_l2['enc']]['tags']
                        + SNC.CONJ_SUFFIX_LIST_TAGS[word_seg_l2['suffix']]['tags']))
                tmp_list.append(word_seg_l2)
    if debug:
        print("after second level")
        print(repr(tmp_list).replace('},', '},\n'))
    # lookup in the dictionary
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # search the noun in the dictionary
        # we can return the tashkeel
        inf_noun = word_seg['stem_conj']
        # get the noun and all its forms from the dict;
        # if the noun has a plural suffix, don't look it up in the
        # broken plural dictionary
        if inf_noun in self.cache_dict_search:
            infnoun_foundlist = self.cache_dict_search[inf_noun]
        else:
            infnoun_foundlist = self.noun_dictionary.lookup(inf_noun)
            self.cache_dict_search[inf_noun] = infnoun_foundlist
        for noun_tuple in infnoun_foundlist:
            word_seg_l3 = word_seg.copy()
            word_seg_l3["original"] = noun_tuple['vocalized']
            word_seg_l3["noun_tuple"] = dict(noun_tuple)
            tmp_list.append(word_seg_l3)
    if debug:
        print("after lookup dict")
        print(repr(tmp_list).replace('},', '},\n'))
    # test the compatibility of noun_tuple with affixes and proaffixes
    # and generate vocalized affixes and suffixes
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # test if the word from the dictionary accepts the tags
        # given by the affixes,
        # e.g. does the noun accept the feminine mark?
        if validate_tags(word_seg['noun_tuple'], word_seg['affix_tags'],
                         word_seg['pro'], word_seg['enc'],
                         word_seg['suffix']):
            # get all vocalized forms of suffixes
            for enc_voc in SNC.COMP_SUFFIX_LIST_TAGS[
                    word_seg['enc']]['vocalized']:
                for suf_voc in SNC.CONJ_SUFFIX_LIST_TAGS[
                        word_seg['suffix']]['vocalized']:
                    # verify compatibility between proclitics and affix
                    if self.__check_clitic_affix(word_seg['noun_tuple'],
                                                 word_seg['pro'], enc_voc,
                                                 suf_voc):
                        # get affix tags
                        affix_tags_voc = \
                            SNC.COMP_PREFIX_LIST_TAGS[word_seg['pro']]['tags'] \
                            + SNC.COMP_SUFFIX_LIST_TAGS[enc_voc]['tags'] \
                            + SNC.CONJ_SUFFIX_LIST_TAGS[suf_voc]['tags']
                        word_seg_l4 = word_seg.copy()
                        word_seg_l4['suf_voc'] = suf_voc
                        word_seg_l4['enc_voc'] = enc_voc
                        word_seg_l4['affix_tags'] = affix_tags_voc
                        tmp_list.append(word_seg_l4)
    if debug:
        print("after check compatibility")
        print(repr(tmp_list).replace('},', '},\n'))
    # generate results
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # get the vocalized and semi-vocalized (without inflection) forms
        vocalized, semi_vocalized, _ = vocalize(
            word_seg['noun_tuple']['vocalized'], word_seg['pro'],
            word_seg['suf_voc'], word_seg['enc_voc'])
        # add some tags from the dictionary entry,
        # such as mamnou3 min sarf and broken plural
        original_tags = []
        if word_seg['noun_tuple']['mankous'] == u"Tk":
            original_tags.append(u"منقوص")
        # there can be several cases, e.g. feminine plural with
        # mansoub and majrour
        if 'cases' in SNC.CONJ_SUFFIX_LIST_TAGS[word_seg['suf_voc']]:
            list_cases = SNC.CONJ_SUFFIX_LIST_TAGS[
                word_seg['suf_voc']]['cases']
        else:
            list_cases = ('', )
        for case in list_cases:
            voc_affix_case = word_seg['affix_tags'] + (case, )
            detailed_result.append(
                wordcase.WordCase({
                    'word': noun_in,
                    'affix': (word_seg['pro'], '', word_seg['suf_voc'],
                              word_seg['enc_voc']),
                    'stem': word_seg['stem_conj'],
                    'original': word_seg['noun_tuple']['vocalized'],
                    'vocalized': vocalized,
                    'semivocalized': semi_vocalized,
                    'tags': u':'.join(voc_affix_case),
                    'type': u':'.join(
                        ['Noun', word_seg['noun_tuple']['wordtype']]),
                    'number': word_seg['noun_tuple']['number'],
                    'gender': word_seg['noun_tuple']['gender'],
                    'freq': 'freqnoun',
                    'originaltags': u':'.join(original_tags),
                    'syntax': '',
                }))
    if debug:
        print("after generating results")
        print(len(detailed_result))
    return detailed_result

def steming_second_level(self, noun, noun2, procletic, encletic_nm):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemmed from syntactic affixes.
    @type noun2: unicode.
    @param procletic: the syntactic prefix extracted in the first stage.
    @type procletic: unicode.
    @param encletic_nm: the syntactic suffix extracted in the first stage
    (not vocalized).
    @type encletic_nm: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated word
    list_seg_conj = self.conj_stemmer.segment(noun2)
    # verify affix compatibility
    list_seg_conj = verify_affix(noun2, list_seg_conj,
                                 snconst.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    # and create the real affixes from the word
    for seg_conj in list_seg_conj:
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj_nm = noun2[seg_conj[1]:]
        # normalize hamza before guessing the different origins
        stem_conj = araby.normalize_hamza(stem_conj)
        # generate possible stems:
        # add stripped letters to the stem to build the possible noun list
        possible_noun_list = get_stem_variants(stem_conj, suffix_conj_nm)
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list = []
        for infnoun in set(possible_noun_list):
            # get the noun and all its forms from the dict;
            # if the noun has a plural suffix, don't look it up in the
            # broken plural dictionary
            if infnoun not in self.cache_dict_search:  # has_key() is Python 2 only
                infnoun_foundlist = self.noun_dictionary.lookup(infnoun)
                self.cache_dict_search[infnoun] = create_dict_word(
                    infnoun_foundlist)
            else:
                infnoun_foundlist = self.cache_dict_search[infnoun]
            infnoun_form_list.extend(infnoun_foundlist)
        for noun_tuple in infnoun_form_list:
            infnoun = noun_tuple['vocalized']
            # affix tags contain the prefix and suffix tags
            affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                + snconst.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags'] \
                + snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']
            # test if the word from the dictionary accepts the tags
            # given by the affixes,
            # e.g. does the noun accept the feminine mark?
            if validate_tags(noun_tuple, affix_tags, procletic,
                             encletic_nm, suffix_conj_nm):
                # get all vocalized forms of suffixes
                for vocalized_encletic in snconst.COMP_SUFFIX_LIST_TAGS[
                        encletic_nm]['vocalized']:
                    for vocalized_suffix in snconst.CONJ_SUFFIX_LIST_TAGS[
                            suffix_conj_nm]['vocalized']:
                        # verify compatibility between proclitics and affix
                        if vocalized_suffix == araby.FATHATAN and not (
                                noun.endswith(araby.TEH_MARBUTA)
                                or noun.endswith(araby.ALEF + araby.HAMZA)):
                            continue
                        if u'جمع مذكر سالم' in snconst.CONJ_SUFFIX_LIST_TAGS[
                                vocalized_suffix]['tags'] \
                                and not noun_tuple['masculin_plural']:
                            continue
                        if self.is_compatible_proaffix_affix(
                                noun_tuple, procletic, vocalized_encletic,
                                vocalized_suffix):
                            vocalized, semi_vocalized = vocalize(
                                infnoun, procletic, vocalized_suffix,
                                vocalized_encletic)
                            # add some tags from the dictionary entry,
                            # such as mamnou3 min sarf and broken plural
                            original_tags = []
                            if noun_tuple['mamnou3_sarf'] == \
                                    u"ممنوع من الصرف":
                                original_tags.append(u"ممنوع من الصرف")
                            if noun_tuple['number'] == u"جمع تكسير":
                                original_tags.append(u"جمع تكسير")
                            if noun_tuple['feminable']:
                                original_tags.append(u"يؤنث")
                            # get affix tags
                            vocalized_affix_tags = \
                                snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                                + snconst.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
                                + snconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                            # there can be several cases, e.g. feminine
                            # plural with mansoub and majrour
                            if 'cases' in snconst.CONJ_SUFFIX_LIST_TAGS[
                                    vocalized_suffix]:
                                list_cases = snconst.CONJ_SUFFIX_LIST_TAGS[
                                    vocalized_suffix]['cases']
                            else:
                                list_cases = ('', )
                            for case in list_cases:
                                voc_affix_case = vocalized_affix_tags + (case, )
                                detailed_result.append(
                                    wordcase.WordCase({
                                        'word': noun,
                                        'affix': (procletic, '',
                                                  vocalized_suffix,
                                                  vocalized_encletic),
                                        'stem': stem_conj,
                                        'original': infnoun,
                                        'vocalized': vocalized,
                                        'semivocalized': semi_vocalized,
                                        'tags': u':'.join(voc_affix_case),
                                        'type': u':'.join(
                                            ['Noun',
                                             noun_tuple['wordtype']]),
                                        'freq': 'freqnoun',
                                        'originaltags': u':'.join(original_tags),
                                        'syntax': '',
                                    }))
    return detailed_result

import sys
import pyarabic.araby as araby

ayah = sys.argv[1]
ayah = araby.strip_tatweel(ayah)
ayah = araby.strip_tashkeel(ayah)
ayah = araby.normalize_ligature(ayah)
ayah = araby.normalize_hamza(ayah)
ayah = araby.normalize_alef(ayah)
ayah = araby.normalize_teh(ayah)
# map the Urdu-style yeh barree to the Arabic alef maksura
ayah = ayah.replace("ے", "ى")
print(ayah)
sys.stdout.flush()

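# A hypothetical invocation of the script above (the file name is
# illustrative); the sample text reuses words from the first test:
#
#   $ python normalize_ayah.py "جاء سؤال آجلا"
#   جاء سءال ءءجلا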