def check_partial_vocalized(word_vocalised, resulted_data):
    """
    If the entered word is fully or partially vocalized, keep only the
    analyses whose vocalization is compatible with the input word.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the matches found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    # An unvocalized input gives nothing to filter against.
    if not araby.is_vocalized(word_vocalised):
        return resulted_data
    # Keep only the entries whose vocalized form is compatible with the
    # (partially) vocalized input, and tag them as such.
    kept = []
    for entry in resulted_data:
        if 'vocalized' in entry and araby.vocalizedlike(
                word_vocalised, entry['vocalized']):
            entry['tags'] += ':' + analex_const.partialVocalizedTag
            kept.append(entry)
    return kept
def test_vocalized_similarity(self):
    """Test vocalized_similarity function ?"""
    partly_vocalized = u"ضَربٌ"
    fully_vocalized = u"ضَرْبٌ"
    # The two spellings differ only by an omitted sukun, so they must
    # be considered vocalized-alike and similar (not -2, i.e. not a
    # hard mismatch).
    self.assertTrue(ar.vocalizedlike(partly_vocalized, fully_vocalized))
    self.assertNotEqual(
        ar.vocalized_similarity(partly_vocalized, fully_vocalized), -2)
    self.assertTrue(
        ar.vocalized_similarity(partly_vocalized, fully_vocalized))
def Comparetashkeel(text): import tashkeel.tashkeel as ArabicVocalizer # the entred text is vocalized correctly correct_text=text; text=araby.stripTashkeel(text); vocalizer=ArabicVocalizer.TashkeelClass(); vocalized_text=vocalizer.tashkeel(text); # compare voalized text with a correct text text1=correct_text; text2=vocalized_text; # remove collocations symboles text2=text2.replace("'",""); text2=text2.replace("~",""); #stemmer=tashaphyne.stemming.ArabicLightStemmer() list1=vocalizer.analyzer.tokenize(text1); list2=vocalizer.analyzer.tokenize(text2); print u":".join(list1).encode('utf8'); print u":".join(list2).encode('utf8'); correct=0; incorrect=0; total=len(list1); if len(list1)!=len(list2): print "lists haven't the same length"; else: for i in range(total): if araby.vocalizedlike(list1[i],list2[i]): correct+=1; else: incorrect+=1; result=[vocalized_text,"correct:%0.2f%%"%round(correct*100.00/total,2),"incorrect:%0.2f%%"%round(incorrect*100.00/total,2),total] return result#correct*100/total;
def mot_except(word):
    """Detect whether the given input word is an exceptional word with
    respect to the database.

    @param word: possibly vocalized input word.
    @return: list of matching ExceptionalWord objects.
    """
    combs = []
    for me in ExceptionalWord.objects.filter(
            unvoweled_form=araby.strip_diacritics(word)):
        # FIX: compare against the voweled form of the entry, not the
        # model instance itself (consistent with mot_outil/nom_propre,
        # which pass `.voweled_form` to vocalizedlike).
        if araby.vocalizedlike(word, me.voweled_form):
            combs.append(me)
    return combs
def mot_outil(word):
    """Detect whether the given input word is a tool word ("mot outil")
    with respect to the database.

    Returns a list of dicts: the matching ToolWord plus the prefix and
    suffix of the segmentation it matched under.
    """
    matches = []
    for comb in decoupage(word):
        base = comb['Base']
        candidates = ToolWord.objects.filter(
            unvoweled_form=araby.strip_diacritics(base))
        for tool in candidates:
            if not araby.vocalizedlike(base, tool.voweled_form):
                continue
            matches.append({
                'tw_object': tool,
                'Préfixe': comb['Préfixe'],
                'Suffixe': comb['Suffixe'],
            })
    return matches
def nom_propre(word):
    """Detect whether the given input word is a proper noun with respect
    to the database.

    Returns a list of dicts: the matching ProperNoun plus the base,
    prefix and suffix of the segmentation it matched under.
    """
    found = []
    for comb in decoupage(word):
        base = comb['Base']
        candidates = ProperNoun.objects.filter(
            unvoweled_form=araby.strip_diacritics(base))
        for noun in candidates:
            if not araby.vocalizedlike(base, noun.voweled_form):
                continue
            found.append({
                'pn_object': noun,
                'Base': base,
                'Préfixe': comb['Préfixe'],
                'Suffixe': comb['Suffixe'],
            })
    return found
def check_partial_vocalized(word_vocalised, resulted_data):
    """
    If the entered word is fully or partially vocalized, keep only the
    analyses whose vocalization is compatible with the input word.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the matches found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    # An unvocalized input gives nothing to filter against.
    if not araby.is_vocalized(word_vocalised):
        return resulted_data
    kept = []
    for item in resulted_data:
        if 'vocalized' not in item:
            continue
        output = item['vocalized']
        is_verb = "Verb" in item['type']
        if araby.vocalizedlike(word_vocalised, output):
            item['tags'] += ':' + analex_const.PARTIAL_VOCALIZED_TAG
            kept.append(item)
        # Special case (meeting of two sukuns in pre-vocalized text):
        # the verb may end with a KASRA instead of the expected SUKUN,
        # so compare the two words without their final mark.
        elif (is_verb and word_vocalised.endswith(araby.KASRA)
              and output.endswith(araby.SUKUN)):
            if araby.vocalizedlike(word_vocalised[:-1], output[:-1]):
                item['tags'] += ':' + analex_const.PARTIAL_VOCALIZED_TAG
                kept.append(item)
    return kept
def Comparetashkeel(text): import tashkeel.tashkeel as ArabicVocalizer # the entred text is vocalized correctly correct_text = text text = araby.stripTashkeel(text) vocalizer = ArabicVocalizer.TashkeelClass() vocalized_text = vocalizer.tashkeel(text) # compare voalized text with a correct text text1 = correct_text text2 = vocalized_text # remove collocations symboles text2 = text2.replace("'", "") text2 = text2.replace("~", "") #stemmer=tashaphyne.stemming.ArabicLightStemmer() list1 = vocalizer.analyzer.tokenize(text1) list2 = vocalizer.analyzer.tokenize(text2) print u":".join(list1).encode('utf8') print u":".join(list2).encode('utf8') correct = 0 incorrect = 0 total = len(list1) if len(list1) != len(list2): print "lists haven't the same length" else: for i in range(total): if araby.vocalizedlike(list1[i], list2[i]): correct += 1 else: incorrect += 1 result = [ vocalized_text, "correct:%0.2f%%" % round(correct * 100.00 / total, 2), "incorrect:%0.2f%%" % round(incorrect * 100.00 / total, 2), total ] return result #correct*100/total;
def check_partial_vocalized(self, word_vocalised, resulted_data):
    """
    If the entered word is fully or partially vocalized, keep only the
    analyses whose vocalization is compatible with the input word.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the matches found in the dictionary.
    @type resulted_data: list of objects with a __dict__ of attributes.
    @return: list of analyzed words with tags.
    @rtype: list.
    """
    # An unvocalized input gives nothing to filter against.
    if not araby.isVocalized(word_vocalised):
        return resulted_data
    kept = []
    for item in resulted_data:
        # Result entries are objects here, so look the fields up
        # through their attribute dictionary.
        attrs = item.__dict__
        if "vocalized" in attrs and araby.vocalizedlike(
                word_vocalised, attrs["vocalized"]):
            attrs["tags"] += ":" + analex_const.partialVocalizedTag
            kept.append(item)
    return kept
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # The entered text is assumed to be correctly vocalized; strip its
    # marks and re-vocalize automatically, then diff the two.
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    # Cache directory relative to this module — TODO confirm ../tmp/ exists.
    cpath = os.path.join(os.path.dirname(__file__), '../tmp/')
    vocalizer = ArabicVocalizer.TashkeelClass(mycache_path=cpath)
    # Suggestion output: a list of per-word dicts (keys used below:
    # 'chosen', 'semi', 'inflect', 'link', 'rule').
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)
    text1 = correct_text
    displayed_html = u""
    texts = [text1, ]
    list1 =[]
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1)!= len(list2):
        # Tokenization mismatch between reference and suggestion lists:
        # dump the aligned prefix for debugging and abort the process.
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print (u"'%s'\t'%s'"%(list1[i], list2[i].get('chosen',''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]       # reference (correct) word
            wo1_strip = wo1
            wo2 = list2[i]['chosen']
            wo2_strip = list2[i]['semi']  # word without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            if araby.vocalizedlike(wo1, wo2):
                # Exact (vocalized-like) match: count as correct.
                if wo2 == "\n":
                    wo2 = "<br/>"
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
                correct += 1
            else:
                incorrect += 1
                # Classify the kind of mismatch for CSS highlighting:
                # only the final (inflection) mark differs -> diff-mark.
                wo1_strip = wo1
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    style = 'diff-mark'
                else:
                    # Compare the last character of each word: equal
                    # harakas, or a haraka on exactly one side, means the
                    # word body differs -> diff-word; otherwise everything
                    # differs -> diff-all.
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2) and wm1 == wm2) \
                       or (bool(araby.is_haraka(wm1)) ^ bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
    # NOTE(review): raises ZeroDivisionError when total == 0 — confirm
    # callers never pass empty text.
    per_correct = round(correct*100.00/total, 2)
    per_incorrect = round(incorrect*100.00/total, 2)
    result = [displayed_html, "correct:%0.2f%%, incorrect:%0.2f%%"%(per_correct, per_incorrect)]
    return result#correct*100/total
u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print(word, '\t', end=" ") if araby.is_vocalized(word): print(' is vocalized', end=" ") if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ") if araby.is_arabicword(word): print(' is valid word', end=" ") else: print("invalid arabic word", end=" ") print(' strip harakat', araby.strip_harakat(word), end=" ") print(' strip tashkeel', araby.strip_tashkeel(word), end=" ") print(' strip tatweel', araby.strip_tatweel(word), end=" ") print(' normalize ligature ', araby.normalize_ligature(word), end=" ") if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ") print() word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print("vocalized_like", end=" ") word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print(word, '\t', end=" ")
def test_vocalizedlike(self):
    """Test vocalizedlike function ?"""
    # Spellings differing only by an omitted sukun are alike.
    partly_vocalized = u"ضَربٌ"
    fully_vocalized = u"ضَرْبٌ"
    self.assertTrue(ar.vocalizedlike(partly_vocalized, fully_vocalized))
def decoupage(word):
    """Split the input word into (prefixes, base, suffixes).

    Returns a list of dicts ('Base', 'Préfixe', 'Suffixe') holding every
    syntactically acceptable combination, based on the compatibility of
    the detected prefixes/suffixes and on the size of the base.
    """
    word_unvocalized = araby.strip_diacritics(word)
    # "" stands for "no prefix" / "no suffix" so the bare word is always
    # one of the candidate combinations.
    prefixes, suffixes = [""], [""]
    combinaisons_possibles = []
    # Collect candidate prefixes: the unvoweled word must start with the
    # prefix; for a vocalized word the vowels must also be compatible.
    for p in Prefixe.objects.all():
        if word_unvocalized.startswith(p.unvoweled_form):
            if araby.is_vocalized(word):
                if araby.vocalizedlike(word[:len(p.voweled_form)],
                                       p.voweled_form):
                    prefixes.append(p)
            else:
                prefixes.append(p)
    # Collect candidate suffixes, symmetrically (matching at the end).
    for s in Suffixe.objects.all():
        if word_unvocalized.endswith(s.unvoweled_form):
            if araby.is_vocalized(word):
                if araby.vocalizedlike(word[-len(s.voweled_form):],
                                       s.voweled_form):
                    suffixes.append(s)
            else:
                suffixes.append(s)
    for pr in prefixes:
        for sf in suffixes:
            # Validation criteria (only when both affixes are present):
            # the remaining base must be 3..9 letters long, and the
            # prefix/suffix grammatical classes must be compatible.
            if pr != "" and sf != "":
                if (len(word_unvocalized) - len(pr.unvoweled_form)
                        - len(sf.unvoweled_form)) <= 2 or \
                   (len(word_unvocalized) - len(pr.unvoweled_form)
                        - len(sf.unvoweled_form)) > 9:
                    continue
                if ((pr.classe[0] == 'N' and sf.classe[0] == 'V')
                        or (pr.classe[0] == 'V' and sf.classe[0] == 'N')
                        or (pr.classe in ['N1', 'N2', 'N3', 'N5'])):
                    continue
            # Reaching here means the prefix is compatible with the
            # suffix and the base size is acceptable.
            base = word
            # Strip the prefix from the base while keeping the tashkeel:
            # consume characters until each prefix letter is matched.
            if pr:
                for char in pr.unvoweled_form:
                    while char != base[0]:
                        base = base[1:]
                    base = base[1:]
                # Drop any diacritics left dangling at the front.
                while araby.is_tashkeel(base[0]):
                    base = base[1:]
            # Strip the suffix from the base while keeping the tashkeel:
            # cut at the rightmost occurrence of each suffix letter,
            # scanning the suffix letters from last to first.
            if sf:
                r_sf = [c for c in sf.unvoweled_form]
                r_sf.reverse()
                for char in r_sf:
                    base = base[:base.rindex(char)]
            combinaisons_possibles.append({
                'Base': base,
                'Préfixe': pr,
                'Suffixe': sf
            })
    return combinaisons_possibles
def test_vocalizedlike(self):
    # vocalizedlike(word1, word2)
    # A missing sukun must not break the vocalized-like comparison.
    partly_vocalized = u"ضَربٌ"
    fully_vocalized = u"ضَرْبٌ"
    self.assertTrue(Araby.vocalizedlike(partly_vocalized, fully_vocalized))
from pyarabic.unshape import unshaping_line
import arabic_reshaper
from pyarabic import araby

# Demo script: compare unshaped letter sequences from data.txt against
# the reference lines in right.txt using vocalizedlike.

# FIX: use context managers — the original opened both files without
# ever closing them, leaking one handle per loop iteration.
with open('data.txt', 'r') as f:
    lignes = f.readlines()

print(araby.vocalizedlike('ب ر ي ت'.replace(' ', ''), 'بريت'))

for ligne in lignes:
    # Re-read the reference list each iteration, as the original did,
    # but close the file deterministically.
    with open('right.txt', 'r') as f2:
        rights = f2.readlines()
    print(rights)
    for right in rights:
        if araby.vocalizedlike(unshaping_line(ligne).replace(' ', ''),
                               unshaping_line(right)):
            print(unshaping_line(right).encode('utf8'))
            print('بريت')
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # The entered text is assumed to be correctly vocalized; strip its
    # marks and re-vocalize automatically, then diff the two.
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    vocalizer = ArabicVocalizer.TashkeelClass()
    # Suggestion output: a list of per-word dicts (keys used below:
    # 'chosen', 'semi', 'inflect', 'link', 'rule').
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)
    text1 = correct_text
    displayed_html = u""
    texts = [
        text1,
    ]
    list1 = []
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1) != len(list2):
        # Tokenization mismatch between reference and suggestion lists:
        # dump the aligned prefix for debugging and abort the process.
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print(u"'%s'\t'%s'" % (list1[i], list2[i].get('chosen', ''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]       # reference (correct) word
            wo1_strip = wo1
            wo2 = list2[i]['chosen']
            wo2_strip = list2[i]['semi']  # word without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            if araby.vocalizedlike(wo1, wo2):
                # Exact (vocalized-like) match: count as correct.
                if wo2 == "\n":
                    wo2 = "<br/>"
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
                correct += 1
            else:
                incorrect += 1
                # Classify the kind of mismatch for CSS highlighting:
                # only the final (inflection) mark differs -> diff-mark.
                wo1_strip = wo1
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    style = 'diff-mark'
                else:
                    # Compare the last character of each word: equal
                    # harakas, or a haraka on exactly one side, means the
                    # word body differs -> diff-word; otherwise everything
                    # differs -> diff-all.
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2) and wm1 == wm2) \
                       or (bool(araby.is_haraka(wm1)) ^ bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
    # NOTE(review): raises ZeroDivisionError when total == 0 — confirm
    # callers never pass empty text.
    per_correct = round(correct * 100.00 / total, 2)
    per_incorrect = round(incorrect * 100.00 / total, 2)
    result = [
        displayed_html,
        "correct:%0.2f%%, incorrect:%0.2f%%" % (per_correct, per_incorrect)
    ]
    return result  #correct*100/total
"Taha", ] word1=u"" for word in word_list: print word.encode('utf8'),'\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print; word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like", word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1=u"" for word in word_list: print word.encode('utf8'),'\t', if araby.is_vocalized(word): print ' is vocalized',
"Taha", ] word1="" for word in word_list: print(word,'\t') if araby.is_vocalized(word): print(' is vocalized') ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print(' is vocalized text') if araby.is_arabicword(word): print(' is valid word') else: print("invalid arabic word") print(' strip harakat', araby.strip_harakat(word)) print(' strip tashkeel', araby.strip_tashkeel(word)) print(' strip tatweel',araby.strip_tatweel(word)) print(' normalize ligature ', araby.normalize_ligature(word)) if araby.vocalizedlike(word, word1): print("vocalized_like") print(); word1=word; if araby.vocalizedlike("العربية","العرَبية"): print("vocalized_like") word="الْعَرَيِيّةُ" word_list=[ "الْعَرَيِيّةُ", "العربية", "الْعَرَيِيّةُ الفصحى", "غير مشكول", "Taha", ] word1="" for word in word_list: print(word,'\t') if araby.is_vocalized(word): print(' is vocalized')
] word1 = u"" for word in word_list: print word.encode('utf8'), '\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel', araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode( 'utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print "vocalized_like", word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print word.encode('utf8'), '\t', if araby.is_vocalized(word): print ' is vocalized',