def get_suffix_variants(word, suffix, enclitic):
    """ Compute the suffix form that can be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي:
    the suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
        suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    # Start from the given suffix unchanged.
    new_suffix = suffix
    # A suffix that is a lone haraka disappears after a weak final
    # letter when no enclitic follows.
    weak_ending = word[-1:] in (
        araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)
    if not bare_enclitic and weak_ending and araby.is_haraka(suffix):
        new_suffix = u""
    # Look up the tags with the *original* suffix: the adjusted one
    # may not exist as a key in the table.
    if u'متحرك' in ssconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        no_irab_suffix = araby.strip_lastharaka(new_suffix)
    else:
        no_irab_suffix = new_suffix
    return new_suffix, no_irab_suffix
def get_suffix_variants(word, suffix, enclitic):
    """ Compute the suffix form that can be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي:
    the suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
        suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    new_suffix = suffix
    if araby.TEH_MARBUTA in suffix and bare_enclitic:
        # Teh Marbuta opens into a plain Teh when an enclitic follows.
        new_suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif (not bare_enclitic and word[-1:] in (araby.YEH, araby.ALEF)
          and araby.is_haraka(suffix)):
        # A bare haraka suffix vanishes after a weak final letter.
        new_suffix = u""
    # Use the original suffix as the lookup key; the adjusted suffix
    # may be missing from the table.
    tags = snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' in tags:
        no_irab_suffix = araby.strip_lastharaka(new_suffix)
    else:
        no_irab_suffix = new_suffix
    return new_suffix, no_irab_suffix
def uniformate_suffix(word):
    """ Separate the letters and the harakat of the given word.

    Returns a pair (word_nm, shakl): the word stripped of harakat, and
    a parallel string holding one mark per kept letter.  Weak (long)
    letters are represented as long harakat and stripped from the word.
    If the two strings cannot be aligned, (u"", u"") is returned.
    @param word: vocalized affix.
    @type word: unicode.
    @return: (letters, harakat), or (u"", u"") on length mismatch.
    @rtype: (unicode, unicode).
    """
    ## type : affix : uniformate affixes
    ## word = normalize_affix(word)
    # Give the shadda its own slot (SHADDA -> SUKUN + SHADDA) so the
    # geminated letter stays aligned with a mark of its own.
    word = word.replace(SHADDA, SUKUN + SHADDA)
    shakl = u""
    word_nm = u""
    i = 0
    len_word = len(word)
    while i < len_word:
        if not araby.is_shortharaka(word[i]):
            word_nm += word[i]
            if i + 1 < len(word) and araby.is_shortharaka(word[i + 1]):
                if word[i + 1] == FATHA:
                    # FATHA followed by a non-final ALEF collapses into
                    # the long-vowel pseudo haraka.
                    if i + 2 < len(word) and word[i + 2] == ALEF and \
                            i + 3 < len(word):
                        shakl += vconst.ALEF_HARAKA
                        i += 3
                    else:
                        shakl += FATHA
                        i += 2
                elif word[i + 1] == DAMMA and i + 2 < len(word) and \
                        word[i + 2] == WAW:
                    # DAMMA + WAW is a long vowel unless the WAW itself
                    # carries a short haraka (consonantal WAW).
                    if i + 3 >= len(word) or not araby.is_shortharaka(
                            word[i + 3]):
                        shakl += vconst.WAW_HARAKA
                        i += 3
                    else:
                        shakl += DAMMA
                        i += 2
                elif word[i + 1] == KASRA and i + 2 < len(word) and \
                        word[i + 2] == YEH:
                    # KASRA + YEH: same long-vowel rule as DAMMA + WAW.
                    if i + 3 >= len(word) or not araby.is_shortharaka(
                            word[i + 3]):
                        shakl += vconst.YEH_HARAKA
                        i += 3
                    else:
                        shakl += KASRA
                        i += 2
                else:
                    # Any other short haraka (e.g. SUKUN): record it.
                    shakl += word[i + 1]
                    i += 2
            elif i + 1 < len(word) and araby.is_haraka(word[i + 1]):
                # Non-short haraka (e.g. a tanwin): record it and
                # consume both characters.
                # BUG FIX: the original never advanced `i` in this
                # branch, so reaching it looped forever while
                # duplicating the letter and its mark.
                shakl += word[i + 1]
                i += 2
            else:
                # Bare letter with no mark: flag it as undefined.
                shakl += vconst.NOT_DEF_HARAKA
                i += 1
        else:
            i += 1
    # Sanity check: exactly one haraka per letter, otherwise give up.
    if len(word_nm) == len(shakl):
        return (word_nm, shakl)
    else:
        return (u"", u"")
def uniformate_suffix(word): """ separate the harakat and the letters of the given word, it return two strings ( the word without harakat and the harakat). If the weaked letters are reprsented as long harakat and striped from the word. """ ## type : affix : uniformate affixes ## word = normalize_affix(word) word = word.replace(SHADDA, SUKUN+SHADDA) shakl = u"" word_nm = u"" i = 0 len_word = len(word) # print "len word", len(word) while i < len_word: if not araby.is_shortharaka(word[i]): # not in HARAKAT: word_nm += word[i] if i+1 < len(word) and araby.is_shortharaka(word[i+1]): if word[i+1] == FATHA : if i+2 < len(word) and word[i+2] == ALEF and \ i+3 < len(word): shakl += vconst.ALEF_HARAKA i += 3 else : shakl += FATHA i += 2 elif word[i+1] == DAMMA and i+2 < len(word) and \ word[i+2] == WAW: if i+3 >= len(word) or not araby.is_shortharaka(word[i+3]): shakl += vconst.WAW_HARAKA i += 3 else : shakl += DAMMA i += 2 elif word[i+1] == KASRA and i+2 < len(word) and \ word[i+2] == YEH: if i+3 >= len(word) or not araby.is_shortharaka(word[i+3]): shakl += vconst.YEH_HARAKA i += 3 else : shakl += KASRA i += 2 else : shakl += word[i+1] i += 2 elif i+1 < len(word) and araby.is_haraka(word[i+1]): shakl += word[i+1] else: shakl += vconst.NOT_DEF_HARAKA i += 1 else: i += 1 if len(word_nm) == len(shakl): return (word_nm, shakl) else: return (u"", u"")
def test_is_letter(self):
    """Check the single-character classification predicates of Araby
    against the character sets they are supposed to accept."""
    self.assertTrue(Araby.is_sukun(Araby.SUKUN))
    self.assertTrue(Araby.is_shadda(Araby.SHADDA))
    self.assertTrue(Araby.is_tatweel(Araby.TATWEEL))
    for archar in Araby.TANWIN:
        self.assertTrue(Araby.is_tanwin(archar))
    for archar in Araby.TASHKEEL:
        self.assertTrue(Araby.is_tashkeel(archar))
    for haraka in Araby.HARAKAT:
        self.assertTrue(Araby.is_haraka(haraka))
    for short_haraka in Araby.SHORTHARAKAT:
        self.assertTrue(Araby.is_shortharaka(short_haraka))
    for liguature in Araby.LIGUATURES:
        self.assertTrue(Araby.is_ligature(liguature))
    for hamza in Araby.HAMZAT:
        self.assertTrue(Araby.is_hamza(hamza))
    for alef in Araby.ALEFAT:
        self.assertTrue(Araby.is_alef(alef))
    for yeh in Araby.YEHLIKE:
        self.assertTrue(Araby.is_yehlike(yeh))
    for waw in Araby.WAWLIKE:
        self.assertTrue(Araby.is_wawlike(waw))
    for teh in Araby.TEHLIKE:
        # BUG FIX: the original asserted the truthiness of the bound
        # method object itself (always True) instead of calling the
        # predicate on each teh-like character.
        self.assertTrue(Araby.is_teh(teh))
    for small in Araby.SMALL:
        self.assertTrue(Araby.is_small(small))
    for weak in Araby.WEAK:
        self.assertTrue(Araby.is_weak(weak))
    for archar in Araby.MOON:
        self.assertTrue(Araby.is_moon(archar))
    for archar in Araby.SUN:
        self.assertTrue(Araby.is_sun(archar))
def get_suffix_variants(word, suffix, enclitic, mankous=False):
    """ Compute the suffix form that can be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي:
    the suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @param mankous: if the noun is mankous ends with Yeh منقوص.
    @type mankous: boolean.
    @return: variant of suffixes (vocalized suffix and vocalized
        suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    # The enclitic arrives unvocalized, so no stripping is needed.
    bare_enclitic = enclitic
    new_suffix = suffix
    if ar.TEH_MARBUTA in suffix and bare_enclitic:
        # Teh Marbuta opens into a plain Teh before an enclitic.
        new_suffix = re.sub(ar.TEH_MARBUTA, ar.TEH, suffix)
    elif not bare_enclitic and ar.is_haraka(suffix):
        if word[-1:] in (ar.YEH, ar.ALEF):
            # A bare haraka vanishes after a weak final letter.
            new_suffix = u""
        elif mankous:
            # المنقوص حذفت ياؤه قبل قليل تحول حركته إلى تنوين كسر
            # The mankous word lost its Yeh; its haraka becomes
            # Kasratan.
            new_suffix = ar.KASRATAN
    # Use the original suffix as the lookup key; the adjusted suffix
    # may be missing from the table.
    has_haraka_tag = u'متحرك' in SNC.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if has_haraka_tag:
        no_irab_suffix = ar.strip_lastharaka(new_suffix)
    else:
        no_irab_suffix = new_suffix
    return new_suffix, no_irab_suffix
def lookup(self, text, word_type=''):
    """ look up for all word forms in the dictionary,
    according to word_type
        - 'verb': lookup for verb only.
        - 'noun': look up for nouns.
        - 'unknown': the word is not analyzed, then search
          for unvocalized word.
        - '': look for vocalized word without type
    @param text: vocalized word.
    @type text: unicode.
    @param word_type: the word type can take 'verb', 'noun',
        'unknown', ''.
    @type word_type: unicode.
    @return: list of dictionary entries IDs.
    @rtype: list.
    """
    idlist = []
    # strip the last haraka from the text to ensure the search
    # if araby.is_haraka(text[-1:]): text = text[:-1]
    # homogoneize with word typography:
    # strip all fatha before alef
    text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)
    # SECURITY FIX: pass the word through a '?' placeholder instead of
    # interpolating it into the SQL string (a quote in the word used to
    # break the query / allow injection).  The table name is trusted
    # configuration, not user input.
    if word_type == 'unknown':
        sql = u"select * FROM %s WHERE unvocalized=?" % self.table_name
    else:
        sql = u"select * FROM %s WHERE vocalized=?" % self.table_name
        if word_type == 'verb':
            sql += " AND word_type='verb' "
        elif word_type == 'noun':
            sql += " AND word_type!='verb' "
    try:
        self.cursor.execute(sql, (text,))
    except sqlite.OperationalError:
        print("Fatal Error can't execute query: file: wordfrequencydictionary")
        return []
    if self.cursor:
        # return self.curser.fetchall()
        for row in self.cursor:
            idlist.append(row)
    return idlist
def get_suffix_variants(word, suffix, enclitic, mankous=False):
    """ Compute the suffix form that can be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي:
    the suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @param mankous: if the noun is mankous ends with Yeh منقوص.
    @type mankous: boolean.
    @return: variant of suffixes (vocalized suffix and vocalized
        suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    new_suffix = suffix
    if araby.TEH_MARBUTA in suffix and bare_enclitic:
        # Teh Marbuta opens into a plain Teh before an enclitic.
        new_suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif not bare_enclitic and araby.is_haraka(suffix):
        if word[-1:] in (araby.YEH, araby.ALEF):
            # A bare haraka vanishes after a weak final letter.
            new_suffix = u""
        elif mankous:
            # المنقوص حذفت ياؤه قبل قليل تحول حركته إلى تنوين كسر
            # The mankous word lost its Yeh; its haraka becomes
            # Kasratan.
            new_suffix = araby.KASRATAN
    # Use the original suffix as the lookup key; the adjusted suffix
    # may be missing from the table.
    tags = snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' in tags:
        no_irab_suffix = araby.strip_lastharaka(new_suffix)
    else:
        no_irab_suffix = new_suffix
    return new_suffix, no_irab_suffix
def vocalize(noun, proclitic, suffix, enclitic):
    """ Join the noun and its affixes, and get the vocalized form
    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: the fully vocalized word, the word without its I'rab
        (case) mark, and a 'proclitic-stem-suffix-enclitic' segmented
        form with tashkeel stripped.
    @rtype: (unicode, unicode, unicode).
    """
    # NOTE: this module is Python 2 only (ur"" raw-unicode literals).
    # procletic have only an uniq vocalization in arabic
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    # encletic can be variant according to suffix
    #print (u"vocalize: '%s' '%s'"%(enclitic, noun)).encode('utf8')
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    enclitic_voc, encl_voc_non_inflect = get_enclitic_variant(
        enclitic_voc, suffix)
    suffix_voc = suffix
    # adjust some harakat:
    # strip the last mark if it is a tanwin or a plain harakat
    if ar.is_haraka(noun[-1:]):
        #(DAMMATAN, FATHATAN, KASRATAN, FATHA, DAMMA, KASRA):
        noun = noun[:-1]
    # convert Fathatan into one fatha, in some cases where
    # the tanwin is not at the end: eg. محتوًى
    noun = noun.replace(ar.FATHATAN, ar.FATHA)
    # add shadda if the first letter is sunny and the procletic
    # contains AL definition mark
    if u'تعريف' in snconst.COMP_PREFIX_LIST_TAGS[proclitic]["tags"]\
       and ar.is_sun(noun[0]):
        noun = u''.join([noun[0], ar.SHADDA, noun[1:]])
    # strip the Sukun from the lam of the article
    if proclitic_voc.endswith(ar.SUKUN):
        proclitic_voc = proclitic_voc[:-1]
    # completate the dictionary word vocalization:
    # this allows to avoid some missed harakat before ALEF;
    # in the dictionary form of word, all alefat are preceded by Fatha
    #~noun = ar.complet
    #~ print "stem_noun.vocalize; before", noun.encode('utf8');
    noun = noun.replace(ar.ALEF, ar.FATHA + ar.ALEF)
    #~ print "stem_noun.vocalize; 2", noun.encode('utf8');
    noun = noun.replace(ar.ALEF_MAKSURA, ar.FATHA + ar.ALEF_MAKSURA)
    # collapse runs of Fatha produced by the insertions above
    noun = re.sub(ur"(%s)+" % ar.FATHA, ar.FATHA, noun)
    # remove initial fatha if alef is the first letter
    noun = re.sub(ur"^(%s)+" % ar.FATHA, "", noun)
    #~ print "stem_noun.vocalize; 3", noun.encode('utf8');
    # generate the word variant for some words which end with special
    # letters like Teh_marbuta or Alef_maksura, or hamza;
    # the variant is influenced by the suffix harakat,
    # for example مدرسة+ي = مدرست+ي
    # mankous detection must happen before get_word_variant may strip
    # the final KASRA+YEH
    mankous = True if noun.endswith(ar.KASRA + ar.YEH) else False
    noun = get_word_variant(noun, suffix, enclitic)
    # generate the suffix variant: if the suffix is Teh_marbuta or
    # Alef_maksura, or hamza, the variant is influenced by the
    # enclitic harakat, for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(
        noun, suffix_voc, enclitic, mankous)
    # generate the non vocalized end word: the vocalized word
    # without the I3rab Mark (if the suffix is a short haraka)
    word_non_irab_mark = ''.join([
        proclitic_voc, noun, suffix_non_irab_mark, encl_voc_non_inflect
    ])
    # adjust the semivocalized form: normalize Fatha runs and
    # Alef-Maksura sequences
    word_non_irab_mark = re.sub(ur"(%s)+" % ar.FATHA, ar.FATHA,
                                word_non_irab_mark)
    word_non_irab_mark = re.sub(
        u"(%s%s%s)+" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRATAN),
        ar.FATHATAN + ar.ALEF_MAKSURA, word_non_irab_mark)
    word_non_irab_mark = re.sub(
        ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRA),
        ar.FATHA + ar.ALEF_MAKSURA, word_non_irab_mark)
    word_non_irab_mark = re.sub(
        ur"%s[%s|%s|%s]" % (ar.ALEF_MAKSURA, ar.DAMMA, ar.FATHA, ar.KASRA),
        ar.ALEF_MAKSURA, word_non_irab_mark)
    # generate vocalized form
    word_vocalized = ''.join([proclitic_voc, noun, suffix_voc, enclitic_voc])
    # used for spelling purposes
    segmented = '-'.join([proclitic_voc, noun, suffix_voc, enclitic_voc])
    segmented = ar.strip_tashkeel(segmented)
    #~word_vocalized = ar.ajust_vocalization(word_vocalized)
    # same normalizations on the fully vocalized form
    word_vocalized = re.sub(ur"(%s)+" % ar.FATHA, ar.FATHA, word_vocalized)
    word_vocalized = re.sub(
        ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRATAN),
        ar.FATHATAN + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(
        ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, ar.DAMMATAN),
        ar.FATHATAN + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(
        ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, ar.FATHATAN),
        ar.FATHATAN + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(
        ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRA),
        ar.FATHA + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(
        ur"%s[%s|%s|%s]" % (ar.ALEF_MAKSURA, ar.DAMMA, ar.FATHA, ar.KASRA),
        ar.ALEF_MAKSURA, word_vocalized)
    return word_vocalized, word_non_irab_mark, segmented
def compare_tashkeel(text):
    """ Compare tashkeel between vocalized text and automatic
    vocalized text.

    The input text is assumed to be correctly vocalized; it is
    stripped, re-vocalized automatically, then both are compared
    token by token.  Returns a two-item list: an HTML fragment with
    one <span> per token (CSS class encodes the kind of difference)
    and a summary string with correct/incorrect percentages.
    """
    # NOTE: this function is Python 2 only (print statements).
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is vocalized correctly
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    vocalizer = ArabicVocalizer.TashkeelClass()
    #~vocalized_text = vocalizer.tashkeel(text)
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)
    # compare vocalized text with the correct text
    text1 = correct_text
    #~text2 = vocalized_text
    displayed_html = u""
    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    #~texts = vocalizer.analyzer.split_into_phrases(text1)
    texts = [
        text1,
    ]
    list1 = []
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    # both tokenizations must align one-to-one, otherwise bail out
    if len(list1) != len(list2):
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print(u"'%s'\t'%s'" %
                  (list1[i], list2[i].get('chosen', ''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]  # reference (correct) token
            wo1_strip = wo1
            wo2 = list2[i]['chosen']  # automatically chosen vocalization
            wo2_strip = list2[i]['semi']  # words without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            #~if araby.is_vocalized(wo2) and araby.vocalizedlike(wo1, wo2):
            if araby.vocalizedlike(wo1, wo2):
                if wo2 == "\n":
                    wo2 = "<br/>"
                #~displayed_html += u" " + wo2
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
                correct += 1
            else:
                incorrect += 1
                # green for last mark difference
                wo1_strip = wo1
                #~wo2_strip = araby.strip_lastharaka(wo2)
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    # only the final (I'rab) mark differs
                    style = 'diff-mark'
                else:
                    # if the last marks are equal (or exactly one side
                    # has a final haraka) it is a word-level difference
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2)
                            and wm1 == wm2) \
                            or (bool(araby.is_haraka(wm1)) ^
                                bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
    per_correct = round(correct * 100.00 / total, 2)
    per_incorrect = round(incorrect * 100.00 / total, 2)
    result = [
        displayed_html,
        "correct:%0.2f%%, incorrect:%0.2f%%" % (per_correct, per_incorrect)
    ]
    return result  #correct*100/total
#!/usr/bin/env python # -*- coding: utf-8 -*- import sys sys.path.append("../") #~ import pyarabic.harf as harf #from pyarabic import harf import pyarabic.araby as araby for c in araby.arabicrange(): print c.encode('utf8'),'\t', araby.name(c).encode('utf8'), print '\t', if araby.is_sukun(c): print "sukun", if araby.is_haraka(c): print "haraka", if araby.is_shadda(c): print "shadda", if araby.is_tatweel(c): print "tatweel", if araby.is_tashkeel(c): print "tashkeel", if araby.is_tanwin(c): print "tanwin", if araby.is_shortharaka(c): print "short haraka", if araby.is_ligature(c):print " ligature", if araby.is_ligature(c):print 'ligature', if araby.is_hamza(c): print 'hamza', if araby.is_alef(c): print 'alef', if araby.is_yehlike(c): print 'yeh', if araby.is_wawlike(c): print 'waw', if araby.is_teh(c): print 'teh', if araby.is_small(c): print 'small', if araby.is_weak(c): print 'weak', if araby.is_moon(c): print 'moon', if araby.is_sun(c):print 'sun', print araby.order(c), print;
def vocalize( noun, proclitic, suffix, enclitic):
    """ Join the noun and its affixes, and get the vocalized form
    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: the fully vocalized word and the word without its I'rab
        (case) mark.
    @rtype: (unicode, unicode).
    """
    # NOTE: this module is Python 2 only (ur"" raw-unicode literals).
    # procletic have only an uniq vocalization in arabic
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    # encletic can be variant according to suffix
    #print (u"vocalize: '%s' '%s'"%(enclitic, noun)).encode('utf8')
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    enclitic_voc,enclitic_voc_non_inflected = get_enclitic_variant(enclitic_voc, suffix)
    suffix_voc = suffix
    # adjust some harakat:
    # strip the last mark if it is a tanwin or a plain harakat
    if araby.is_haraka(noun[-1:]):
        #(DAMMATAN, FATHATAN, KASRATAN, FATHA, DAMMA, KASRA):
        noun = noun[:-1]
    # convert Fathatan into one fatha, in some cases where
    # the tanwin is not at the end: eg. محتوًى
    noun = noun.replace(araby.FATHATAN, araby.FATHA)
    # add shadda if the first letter is sunny and the procletic
    # contains AL definition mark
    if (u'تعريف' in snconst.COMP_PREFIX_LIST_TAGS[proclitic]["tags"]\
        and araby.is_sun(noun[0])):
        noun = u''.join([noun[0], araby.SHADDA, noun[1:]])
    # strip the Sukun from the lam of the article
    if proclitic_voc.endswith(araby.SUKUN):
        proclitic_voc = proclitic_voc[:-1]
    # generate the word variant for some words which end with special
    # letters like Teh_marbuta or Alef_maksura, or hamza;
    # the variant is influenced by the suffix harakat,
    # for example مدرسة+ي = مدرست+ي
    noun = get_word_variant(noun, suffix+enclitic)
    # generate the suffix variant: if the suffix is Teh_marbuta or
    # Alef_maksura, or hamza, the variant is influenced by the
    # enclitic harakat, for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(noun, suffix_voc, enclitic)
    # completate the dictionary word vocalization:
    # this allows to avoid some missed harakat before ALEF;
    # in the dictionary form of word, all alefat are preceded by Fatha
    #~noun = araby.complet
    noun = noun.replace(araby.ALEF, araby.FATHA + araby.ALEF)
    noun = noun.replace(araby.ALEF_MAKSURA, araby.FATHA + araby.ALEF_MAKSURA)
    # collapse runs of Fatha produced by the insertions above
    noun = re.sub(ur"(%s)+"%araby.FATHA , araby.FATHA, noun)
    # remove initial fatha if alef is the first letter
    noun = re.sub(ur"^(%s)+"%araby.FATHA , "", noun)
    # generate the non vocalized end word: the vocalized word
    # without the I3rab Mark (if the suffix is a short haraka)
    word_non_irab_mark = ''.join([ proclitic_voc, noun,
         suffix_non_irab_mark, enclitic_voc_non_inflected])
    # adjust the semivocalized form: normalize Fatha runs and
    # Alef-Maksura sequences
    word_non_irab_mark = re.sub(ur"(%s)+"%araby.FATHA , araby.FATHA,
         word_non_irab_mark )
    word_non_irab_mark = re.sub(ur"(%s%s%s)+"%(araby.FATHA,
        araby.ALEF_MAKSURA, araby.KASRATAN) ,
        araby.FATHATAN + araby.ALEF_MAKSURA, word_non_irab_mark )
    word_non_irab_mark = re.sub(ur"%s%s%s"%(araby.FATHA,
        araby.ALEF_MAKSURA, araby.KASRA) ,
        araby.FATHA + araby.ALEF_MAKSURA, word_non_irab_mark )
    word_non_irab_mark = re.sub(ur"%s[%s|%s|%s]"%(araby.ALEF_MAKSURA,
        araby.DAMMA, araby.FATHA, araby.KASRA) ,
        araby.ALEF_MAKSURA, word_non_irab_mark )
    # generate vocalized form
    word_vocalized = ''.join([ proclitic_voc, noun, suffix_voc,
        enclitic_voc])
    #~word_vocalized = araby.ajust_vocalization(word_vocalized)
    # same normalizations on the fully vocalized form
    word_vocalized = re.sub(ur"(%s)+"%araby.FATHA , araby.FATHA,
        word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA,
        araby.KASRATAN) , araby.FATHATAN + araby.ALEF_MAKSURA,
        word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA,
        araby.DAMMATAN) , araby.FATHATAN + araby.ALEF_MAKSURA,
        word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA,
        araby.FATHATAN) , araby.FATHATAN + araby.ALEF_MAKSURA,
        word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA,
        araby.KASRA) , araby.FATHA + araby.ALEF_MAKSURA,
        word_vocalized)
    word_vocalized = re.sub(ur"%s[%s|%s|%s]"%(araby.ALEF_MAKSURA,
        araby.DAMMA, araby.FATHA, araby.KASRA) ,
        araby.ALEF_MAKSURA, word_vocalized)
    return word_vocalized, word_non_irab_mark
absolute_import, print_function, unicode_literals, division, ) import sys sys.path.append("../") #~ import pyarabic.harf as harf #from pyarabic import harf import pyarabic.araby as araby for c in araby.arabicrange(): print(c, '\t', araby.name(c), end=" ") print('\t', end=" ") if araby.is_sukun(c): print("sukun", end=" ") if araby.is_haraka(c): print("haraka", end=" ") if araby.is_shadda(c): print("shadda", end=" ") if araby.is_tatweel(c): print("tatweel", end=" ") if araby.is_tashkeel(c): print("tashkeel", end=" ") if araby.is_tanwin(c): print("tanwin", end=" ") if araby.is_shortharaka(c): print("short haraka", end=" ") if araby.is_ligature(c): print(" ligature", end=" ") if araby.is_ligature(c): print('ligature', end=" ") if araby.is_hamza(c): print('hamza', end=" ") if araby.is_alef(c): print('alef', end=" ") if araby.is_yehlike(c): print('yeh', end=" ") if araby.is_wawlike(c): print('waw', end=" ") if araby.is_teh(c): print('teh', end=" ") if araby.is_small(c): print('small', end=" ") if araby.is_weak(c): print('weak', end=" ") if araby.is_moon(c): print('moon', end=" ")
#drop harf alef word1 = word.replace(araby.second_char(word), u'') index = listWords.index(word) listWords[index] = word1 if (araby.waznlike(word, u'يفعلوا') or araby.waznlike(word, u'افعلوا') or araby.waznlike(word, u'فعلوا')) and (araby.last_char(word) == u'ا'): #drop harf alef word1 = word.replace(araby.last_char(word), u'') index = listWords.index(word) listWords[index] = word1 if word in asmaaIshara: #replace the third char with ا iza kan al7arf al tani harakih w iza la bnbaddil il harf altani if araby.is_haraka(araby.second_char(word)): #replace the third char #word1 = araby.first_char(ss) + u'ا'+ ss[1:] index = listWords.index(word) listWords[index] = word1 else: #replace second char word1 = araby.first_char(word) + u'ا' + word[1:] index = listWords.index(word) listWords[index] = word1 #in these words we will add alef to the second last character if word == u'الله': word1 = word[:3] + u'ا' + araby.last_char(word) index = listWords.index(word) listWords[index] = word1
for token in text: frequency[token] += 1 #حذف الكلمات اللي تكرارها قليل texts = [[token for token in text if frequency[token] > 1] for text in texts] dictionary = corpora.Dictionary(texts) dictionary.save('D:\GraduationProject1\projectITSelf\myDict.dict') print(dictionary) print(dictionary.token2id) #we want to check harakit alharf al a5eer mn l beet aa5irHarf = [] #list for iesh al7arf al a5eer aw 7rkto for line in f: lastChar = araby.last_char(line) aa5irHarf.append(lastChar) if (araby.is_haraka(lastChar)): if (lastChar == araby.DAMMA): print(araby.DAMMA) elif (lastChar == araby.FATHA): print(araby.FATHA) elif (lastChar == araby.KASRA): print(araby.KASRA) elif (lastChar == araby.DAMMATAN): print(araby.DAMMATAN) elif (lastChar == araby.FATHATAN): print(araby.FATHATAN)
#!/usr/bin/env python # -*- coding: utf-8 -*- import sys sys.path.append("../") #~ import pyarabic.harf as harf #from pyarabic import harf import pyarabic.araby as araby for c in araby.arabicrange(): print c.encode('utf8'), '\t', araby.name(c).encode('utf8'), print '\t', if araby.is_sukun(c): print "sukun", if araby.is_haraka(c): print "haraka", if araby.is_shadda(c): print "shadda", if araby.is_tatweel(c): print "tatweel", if araby.is_tashkeel(c): print "tashkeel", if araby.is_tanwin(c): print "tanwin", if araby.is_shortharaka(c): print "short haraka", if araby.is_ligature(c): print " ligature", if araby.is_ligature(c): print 'ligature', if araby.is_hamza(c): print 'hamza', if araby.is_alef(c): print 'alef', if araby.is_yehlike(c): print 'yeh', if araby.is_wawlike(c): print 'waw', if araby.is_teh(c): print 'teh', if araby.is_small(c): print 'small', if araby.is_weak(c): print 'weak', if araby.is_moon(c): print 'moon', if araby.is_sun(c): print 'sun', print araby.order(c), print
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Demo script: dump every character in the Arabic range with its
# name, the character classes it belongs to, and its alphabetical
# order (one label per line).
import sys
sys.path.append('../')
from pyarabic import araby

for c in araby.arabicrange():
    # character and its Unicode name
    print(c, '\t', araby.name(c))
    print('\t')
    if araby.is_sukun(c):
        print("sukun")
    if araby.is_haraka(c):
        print("haraka")
    if araby.is_shadda(c):
        print("shadda")
    if araby.is_tatweel(c):
        print("tatweel")
    if araby.is_tashkeel(c):
        print("tashkeel")
    if araby.is_tanwin(c):
        print("tanwin")
    # BUG FIX: removed the stray trailing commas left over from the
    # Python 2 -> 3 conversion (they built discarded (None,) tuples),
    # and the duplicated is_ligature test that printed the label twice.
    if araby.is_shortharaka(c):
        print("short haraka")
    if araby.is_ligature(c):
        print('ligature')
    if araby.is_hamza(c):
        print('hamza')
    if araby.is_alef(c):
        print('alef')
    if araby.is_yehlike(c):
        print('yeh')
    if araby.is_wawlike(c):
        print('waw')
    if araby.is_teh(c):
        print('teh')
    if araby.is_small(c):
        print('small')
    if araby.is_weak(c):
        print('weak')
    if araby.is_moon(c):
        print('moon')
    if araby.is_sun(c):
        print('sun')
    # alphabetical order index, then a blank line
    print(araby.order(c))
    print()

word = u"الْعَرَيِيّةُ"
def lookup(self, text, word_type=''):
    """ look up for all word forms in the dictionary,
    according to word_type
        - 'verb': lookup for verb only.
        - 'noun': look up for nouns.
        - 'unknown': the word is not analyzed, then search
          for unvocalized word.
        - '': look for vocalized word without type
    Example:
        >>> mydict = WordFreqDictionary('wordfreq')
        >>> wordlist = [u"صلاة", u'كرة', u"قَطَرً", u"أَرْض"]
        >>> for word in wordlist:
        >>>     print("word freq", mydict.get_freq(word))
        >>>     idlist = mydict.lookup(word)
        >>>     for row in idlist:
        >>>         row = dict(row)
        >>>         print('frequency', row['freq'])
    @param text: vocalized word.
    @type text: unicode.
    @param word_type: the word type can take 'verb', 'noun',
        'unknown', ''.
    @type word_type: unicode.
    @return: list of dictionary entries IDs.
    @rtype: list.
    """
    idlist = []
    # strip the last haraka from the text to ensure the search
    # if araby.is_haraka(text[-1:]): text = text[:-1]
    # homogoneize with word typography:
    # strip all fatha before alef
    text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)
    # SECURITY FIX: pass the word through a '?' placeholder instead of
    # interpolating it into the SQL string (a quote in the word used to
    # break the query / allow injection).  The table name is trusted
    # configuration, not user input.
    if word_type == 'unknown':
        sql = u"select * FROM %s WHERE unvocalized=?" % self.table_name
    else:
        sql = u"select * FROM %s WHERE vocalized=?" % self.table_name
        if word_type == 'verb':
            sql += " AND word_type='verb' "
        elif word_type == 'noun':
            sql += " AND word_type!='verb' "
    try:
        self.cursor.execute(sql, (text,))
    except sqlite.OperationalError:
        print(
            "Fatal Error can't execute query: file: wordfrequencydictionary"
        )
        return []
    if self.cursor:
        # return self.curser.fetchall()
        for row in self.cursor:
            idlist.append(row)
    return idlist
def compare_tashkeel(text):
    """ Compare tashkeel between vocalized text and automatic
    vocalized text.

    The input text is assumed to be correctly vocalized; it is
    stripped, re-vocalized automatically (using a cache directory
    under ../tmp/), then both are compared token by token.  Returns a
    two-item list: an HTML fragment with one <span> per token (CSS
    class encodes the kind of difference) and a summary string with
    correct/incorrect percentages.
    """
    # NOTE: this function is Python 2 only (print statements).
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is vocalized correctly
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    # cache directory lives next to this package
    cpath = os.path.join(os.path.dirname(__file__), '../tmp/')
    vocalizer = ArabicVocalizer.TashkeelClass(mycache_path=cpath)
    #~vocalized_text = vocalizer.tashkeel(text)
    #~ vocalizer.disable_cache()
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)
    # compare vocalized text with the correct text
    text1 = correct_text
    #~text2 = vocalized_text
    displayed_html = u""
    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    #~texts = vocalizer.analyzer.split_into_phrases(text1)
    texts = [text1, ]
    list1 =[]
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    # both tokenizations must align one-to-one, otherwise bail out
    if len(list1)!= len(list2):
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print (u"'%s'\t'%s'"%(list1[i],
                list2[i].get('chosen',''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]  # reference (correct) token
            wo1_strip = wo1
            wo2 = list2[i]['chosen']  # automatically chosen vocalization
            wo2_strip = list2[i]['semi']  # words without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            #~if araby.is_vocalized(wo2) and araby.vocalizedlike(wo1, wo2):
            if araby.vocalizedlike(wo1, wo2):
                if wo2 == "\n":
                    wo2 = "<br/>"
                #~displayed_html += u" " + wo2
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
                correct += 1
            else:
                incorrect += 1
                # green for last mark difference
                wo1_strip = wo1
                #~wo2_strip = araby.strip_lastharaka(wo2)
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    # only the final (I'rab) mark differs
                    style = 'diff-mark'
                else:
                    # if the last marks are equal (or exactly one side
                    # has a final haraka) it is a word-level difference
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2)
                            and wm1 == wm2) \
                            or (bool(araby.is_haraka(wm1)) ^
                                bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
    per_correct = round(correct*100.00/total, 2)
    per_incorrect = round(incorrect*100.00/total, 2)
    result = [displayed_html,
        "correct:%0.2f%%, incorrect:%0.2f%%"%(per_correct, per_incorrect)]
    return result#correct*100/total
def vocalize( stop, proclitic, suffix, enclitic):
    """ Join the stop word and its affixes, producing the vocalized form.
    @param stop: stop found in dictionary.
    @type stop: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: vocalized word and the word without its I'rab mark.
    @rtype: (unicode, unicode).
    """
    # Proclitic and enclitic each have exactly one vocalization.
    enclitic_voc = ssconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    proclitic_voc = ssconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    suffix_voc = suffix  #CONJ_SUFFIX_LIST_TAGS[suffix]["vocalized"][0]
    # Drop a trailing haraka/tanwin from the stem when a suffix follows
    # (DAMMATAN, FATHATAN, KASRATAN, FATHA, DAMMA, KASRA).
    if suffix_voc and araby.is_haraka(stop[-1:]):
        stop = stop[:-1]
    # A Fathatan that is not word-final collapses into a single Fatha
    # (eg. محتوًى).
    stop = stop.replace(araby.FATHATAN, araby.FATHA)
    # Sun letter after the AL definition proclitic: geminate it.
    has_definition = u'تعريف' in \
        ssconst.COMP_PREFIX_LIST_TAGS[proclitic]["tags"]
    if has_definition and araby.is_sun(stop[0]):
        stop = u''.join([stop[0], araby.SHADDA, stop[1:]])
    # The Lam of the article loses its Sukun before the word.
    if proclitic_voc.endswith(araby.SUKUN):
        proclitic_voc = proclitic_voc[:-1]
    # Word variant: a final Teh_marbuta / Alef_maksura / hamza is
    # reshaped according to the attached suffixes,
    # for example مدرسة+ي = مدرست+ي.
    stop = get_word_variant(stop, suffix + enclitic)
    # Suffix variant: likewise influenced by the enclitic harakat,
    # for example مدرس+ة+ي = مدرس+ت+ي.
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(
        stop, suffix_voc, enclitic)
    #~enclitic_voc = self.getEncliticVariant(stop, suffix_voc, enclitic_voc)
    # Assemble the semi-vocalized form (no I3rab mark) and the fully
    # vocalized form.
    word_non_irab_mark = ''.join(
        [proclitic_voc, stop, suffix_non_irab_mark, enclitic_voc])
    word_vocalized = ''.join(
        [proclitic_voc, stop, suffix_voc, enclitic_voc])
    return word_vocalized, word_non_irab_mark