Example no. 1
def get_suffix_variants(word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي.
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.        
    @return: variant of suffixes  (vocalized suffix and vocalized 
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    newsuffix = suffix  #default value
    #if the word ends by a haraka
    if not enclitic_nm and word[-1:] in (
            araby.ALEF_MAKSURA, araby.YEH,
            araby.ALEF) and araby.is_haraka(suffix):
        newsuffix = u""

    #generate the suffix without the I'rab short mark
    # we look up with the given suffix because the new suffix may have
    # changed and might not be found in the table
    if u'متحرك' in ssconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffix_non_irab_mark = araby.strip_lastharaka(newsuffix)
    else:
        suffix_non_irab_mark = newsuffix

    return newsuffix, suffix_non_irab_mark
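This function leans on three pyarabic.araby helpers; below is a minimal Python 3 sketch of just those calls, assuming only that pyarabic is installed (the ssconst suffix table is not needed):

import pyarabic.araby as araby

print(araby.strip_tashkeel(u"مُدَرِّس"))                    # -> مدرس : all harakat and shadda removed
print(araby.is_haraka(araby.DAMMA))                         # -> True : a lone haraka can be a suffix
print(araby.strip_lastharaka(u"مُدَرِّسُ") == u"مُدَرِّس")   # -> True : only the final I'rab mark is dropped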
Example no. 2
def get_suffix_variants(word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي.
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.        
    @return: variant of suffixes  (vocalized suffix and vocalized 
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    newsuffix = suffix #default value
    #if the word ends by a haraka
    if suffix.find(araby.TEH_MARBUTA) >= 0 and len(enclitic_nm) > 0:
        newsuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)

    elif not enclitic_nm and word[-1:] in (araby.YEH, araby.ALEF) and araby.is_haraka(suffix):
        newsuffix = u""        
    #generate the suffix without the I'rab short mark
    # we look up with the given suffix because the new suffix may have
    # changed and might not be found in the table
    if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffix_non_irab_mark = araby.strip_lastharaka(newsuffix)
    else:
        suffix_non_irab_mark = newsuffix
    return newsuffix, suffix_non_irab_mark 
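The extra branch in this variant turns Teh Marbuta into Teh when an enclitic follows; a small sketch of that substitution, assuming only pyarabic and the standard re module:

import re
import pyarabic.araby as araby

suffix = araby.TEH_MARBUTA + araby.DAMMA                  # e.g. the suffix ةُ
joined = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
print(joined == araby.TEH + araby.DAMMA)                  # True: ةُ becomes تُ before an enclitic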
Example no. 3
def uniformate_suffix(word):
    """ separate the harakat and the letters of the given word, 
    it return two strings ( the word without harakat and the harakat).
    If the weaked letters are reprsented as long harakat and striped 
    from the word.
    """
    ## type : affix : uniformate affixes
    ##    word = normalize_affix(word)
    word = word.replace(SHADDA, SUKUN + SHADDA)
    shakl = u""
    word_nm = u""
    i = 0
    len_word = len(word)
    #    print "len word", len(word)
    while i < len_word:
        if not araby.is_shortharaka(word[i]):  # not in HARAKAT:
            word_nm += word[i]
            if i + 1 < len(word) and araby.is_shortharaka(word[i + 1]):
                if word[i + 1] == FATHA:
                    if i+2 < len(word) and word[i+2] == ALEF and \
                                 i+3 < len(word):
                        shakl += vconst.ALEF_HARAKA
                        i += 3
                    else:
                        shakl += FATHA
                        i += 2
                elif word[i+1] == DAMMA and i+2 < len(word) and \
                       word[i+2] == WAW:
                    if i + 3 >= len(word) or not araby.is_shortharaka(
                            word[i + 3]):
                        shakl += vconst.WAW_HARAKA
                        i += 3
                    else:
                        shakl += DAMMA
                        i += 2
                elif word[i+1] == KASRA and i+2 < len(word) and \
                 word[i+2] == YEH:
                    if i + 3 >= len(word) or not araby.is_shortharaka(
                            word[i + 3]):
                        shakl += vconst.YEH_HARAKA
                        i += 3
                    else:
                        shakl += KASRA
                        i += 2
                else:
                    shakl += word[i + 1]
                    i += 2

            elif i + 1 < len(word) and araby.is_haraka(word[i + 1]):
                # a tanwin follows: record it and skip both characters
                shakl += word[i + 1]
                i += 2
            else:
                shakl += vconst.NOT_DEF_HARAKA
                i += 1
        else:
            i += 1
    if len(word_nm) == len(shakl):
        return (word_nm, shakl)
    else:
        return (u"", u"")
Example no. 4
def uniformate_suffix(word):
    """ separate the harakat and the letters of the given word, 
    it return two strings ( the word without harakat and the harakat).
    If the weaked letters are reprsented as long harakat and striped 
    from the word.
    """
    ## type : affix : uniformate affixes
##    word = normalize_affix(word)
    word = word.replace(SHADDA, SUKUN+SHADDA)
    shakl = u""
    word_nm = u""
    i = 0
    len_word = len(word)
#    print "len word", len(word)
    while i < len_word:
        if not araby.is_shortharaka(word[i]):  # not in HARAKAT:
            word_nm += word[i]
            if i+1 < len(word) and araby.is_shortharaka(word[i+1]):
                if word[i+1] == FATHA :
                    if i+2 < len(word) and word[i+2] == ALEF and \
                                 i+3 < len(word):
                        shakl += vconst.ALEF_HARAKA
                        i += 3
                    else :
                        shakl += FATHA
                        i += 2
                elif word[i+1] == DAMMA and i+2 < len(word) and \
                       word[i+2] == WAW:
                    if i+3 >= len(word) or not araby.is_shortharaka(word[i+3]):
                        shakl += vconst.WAW_HARAKA
                        i += 3
                    else :
                        shakl += DAMMA
                        i += 2
                elif word[i+1] == KASRA and i+2 < len(word) and \
                 word[i+2] == YEH:
                    if i+3 >= len(word) or not araby.is_shortharaka(word[i+3]):
                        shakl += vconst.YEH_HARAKA
                        i += 3
                    else :
                        shakl += KASRA
                        i += 2
                else :
                    shakl += word[i+1]
                    i += 2

            elif i+1 < len(word) and araby.is_haraka(word[i+1]):
                # a tanwin follows: record it and skip both characters
                shakl += word[i+1]
                i += 2
            else:
                shakl += vconst.NOT_DEF_HARAKA
                i += 1
        else: i += 1
    if len(word_nm) == len(shakl):
        return (word_nm, shakl)
    else: return (u"", u"")
Example no. 5
    def test_is_letter(self):

        self.assertTrue(Araby.is_sukun(Araby.SUKUN))
        self.assertTrue(Araby.is_shadda(Araby.SHADDA))
        self.assertTrue(Araby.is_tatweel(Araby.TATWEEL))

        for archar in Araby.TANWIN:
            self.assertTrue(Araby.is_tanwin(archar))

        for archar in Araby.TASHKEEL:
            self.assertTrue(Araby.is_tashkeel(archar))

        for haraka in Araby.HARAKAT:
            self.assertTrue(Araby.is_haraka(haraka))

        for short_haraka in Araby.SHORTHARAKAT:
            self.assertTrue(Araby.is_shortharaka(short_haraka))

        for liguature in Araby.LIGUATURES:
            self.assertTrue(Araby.is_ligature(liguature))

        for hamza in Araby.HAMZAT:
            self.assertTrue(Araby.is_hamza(hamza))

        for alef in Araby.ALEFAT:
            self.assertTrue(Araby.is_alef(alef))

        for yeh in Araby.YEHLIKE:
            self.assertTrue(Araby.is_yehlike(yeh))

        for waw in Araby.WAWLIKE:
            self.assertTrue(Araby.is_wawlike(waw))

        for teh in Araby.TEHLIKE:
            self.assertTrue(Araby.is_teh(teh))

        for small in Araby.SMALL:
            self.assertTrue(Araby.is_small(small))

        for weak in Araby.WEAK:
            self.assertTrue(Araby.is_weak(weak))

        for archar in Araby.MOON:
            self.assertTrue(Araby.is_moon(archar))

        for archar in  Araby.SUN:
            self.assertTrue(Araby.is_sun(archar))
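Outside the test class, the same predicates can be exercised directly; a minimal standalone check, assuming only pyarabic:

import pyarabic.araby as Araby

assert Araby.is_sukun(Araby.SUKUN)
assert Araby.is_shadda(Araby.SHADDA)
assert all(Araby.is_tanwin(c) for c in Araby.TANWIN)
assert all(Araby.is_haraka(c) for c in Araby.HARAKAT)
print("basic pyarabic character predicates hold")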
Example no. 6
    def get_suffix_variants(word, suffix, enclitic, mankous=False):
        """
        Get the suffix variant to be joined to the word.
        For example: word = مدرس, suffix = ة, enclitic = ي.
        The suffix is converted to Teh.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @param mankous: if the noun is mankous ends with Yeh منقوص.
        @type mankous: boolean.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        """
        #enclitic_nm = ar.strip_tashkeel(enclitic)
        enclitic_nm = enclitic  # given enclitic is not vocalized
        newsuffix = suffix  #default value
        #if the word ends by a haraka
        if suffix.find(ar.TEH_MARBUTA) >= 0 and enclitic_nm:
            newsuffix = re.sub(ar.TEH_MARBUTA, ar.TEH, suffix)

        elif not enclitic_nm and ar.is_haraka(suffix):
            if word[-1:] in (ar.YEH, ar.ALEF):
                newsuffix = u""
            elif mankous:
                # the mankous noun was stripped of its YEH just before;
                # its haraka becomes Kasratan (tanwin kasr)
                newsuffix = ar.KASRATAN
        #generate the suffix without the I'rab short mark
        # we look up with the given suffix because the new suffix may have
        # changed and might not be found in the table
        if u'متحرك' in SNC.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
            suffix_non_irab_mark = ar.strip_lastharaka(newsuffix)
        else:
            suffix_non_irab_mark = newsuffix
            
        #~ if suffix.endswith(ar.YEH+ar.SHADDA+ ar.DAMMA) and enclitic_nm.startswith(ar.YEH):
            #~ newsuffix = ar.YEH+ar.SHADDA+ ar.DAMMA
            #~ suffix_non_irab_mark = ar.YEH+ar.SHADDA            
        #~ if suffix.endswith(ar.DAMMA) and enclitic_nm.startswith( ar.YEH):
            #~ newsuffix = suffix[:-1] + ar.KASRA
            #~ suffix_non_irab_mark = suffix[:-1]            
        return newsuffix, suffix_non_irab_mark
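The mankous flag refers to nouns ending in Kasra + Yeh; a short sketch of how such an ending is detected, assuming only pyarabic (قَاضِي is used purely as an illustrative mankous noun):

import pyarabic.araby as ar

noun = u"قَاضِي"                            # illustrative mankous noun
print(noun.endswith(ar.KASRA + ar.YEH))      # True: the ending the mankous flag describes
print(ar.KASRATAN)                           # the tanwin kasr that replaces a bare haraka suffix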
Example no. 7
    def lookup(self, text, word_type=''):
        """
        Look up all word forms in the dictionary, according to word_type:
            - 'verb': look up verbs only.
            - 'noun': look up nouns.
            - 'unknown': the word is not analyzed, so search for the unvocalized word.
            - '': look up the vocalized word without a type.
        @param text: vocalized word.
        @type text: unicode.
        @param word_type: the word type can take 'verb', 'noun', 'unknown', ''.
        @type word_type: unicode.
        @return: list of dictionary entries IDs.
        @rtype: list.
        """
        idlist = []
        # strip the last haraka from the text to ensure the search
        if araby.is_haraka(text[-1:]):
            text = text[:-1]
        # homogenize with word typography:
        # strip all fatha before alef
        text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)
        if word_type == 'unknown':
            sql = u"select * FROM %s WHERE unvocalized='%s'" % (
                self.table_name, text)
        else:
            sql = u"select * FROM %s WHERE vocalized='%s'" % (
                self.table_name, text)
            if word_type == 'verb':
                sql += " AND word_type='verb' "
            elif word_type == 'noun':
                sql += " AND word_type!='verb' "
        try:
            self.cursor.execute(sql)
        except sqlite.OperationalError:
            print "Fatal Error can't execute query: file: wordfrequencydictionary"
            return []
        if self.cursor:
            # return self.cursor.fetchall()
            for row in self.cursor:
                idlist.append(row)
        return idlist
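Before the SQL query is built, the lookup key is normalized; a hedged sketch of just that normalization, using pyarabic and re (no database needed):

import re
import pyarabic.araby as araby

text = u"كِتَابُ"
if araby.is_haraka(text[-1:]):
    text = text[:-1]                                        # drop the final I'rab mark
text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)   # strip the Fatha written before Alef
print(text)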
Example no. 8
def get_suffix_variants(word, suffix, enclitic, mankous = False):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي.
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.        
    @param mankous: if the noun is mankous ends with Yeh منقوص.
    @type mankous: boolean.        
    @return: variant of suffixes  (vocalized suffix and vocalized 
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.strip_tashkeel(enclitic)
    newsuffix = suffix #default value
    #if the word ends by a haraka
    if suffix.find(araby.TEH_MARBUTA) >= 0 and enclitic_nm:
        newsuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)

    elif not enclitic_nm and araby.is_haraka(suffix):
        if word[-1:] in (araby.YEH, araby.ALEF):
            newsuffix = u""
        elif mankous:
            # the mankous noun was stripped of its YEH just before;
            # its haraka becomes Kasratan (tanwin kasr)
            newsuffix = araby.KASRATAN
    #generate the suffix without the I'rab short mark
    # we look up with the given suffix because the new suffix may have
    # changed and might not be found in the table
    if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffix_non_irab_mark = araby.strip_lastharaka(newsuffix)
    else:
        suffix_non_irab_mark = newsuffix
    return newsuffix, suffix_non_irab_mark 
Example no. 9
def vocalize(noun, proclitic, suffix, enclitic):
    """
    Join the  noun and its affixes, and get the vocalized form
    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.

    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: vocalized word.
    @rtype: unicode.
    """
    # the proclitic has only one vocalization in Arabic
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    # the enclitic can vary according to the suffix
    #print (u"vocalize: '%s' '%s'"%(enclitic, noun)).encode('utf8')
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    enclitic_voc, encl_voc_non_inflect = get_enclitic_variant(
        enclitic_voc, suffix)

    suffix_voc = suffix
    # adjust some harakat

    #strip last if tanwin or last harakat
    if ar.is_haraka(noun[-1:]):
        #(DAMMATAN, FATHATAN, KASRATAN, FATHA, DAMMA, KASRA):
        noun = noun[:-1]
    # convert Fathatan into one Fatha, in cases where the tanwin
    # is not at the end, e.g. محتوًى
    noun = noun.replace(ar.FATHATAN, ar.FATHA)

    # add shadda if the first letter is a sun letter and the proclitic
    # contains the AL definite article mark
    if u'تعريف' in snconst.COMP_PREFIX_LIST_TAGS[proclitic]["tags"]\
     and ar.is_sun(noun[0]):
        noun = u''.join([noun[0], ar.SHADDA, noun[1:]])
        # strip the Sukun from the Lam
        if proclitic_voc.endswith(ar.SUKUN):
            proclitic_voc = proclitic_voc[:-1]
    # complete the dictionary word vocalization:
    # this allows avoiding missing harakat before ALEF;
    # in the dictionary form of the word, all alefat are preceded by Fatha
    #~noun = ar.complet
    #~ print "stem_noun.vocalize; before", noun.encode('utf8');
    noun = noun.replace(ar.ALEF, ar.FATHA + ar.ALEF)
    #~ print "stem_noun.vocalize; 2", noun.encode('utf8');

    noun = noun.replace(ar.ALEF_MAKSURA, ar.FATHA + ar.ALEF_MAKSURA)
    noun = re.sub(ur"(%s)+" % ar.FATHA, ar.FATHA, noun)
    # remove initial fatha if alef is the first letter
    noun = re.sub(ur"^(%s)+" % ar.FATHA, "", noun)
    #~ print "stem_noun.vocalize; 3", noun.encode('utf8');

    # generate the word variant for some words which end with special
    # letters like Teh_marbuta, Alef_maksura, or Hamza;
    # the variant is influenced by the suffix harakat,
    # for example مدرسة+ي = مدرست+ي
    mankous = noun.endswith(ar.KASRA + ar.YEH)

    noun = get_word_variant(noun, suffix, enclitic)

    # generate the suffix variant: if the suffix is Teh_marbuta,
    # Alef_maksura, or Hamza, the variant is influenced by the enclitic harakat,
    # for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(
        noun, suffix_voc, enclitic, mankous)

    # generate the non-vocalized end word: the vocalized word
    # without the I'rab mark,
    # if the suffix is a short haraka
    word_non_irab_mark = ''.join([
        proclitic_voc, noun, suffix_non_irab_mark, encl_voc_non_inflect
    ])
    # adjust the semi-vocalized form
    word_non_irab_mark = re.sub(ur"(%s)+" % ar.FATHA, ar.FATHA,
                                word_non_irab_mark)
    word_non_irab_mark = re.sub(
        u"(%s%s%s)+" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRATAN),
        ar.FATHATAN + ar.ALEF_MAKSURA, word_non_irab_mark)
    word_non_irab_mark = re.sub(ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA,
                                             ar.KASRA),
                                ar.FATHA + ar.ALEF_MAKSURA, word_non_irab_mark)
    word_non_irab_mark = re.sub(ur"%s[%s|%s|%s]" % (ar.ALEF_MAKSURA, ar.DAMMA,
                                                   ar.FATHA, ar.KASRA),
                                ar.ALEF_MAKSURA, word_non_irab_mark)

    #generate vocalized form

    word_vocalized = ''.join([proclitic_voc, noun, suffix_voc, enclitic_voc])
    #used for spelling purposes
    segmented = '-'.join([proclitic_voc, noun, suffix_voc, enclitic_voc])
    segmented = ar.strip_tashkeel(segmented)
    #~word_vocalized = ar.ajust_vocalization(word_vocalized)
    word_vocalized = re.sub(ur"(%s)+" % ar.FATHA, ar.FATHA, word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA,
                                         ar.KASRATAN),
                            ar.FATHATAN + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA,
                                         ar.DAMMATAN),
                            ar.FATHATAN + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA,
                                         ar.FATHATAN),
                            ar.FATHATAN + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRA),
                            ar.FATHA + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(ur"%s[%s|%s|%s]" % (ar.ALEF_MAKSURA, ar.DAMMA,
                                               ar.FATHA, ar.KASRA),
                            ar.ALEF_MAKSURA, word_vocalized)
    return word_vocalized, word_non_irab_mark, segmented
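The completion step that writes a Fatha before every Alef and then collapses doubled Fathas can be tried on its own; a Python 3 sketch assuming only pyarabic and re:

import re
import pyarabic.araby as ar

noun = u"كتاب"                                      # unvocalized dictionary form
noun = noun.replace(ar.ALEF, ar.FATHA + ar.ALEF)    # a Fatha before every Alef
noun = re.sub(u"(%s)+" % ar.FATHA, ar.FATHA, noun)  # collapse any doubled Fatha
print(noun)                                         # كتَاب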
Example no. 10
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is vocalized correctly
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    vocalizer = ArabicVocalizer.TashkeelClass()
    #~vocalized_text = vocalizer.tashkeel(text)
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)

    # compare the vocalized text with the correct text
    text1 = correct_text
    #~text2 = vocalized_text
    displayed_html = u""

    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    #~texts = vocalizer.analyzer.split_into_phrases(text1)
    texts = [
        text1,
    ]
    list1 = []
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1) != len(list2):
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print(u"'%s'\t'%s'" %
                  (list1[i], list2[i].get('chosen', ''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]
            wo1_strip = wo1
            wo2 = list2[i]['chosen']
            wo2_strip = list2[i]['semi']  # words without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            #~if araby.is_vocalized(wo2) and araby.vocalizedlike(wo1, wo2):
            if araby.vocalizedlike(wo1, wo2):
                if wo2 == "\n":
                    wo2 = "<br/>"
                #~displayed_html += u" " + wo2
                displayed_html += u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)

                correct += 1
            else:
                incorrect += 1
                # green for last mark difference
                wo1_strip = wo1
                #~wo2_strip = araby.strip_lastharaka(wo2)
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    style = 'diff-mark'
                else:
                    # if the last marks are equal
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2) and wm1 == wm2) \
                    or (bool(araby.is_haraka(wm1)) ^  bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
    per_correct = round(correct * 100.00 / total, 2)
    per_incorrect = round(incorrect * 100.00 / total, 2)
    result = [
        displayed_html,
        "correct:%0.2f%%, incorrect:%0.2f%%" % (per_correct, per_incorrect)
    ]
    return result  #correct*100/total
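The comparison hinges on araby.vocalizedlike, which treats a partially vocalized word as matching its fully vocalized form; a short illustration, assuming only pyarabic:

import pyarabic.araby as araby

print(araby.vocalizedlike(u"كِتَاب", u"كتاب"))   # expected True : same letters, missing harakat tolerated
print(araby.vocalizedlike(u"كِتَاب", u"كُتُب"))   # expected False : the letters themselves differ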
Example no. 11
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append("../")
#~ import  pyarabic.harf as harf
#from   pyarabic import harf
import  pyarabic.araby as araby

for c in araby.arabicrange():
    print c.encode('utf8'),'\t', araby.name(c).encode('utf8'),
    print '\t',
    if araby.is_sukun(c): print "sukun",
    if araby.is_haraka(c): print "haraka",
    if araby.is_shadda(c): print "shadda",
    if araby.is_tatweel(c): print "tatweel",
    if araby.is_tashkeel(c): print "tashkeel",
    if araby.is_tanwin(c): print "tanwin",
    if araby.is_shortharaka(c): print "short haraka",
    if araby.is_ligature(c): print 'ligature',
    if araby.is_hamza(c): print 'hamza',
    if araby.is_alef(c): print 'alef',
    if araby.is_yehlike(c): print 'yeh',
    if araby.is_wawlike(c): print 'waw',
    if araby.is_teh(c): print 'teh',
    if araby.is_small(c): print 'small',
    if araby.is_weak(c): print 'weak',
    if araby.is_moon(c): print 'moon',
    if araby.is_sun(c): print 'sun',
    print araby.order(c),
    print
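A Python 3 spot-check of the two helpers the script loops over, assuming pyarabic is installed (the exact character count depends on the installed version):

import pyarabic.araby as araby

chars = list(araby.arabicrange())
print(len(chars))                 # size of the Arabic character range (version dependent)
print(araby.name(araby.ALEF))     # human-readable name of ALEF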
Example no. 12
def vocalize(noun, proclitic, suffix, enclitic):
    """
    Join the  noun and its affixes, and get the vocalized form
    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.

    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.        
    @return: vocalized word.
    @rtype: unicode.
    """
    # the proclitic has only one vocalization in Arabic
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    # the enclitic can vary according to the suffix
    #print (u"vocalize: '%s' '%s'"%(enclitic, noun)).encode('utf8')
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    enclitic_voc,enclitic_voc_non_inflected  = get_enclitic_variant(enclitic_voc, suffix) 

    suffix_voc = suffix
    # adjust some harakat
    
    #strip last if tanwin or last harakat
    if araby.is_haraka(noun[-1:]):
        #(DAMMATAN, FATHATAN, KASRATAN, FATHA, DAMMA, KASRA):
        noun = noun[:-1]
    # convert Fathatan into one Fatha, in cases where the tanwin
    # is not at the end, e.g. محتوًى
    noun = noun.replace(araby.FATHATAN, araby.FATHA)

    # add shadda if the first letter is a sun letter and the proclitic
    # contains the AL definite article mark
    if (u'تعريف' in snconst.COMP_PREFIX_LIST_TAGS[proclitic]["tags"]\
     and araby.is_sun(noun[0])):
        noun = u''.join([noun[0], araby.SHADDA, noun[1:]])
        # strip the Sukun from the Lam
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]
    # generate the word variant for some words which end with special
    # letters like Teh_marbuta, Alef_maksura, or Hamza;
    # the variant is influenced by the suffix harakat,
    # for example مدرسة+ي = مدرست+ي
    noun = get_word_variant(noun, suffix+enclitic)

    # generate the suffix variant: if the suffix is Teh_marbuta,
    # Alef_maksura, or Hamza, the variant is influenced by the enclitic harakat,
    # for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(noun,
     suffix_voc, enclitic)


    
    # complete the dictionary word vocalization:
    # this allows avoiding missing harakat before ALEF;
    # in the dictionary form of the word, all alefat are preceded by Fatha
    #~noun = araby.complet
    noun = noun.replace(araby.ALEF, araby.FATHA + araby.ALEF)

    noun = noun.replace(araby.ALEF_MAKSURA, araby.FATHA + araby.ALEF_MAKSURA)
    noun = re.sub(ur"(%s)+"%araby.FATHA , araby.FATHA, noun)
    # remove initial fatha if alef is the first letter
    noun = re.sub(ur"^(%s)+"%araby.FATHA , "", noun)
    
    # generate the non-vocalized end word: the vocalized word
    # without the I'rab mark,
    # if the suffix is a short haraka
    word_non_irab_mark = ''.join([ proclitic_voc,  noun, 
    suffix_non_irab_mark,   enclitic_voc_non_inflected]) 
    # adjust the semi-vocalized form
    word_non_irab_mark  = re.sub(ur"(%s)+"%araby.FATHA , araby.FATHA, word_non_irab_mark )
    word_non_irab_mark  = re.sub(ur"(%s%s%s)+"%(araby.FATHA, araby.ALEF_MAKSURA, araby.KASRATAN)
 , araby.FATHATAN + araby.ALEF_MAKSURA, word_non_irab_mark )    
    word_non_irab_mark  = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA, araby.KASRA)
 , araby.FATHA + araby.ALEF_MAKSURA, word_non_irab_mark ) 
    word_non_irab_mark  = re.sub(ur"%s[%s|%s|%s]"%(araby.ALEF_MAKSURA, araby.DAMMA, araby.FATHA, araby.KASRA)
 , araby.ALEF_MAKSURA, word_non_irab_mark ) 
    
    #generate vocalized form
    
    word_vocalized = ''.join([ proclitic_voc, noun, suffix_voc, 
       enclitic_voc])
    #~word_vocalized = araby.ajust_vocalization(word_vocalized)
    word_vocalized = re.sub(ur"(%s)+"%araby.FATHA , araby.FATHA, word_vocalized)
    word_vocalized = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA, araby.KASRATAN)
     , araby.FATHATAN + araby.ALEF_MAKSURA, word_vocalized) 
    word_vocalized = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA, araby.DAMMATAN)
     , araby.FATHATAN + araby.ALEF_MAKSURA, word_vocalized) 
    word_vocalized = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA, araby.FATHATAN)
     , araby.FATHATAN + araby.ALEF_MAKSURA, word_vocalized)    
    word_vocalized = re.sub(ur"%s%s%s"%(araby.FATHA, araby.ALEF_MAKSURA, araby.KASRA)
     , araby.FATHA + araby.ALEF_MAKSURA, word_vocalized) 
    word_vocalized = re.sub(ur"%s[%s|%s|%s]"%(araby.ALEF_MAKSURA, araby.DAMMA, araby.FATHA, araby.KASRA)
     , araby.ALEF_MAKSURA, word_vocalized)      
    return word_vocalized, word_non_irab_mark 
Example no. 13
from __future__ import (
    absolute_import,
    print_function,
    unicode_literals,
    division,
)
import sys
sys.path.append("../")
#~ import  pyarabic.harf as harf
#from   pyarabic import harf
import pyarabic.araby as araby

for c in araby.arabicrange():
    print(c, '\t', araby.name(c), end=" ")
    print('\t', end=" ")
    if araby.is_sukun(c): print("sukun", end=" ")
    if araby.is_haraka(c): print("haraka", end=" ")
    if araby.is_shadda(c): print("shadda", end=" ")
    if araby.is_tatweel(c): print("tatweel", end=" ")
    if araby.is_tashkeel(c): print("tashkeel", end=" ")
    if araby.is_tanwin(c): print("tanwin", end=" ")
    if araby.is_shortharaka(c): print("short haraka", end=" ")
    if araby.is_ligature(c): print('ligature', end=" ")
    if araby.is_hamza(c): print('hamza', end=" ")
    if araby.is_alef(c): print('alef', end=" ")
    if araby.is_yehlike(c): print('yeh', end=" ")
    if araby.is_wawlike(c): print('waw', end=" ")
    if araby.is_teh(c): print('teh', end=" ")
    if araby.is_small(c): print('small', end=" ")
    if araby.is_weak(c): print('weak', end=" ")
    if araby.is_moon(c): print('moon', end=" ")
Example no. 14
        #drop harf alef
        word1 = word.replace(araby.second_char(word), u'')
        index = listWords.index(word)
        listWords[index] = word1

    if (araby.waznlike(word, u'يفعلوا') or araby.waznlike(word, u'افعلوا')
            or araby.waznlike(word, u'فعلوا')) and (araby.last_char(word)
                                                    == u'ا'):
        #drop harf alef
        word1 = word.replace(araby.last_char(word), u'')
        index = listWords.index(word)
        listWords[index] = word1

    if word in asmaaIshara:
        # replace the third char with ا if the second char is a haraka;
        # otherwise we change the second char
        if araby.is_haraka(araby.second_char(word)):
            # put the alef after the haraka (third position)
            #word1 = araby.first_char(ss) + u'ا'+ ss[1:]
            word1 = word[:2] + u'ا' + word[2:]
            index = listWords.index(word)
            listWords[index] = word1
        else:
            # put the alef as the second char
            word1 = araby.first_char(word) + u'ا' + word[1:]
            index = listWords.index(word)
            listWords[index] = word1

    #in these words we will add alef to the second last character
    if word == u'الله':
        word1 = word[:3] + u'ا' + araby.last_char(word)
        index = listWords.index(word)
        listWords[index] = word1
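The branching above relies on araby.waznlike, which compares a word's shape against a فعل pattern, and on araby.last_char; a hedged sketch assuming only pyarabic:

import pyarabic.araby as araby

print(araby.waznlike(u"يكتبوا", u"يفعلوا"))   # expected True : the word matches the pattern shape
print(araby.last_char(u"يكتبوا") == u'ا')     # True : the trailing alef the code strips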
Example no. 15
    for token in text:
        frequency[token] += 1
# remove words that occur only rarely
texts = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(texts)
dictionary.save(r'D:\GraduationProject1\projectITSelf\myDict.dict')  # raw string keeps the backslashes literal
print(dictionary)
print(dictionary.token2id)

# we want to check the haraka of the last letter of each verse
aa5irHarf = []  # list of the last character of each line, or its haraka
for line in f:
    lastChar = araby.last_char(line)
    aa5irHarf.append(lastChar)
    if (araby.is_haraka(lastChar)):
        if (lastChar == araby.DAMMA):
            print(araby.DAMMA)

        elif (lastChar == araby.FATHA):
            print(araby.FATHA)

        elif (lastChar == araby.KASRA):
            print(araby.KASRA)

        elif (lastChar == araby.DAMMATAN):
            print(araby.DAMMATAN)

        elif (lastChar == araby.FATHATAN):
            print(araby.FATHATAN)
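The rhyme check reads the last character of each line and asks whether it is a haraka; a small sketch on a single illustrative line of verse, assuming only pyarabic:

import pyarabic.araby as araby

line = u"قِفا نَبكِ مِن ذِكرى حَبيبٍ وَمَنزِلِ"   # illustrative verse ending in a Kasra
last = araby.last_char(line)
print(araby.is_haraka(last), last == araby.KASRA)   # True True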
Example no. 16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append("../")
#~ import  pyarabic.harf as harf
#from   pyarabic import harf
import pyarabic.araby as araby

for c in araby.arabicrange():
    print c.encode('utf8'), '\t', araby.name(c).encode('utf8'),
    print '\t',
    if araby.is_sukun(c): print "sukun",
    if araby.is_haraka(c): print "haraka",
    if araby.is_shadda(c): print "shadda",
    if araby.is_tatweel(c): print "tatweel",
    if araby.is_tashkeel(c): print "tashkeel",
    if araby.is_tanwin(c): print "tanwin",
    if araby.is_shortharaka(c): print "short haraka",
    if araby.is_ligature(c): print 'ligature',
    if araby.is_hamza(c): print 'hamza',
    if araby.is_alef(c): print 'alef',
    if araby.is_yehlike(c): print 'yeh',
    if araby.is_wawlike(c): print 'waw',
    if araby.is_teh(c): print 'teh',
    if araby.is_small(c): print 'small',
    if araby.is_weak(c): print 'weak',
    if araby.is_moon(c): print 'moon',
    if araby.is_sun(c): print 'sun',
    print araby.order(c),
    print
Example no. 17
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
sys.path.append('../')
from pyarabic import araby


for c in araby.arabicrange():
    print(c, '\t', araby.name(c), end=" ")
    print('\t', end=" ")
    if araby.is_sukun(c): print("sukun", end=" ")
    if araby.is_haraka(c): print("haraka", end=" ")
    if araby.is_shadda(c): print("shadda", end=" ")
    if araby.is_tatweel(c): print("tatweel", end=" ")
    if araby.is_tashkeel(c): print("tashkeel", end=" ")
    if araby.is_tanwin(c): print("tanwin", end=" ")
    if araby.is_shortharaka(c): print("short haraka", end=" ")
    if araby.is_ligature(c): print('ligature', end=" ")
    if araby.is_hamza(c): print('hamza', end=" ")
    if araby.is_alef(c): print('alef', end=" ")
    if araby.is_yehlike(c): print('yeh', end=" ")
    if araby.is_wawlike(c): print('waw', end=" ")
    if araby.is_teh(c): print('teh', end=" ")
    if araby.is_small(c): print('small', end=" ")
    if araby.is_weak(c): print('weak', end=" ")
    if araby.is_moon(c): print('moon', end=" ")
    if araby.is_sun(c): print('sun', end=" ")
    print(araby.order(c), end=" ")
    print()
word = u"الْعَرَيِيّةُ"
Example no. 18
    def lookup(self, text, word_type=''):
        """
        Look up all word forms in the dictionary, according to word_type:
            - 'verb': look up verbs only.
            - 'noun': look up nouns.
            - 'unknown': the word is not analyzed, so search for the unvocalized word.
            - '': look up the vocalized word without a type.

        
        Example:
            >>> mydict = WordFreqDictionary('wordfreq')
            >>> wordlist = [u"صلاة", u'كرة', u"قَطَرً", u"أَرْض"]
            >>> for word in wordlist:
            >>>    print("word freq", mydict.get_freq(word))
            >>>    idlist = mydict.lookup(word)
            >>>    for row in idlist:
            >>>        row = dict(row)
            >>>        print('frequency', row['freq'])
            >>>        print(repr(row).decode("unicode-escape"))        
            صلاة  0
            word freq 0
            كرة  0
            word freq 0
            قَطَرً  2
            [(984, u'\u0642\u064e\u0637\u064e\u0631', u'\u0642\u0637\u0631', u'noun_prop', 154772, u'\u064e'), (13874, u'\u0642\u064e\u0637\u064e\u0631', u'\u0642\u0637\u0631', u'verb', 1859, u'\u064e')]
            word freq 154772
            frequency 154772
            {'vocalized': u'قَطَر', 'word_type': u'noun_prop', 'unvocalized': u'قطر', 'future_type': u'َ', 'freq': 154772, 'id': 984}
            frequency 1859
            {'vocalized': u'قَطَر', 'word_type': u'verb', 'unvocalized': u'قطر', 'future_type': u'َ', 'freq': 1859, 'id': 13874}
            أَرْض  1
            [(349, u'\u0623\u064e\u0631\u0652\u0636', u'\u0623\u0631\u0636', u'noun', 389839, u'\u064e')]
            word freq 389839
            frequency 389839
            {'vocalized': u'أَرْض', 'word_type': u'noun', 'unvocalized': u'أرض', 'future_type': u'َ', 'freq': 389839, 'id': 349}

        @param text:vocalized word.
        @type text: unicode.
        @param word_type: the word type can take 'verb', 'noun', 'unknown', ''.
        @type word_type: unicode.        
        @return: list of dictionary entries IDs.
        @rtype: list.
    
        """
        idlist = []
        # strip the last haraka from the text to ensure the search
        #
        if araby.is_haraka(text[-1:]):
            text = text[:-1]
        # homogenize with word typography:
        # strip all fatha before alef
        text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)
        if word_type == 'unknown':
            sql = u"select * FROM %s WHERE unvocalized='%s'" % (
                self.table_name, text)
        else:
            sql = u"select * FROM %s WHERE vocalized='%s'" % (self.table_name,
                                                              text)
            if word_type == 'verb':
                sql += " AND word_type='verb' "
            elif word_type == 'noun':
                sql += " AND word_type!='verb' "
        try:
            self.cursor.execute(sql)
        except sqlite.OperationalError:
            print(
                "Fatal Error can't execute query: file: wordfrequencydictionary"
            )
            return []
        if self.cursor:
            # return self.cursor.fetchall()
            for row in self.cursor:
                idlist.append(row)
        return idlist
Example no. 19
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is vocalized correctly
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    cpath = os.path.join(os.path.dirname(__file__), '../tmp/')
    vocalizer = ArabicVocalizer.TashkeelClass(mycache_path=cpath)
    #~vocalized_text = vocalizer.tashkeel(text)
    #~ vocalizer.disable_cache()

    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)
    
    # compare the vocalized text with the correct text
    text1 = correct_text
    #~text2 = vocalized_text
    displayed_html = u""
    
    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    #~texts = vocalizer.analyzer.split_into_phrases(text1)
    texts = [text1, ]
    list1 =[]
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1)!= len(list2):
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print (u"'%s'\t'%s'"%(list1[i], list2[i].get('chosen',''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]
            wo1_strip = wo1            
            wo2 = list2[i]['chosen']
            wo2_strip = list2[i]['semi']  # words without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            #~if araby.is_vocalized(wo2) and araby.vocalizedlike(wo1, wo2):
            if araby.vocalizedlike(wo1, wo2):
                if wo2 == "\n":
                    wo2 = "<br/>"
                #~displayed_html += u" " + wo2
                displayed_html += u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % ( style, wo1, inflect, link, str(rule), wo2)

                correct += 1
            else:
                incorrect += 1
                # green for last mark difference
                wo1_strip = wo1
                #~wo2_strip = araby.strip_lastharaka(wo2)
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    style = 'diff-mark'
                else:
                    # if the last marks are equal
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2) and wm1 == wm2) \
                    or (bool(araby.is_haraka(wm1)) ^  bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % ( style, wo1, inflect, link, str(rule), wo2)
    per_correct = round(correct*100.00/total, 2)
    per_incorrect = round(incorrect*100.00/total, 2)
    result = [displayed_html, "correct:%0.2f%%, incorrect:%0.2f%%"%(per_correct, per_incorrect)]
    return result#correct*100/total
Example no. 20
def vocalize(stop, proclitic, suffix, enclitic):
    """
    Join the  stop and its affixes, and get the vocalized form
    @param stop: stop found in dictionary.
    @type stop: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.

    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.        
    @return: vocalized word.
    @rtype: unicode.
    """
    # the enclitic and proclitic each have only one vocalization in Arabic
    enclitic_voc = ssconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    proclitic_voc = ssconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    suffix_voc = suffix  # CONJ_SUFFIX_LIST_TAGS[suffix]["vocalized"][0]
    # adjust some harakat
    
    #strip last if tanwin or last harakat
    if suffix_voc and araby.is_haraka(stop[-1:]):
        #(DAMMATAN, FATHATAN, KASRATAN, FATHA, DAMMA, KASRA):
        stop = stop[:-1]
    # convert Fathatan into one Fatha, in cases where the tanwin
    # is not at the end, e.g. محتوًى
    stop = stop.replace(araby.FATHATAN, araby.FATHA)

    # add shadda if the first letter is a sun letter and the proclitic
    # contains the AL definite article mark
    if (u'تعريف' in ssconst.COMP_PREFIX_LIST_TAGS[proclitic]["tags"]\
     and araby.is_sun(stop[0])):
        stop = u''.join([stop[0], araby.SHADDA, stop[1:]])
        # strip the Sukun from the Lam
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]
    # generate the word variant for some words which end with special
    # letters like Teh_marbuta, Alef_maksura, or Hamza;
    # the variant is influenced by the suffix harakat,
    # for example مدرسة+ي = مدرست+ي
    stop = get_word_variant(stop, suffix+enclitic)

    # generate the suffix variant: if the suffix is Teh_marbuta,
    # Alef_maksura, or Hamza, the variant is influenced by the enclitic harakat,
    # for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(stop,
     suffix_voc, enclitic)

    # Get the enclitic variant to be joined to the word.
    # For example: word = مدرس, suffix = ِة, enclitic = هُ.
    # The enclitic is converted to Heh + Kasra.
    #~enclitic_voc = self.getEncliticVariant(stop, suffix_voc, enclitic_voc)

    # generate the non-vocalized end word: the vocalized word
    # without the I'rab mark,
    # if the suffix is a short haraka
    word_non_irab_mark = ''.join([ proclitic_voc,  stop, 
         suffix_non_irab_mark,   enclitic_voc])             
        
    word_vocalized = ''.join([proclitic_voc, stop, suffix_voc, 
       enclitic_voc])
    return word_vocalized, word_non_irab_mark