Beispiel #1
0
def vocalize(noun, proclitic, prefix, suffix, enclitic):
    """
    Join the noun and its affixes, and get the vocalized form.

    The dictionary form of the noun is normalized first: a trailing
    haraka/tanwin is dropped and a FATHA is put before every ALEF and
    ALEF MAKSURA (a leading FATHA run is removed again).  The affixes
    are then attached in the order proclitic, prefix, noun, suffix,
    enclitic.

    The proclitic and enclitic are looked up in
    snconst.COMP_PREFIX_LIST_TAGS / snconst.COMP_SUFFIX_LIST_TAGS without
    any guard, so an unknown affix propagates the lookup error.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param prefix: second level prefix.
    @type prefix: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: vocalized word.
    @rtype: unicode.
    """
    # only the first vocalized form listed for each clitic is used
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

    # strip the last mark if it is a tanwin or a haraka
    if noun[-1:] in araby.HARAKAT:
        noun = noun[:-1]

    # Complete the dictionary vocalization: in the dictionary form every
    # alef is preceded by a fatha, so add the ones that may be missing.
    noun = noun.replace(araby.ALEF, araby.FATHA + araby.ALEF)
    noun = noun.replace(araby.ALEF_MAKSURA, araby.FATHA + araby.ALEF_MAKSURA)
    # the replacements above can double an already present fatha
    # (plain u"" literal: there is no backslash in the pattern, and the
    # ur"" prefix is rejected by Python 3)
    noun = re.sub(u"(%s)+" % araby.FATHA, araby.FATHA, noun)
    # remove the initial fatha if alef is the first letter
    noun = re.sub(u"^(%s)+" % araby.FATHA, u"", noun)

    # Add a shadda if the first letter is a sun letter and the proclitic
    # ends with the AL definite article.  The emptiness check keeps
    # noun[0] from raising IndexError when nothing is left of the noun.
    if (proclitic.endswith(araby.ALEF + araby.LAM) and noun
            and araby.is_sun(noun[0])):
        noun = u''.join([noun[0], araby.SHADDA, noun[1:]])
        # strip the sukun from the lam
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]

    # adapt the noun to the suffix, then to the enclitic
    noun = get_word_variant(noun, suffix)
    noun = get_word_variant(noun, enclitic)
    # the suffix itself is passed unvocalized and adapted to the enclitic
    suffix_voc = get_suffix_variant(noun, suffix, enclitic)
    return ''.join([proclitic_voc, prefix, noun, suffix_voc, enclitic_voc])
Beispiel #2
0
    def test_is_letter(self):
        """Each Araby character-class predicate must accept the members
        of the constant collection it is paired with below."""
        # single-character constants
        self.assertTrue(Araby.is_sukun(Araby.SUKUN))
        self.assertTrue(Araby.is_shadda(Araby.SHADDA))
        self.assertTrue(Araby.is_tatweel(Araby.TATWEEL))

        # (constant collection, predicate expected to accept every member)
        checks = (
            (Araby.TANWIN, Araby.is_tanwin),
            (Araby.TASHKEEL, Araby.is_tashkeel),
            (Araby.HARAKAT, Araby.is_haraka),
            (Araby.SHORTHARAKAT, Araby.is_shortharaka),
            (Araby.LIGUATURES, Araby.is_ligature),
            (Araby.HAMZAT, Araby.is_hamza),
            (Araby.ALEFAT, Araby.is_alef),
            (Araby.YEHLIKE, Araby.is_yehlike),
            (Araby.WAWLIKE, Araby.is_wawlike),
            # is_teh has to be *called* with the character: asserting the
            # bare function object is always true and checks nothing.
            (Araby.TEHLIKE, Araby.is_teh),
            (Araby.SMALL, Araby.is_small),
            (Araby.WEAK, Araby.is_weak),
            (Araby.MOON, Araby.is_moon),
            (Araby.SUN, Araby.is_sun),
        )
        for members, predicate in checks:
            for archar in members:
                self.assertTrue(
                    predicate(archar),
                    "%s rejected %r" % (predicate.__name__, archar))
Beispiel #3
0
def vocalize(noun, proclitic, prefix, suffix, enclitic):
    """
    Join the noun and its affixes, and get the vocalized form.

    A trailing haraka/tanwin is dropped from the noun, then the affixes
    are attached in the order proclitic, prefix, noun, suffix, enclitic.

    The proclitic and enclitic are looked up in
    snconst.COMP_PREFIX_LIST_TAGS / snconst.COMP_SUFFIX_LIST_TAGS without
    any guard, so an unknown affix propagates the lookup error.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param prefix: second level prefix.
    @type prefix: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: vocalized word.
    @rtype: unicode.
    """
    # only the first vocalized form listed for each clitic is used
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

    # strip the last mark if it is a tanwin or a haraka
    if noun[-1:] in araby.HARAKAT:
        noun = noun[:-1]

    # Add a shadda if the first letter is a sun letter and the proclitic
    # ends with the AL definite article.  The emptiness check keeps
    # noun[0] from raising IndexError when nothing is left of the noun.
    if (proclitic.endswith(araby.ALEF + araby.LAM) and noun
            and araby.is_sun(noun[0])):
        noun = u''.join([noun[0], araby.SHADDA, noun[1:]])
        # strip the sukun from the lam
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]

    # adapt the noun to the suffix, then to the enclitic
    noun = get_word_variant(noun, suffix)
    noun = get_word_variant(noun, enclitic)
    # the suffix itself is passed unvocalized and adapted to the enclitic
    suffix_voc = get_suffix_variant(noun, suffix, enclitic)
    return ''.join([proclitic_voc, prefix, noun, suffix_voc, enclitic_voc])
Beispiel #4
0
    # Print, on a single output line, a label for every araby predicate
    # that accepts `c` (the trailing commas keep Python 2's print on the
    # same line).  `c` is bound outside this excerpt -- presumably the
    # variable of an enclosing loop over characters; TODO confirm.
    if araby.is_shadda(c): print "shadda",
    if araby.is_tatweel(c): print "tatweel",
    if araby.is_tashkeel(c): print "tashkeel",
    if araby.is_tanwin(c): print "tanwin",
    if araby.is_shortharaka(c): print "short haraka",
    # NOTE(review): the ligature check appears twice, so a ligature is
    # reported twice (" ligature" then 'ligature') -- looks unintended.
    if araby.is_ligature(c):print " ligature",
    if araby.is_ligature(c):print 'ligature',
    if araby.is_hamza(c):    print 'hamza',
    if araby.is_alef(c): print 'alef',
    if araby.is_yehlike(c):  print 'yeh',
    if araby.is_wawlike(c):  print 'waw',
    if araby.is_teh(c):  print 'teh',
    if araby.is_small(c):    print 'small',
    if araby.is_weak(c): print 'weak',
    if araby.is_moon(c): print 'moon',
    if araby.is_sun(c):print 'sun',
    # finish the line with the value returned by araby.order(c)
    print araby.order(c),
    print;
# Demo of araby.is_vocalized over a few sample strings.
# NOTE: `word` is rebound by the loop below, so this initial value is not
# read anywhere in the visible lines.
word=u"الْعَرَيِيّةُ"
# Samples: text with and without tashkeel, plus one plain (non-unicode)
# Latin string.
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
]
# not used in the visible lines -- may be used further down; TODO confirm
word1=u""
# For each sample print the UTF-8 encoded word and a tab, followed by
# ' is vocalized' when araby.is_vocalized accepts it.  The trailing commas
# keep Python 2's print on the same line; no newline is printed in the
# visible part of the loop.
for word in word_list:
    print word.encode('utf8'),'\t',
    if araby.is_vocalized(word): print ' is vocalized',
##    if araby.isArabicstring(word): print ' iisArabicstring',
Beispiel #5
0
def vocalize(noun, proclitic, suffix, enclitic):
    """
    Join the noun and its affixes, and get the vocalized forms.

    Two strings are built from the same pieces: the fully vocalized word,
    and the "semi-vocalized" word that uses the suffix and enclitic
    variants without the I3rab (case ending) mark.

    The proclitic and enclitic are looked up in
    snconst.COMP_PREFIX_LIST_TAGS / snconst.COMP_SUFFIX_LIST_TAGS without
    any guard, so an unknown affix propagates the lookup error.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (vocalized word, vocalized word without the I3rab mark).
    @rtype: tuple of (unicode, unicode).
    """
    # the proclitic has a unique vocalization: take the first listed form
    proclitic_entry = snconst.COMP_PREFIX_LIST_TAGS[proclitic]
    proclitic_voc = proclitic_entry["vocalized"][0]
    # the enclitic can vary according to the suffix
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    enclitic_voc, enclitic_voc_non_inflected = get_enclitic_variant(
        enclitic_voc, suffix)

    # strip the last mark if it is a tanwin or a haraka
    if araby.is_haraka(noun[-1:]):
        noun = noun[:-1]
    # convert fathatan into a single fatha, for the cases where the tanwin
    # is not at the end of the word, e.g. محتوًى
    noun = noun.replace(araby.FATHATAN, araby.FATHA)

    # Add a shadda if the first letter is a sun letter and the proclitic
    # carries the AL definition tag.  The emptiness check keeps noun[0]
    # from raising IndexError when nothing is left of the noun.
    if (u'تعريف' in proclitic_entry["tags"] and noun
            and araby.is_sun(noun[0])):
        noun = u''.join([noun[0], araby.SHADDA, noun[1:]])
        # strip the sukun from the lam
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]

    # Generate the word variant for words ending with special letters
    # (teh marbuta, alef maksura, hamza); the variant depends on what
    # follows, for example مدرسة+ي = مدرست+ي
    noun = get_word_variant(noun, suffix + enclitic)

    # Generate the suffix variants (with and without the I3rab mark); the
    # variant depends on the enclitic, for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(
        noun, suffix, enclitic)

    # Complete the dictionary vocalization: in the dictionary form every
    # alef is preceded by a fatha, so add the ones that may be missing.
    noun = noun.replace(araby.ALEF, araby.FATHA + araby.ALEF)
    noun = noun.replace(araby.ALEF_MAKSURA, araby.FATHA + araby.ALEF_MAKSURA)

    # Patterns shared by the clean-up passes below.  Plain u"" literals are
    # used: the patterns contain no backslash, and the ur"" prefix is
    # rejected by Python 3.
    fatha_run = u"(%s)+" % araby.FATHA
    fatha_maksura = araby.FATHA + araby.ALEF_MAKSURA
    tanwin_maksura = araby.FATHATAN + araby.ALEF_MAKSURA
    # Alef maksura followed by one short haraka.  No "|" inside the
    # brackets: in a character class it is a literal character, not an
    # alternation, and would make the pattern swallow a "|" as well.
    maksura_haraka = u"%s[%s%s%s]" % (
        araby.ALEF_MAKSURA, araby.DAMMA, araby.FATHA, araby.KASRA)

    # the replacements above can double an already present fatha
    noun = re.sub(fatha_run, araby.FATHA, noun)
    # remove the initial fatha if alef is the first letter
    noun = re.sub(u"^(%s)+" % araby.FATHA, u"", noun)

    # Semi-vocalized form: the vocalized word without the I3rab mark.
    word_non_irab_mark = ''.join([proclitic_voc, noun,
                                  suffix_non_irab_mark,
                                  enclitic_voc_non_inflected])
    word_non_irab_mark = re.sub(fatha_run, araby.FATHA, word_non_irab_mark)
    # fatha + alef maksura + kasratan -> fathatan + alef maksura
    word_non_irab_mark = re.sub(
        u"(%s%s)+" % (fatha_maksura, araby.KASRATAN),
        tanwin_maksura, word_non_irab_mark)
    # an alef maksura carries no short haraka of its own
    word_non_irab_mark = re.sub(
        u"%s%s" % (fatha_maksura, araby.KASRA),
        fatha_maksura, word_non_irab_mark)
    word_non_irab_mark = re.sub(
        maksura_haraka, araby.ALEF_MAKSURA, word_non_irab_mark)

    # Fully vocalized form.
    word_vocalized = ''.join([proclitic_voc, noun, suffix_voc,
                              enclitic_voc])
    word_vocalized = re.sub(fatha_run, araby.FATHA, word_vocalized)
    # any tanwin after fatha + alef maksura becomes fathatan + alef maksura
    for tanwin in (araby.KASRATAN, araby.DAMMATAN, araby.FATHATAN):
        word_vocalized = re.sub(
            u"%s%s" % (fatha_maksura, tanwin),
            tanwin_maksura, word_vocalized)
    # an alef maksura carries no short haraka of its own
    word_vocalized = re.sub(
        u"%s%s" % (fatha_maksura, araby.KASRA),
        fatha_maksura, word_vocalized)
    word_vocalized = re.sub(
        maksura_haraka, araby.ALEF_MAKSURA, word_vocalized)
    return word_vocalized, word_non_irab_mark
Beispiel #6
0
def vocalize(stop, proclitic, suffix, enclitic):
    """
    Join the stop word and its affixes, and get the vocalized forms.

    Two strings are built from the same pieces: the fully vocalized word,
    and the word that uses the suffix variant without the I3rab (case
    ending) mark.

    The proclitic and enclitic are looked up in
    ssconst.COMP_PREFIX_LIST_TAGS / ssconst.COMP_SUFFIX_LIST_TAGS without
    any guard, so an unknown affix propagates the lookup error.

    @param stop: stop word found in dictionary.
    @type stop: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (vocalized word, vocalized word without the I3rab mark).
    @rtype: tuple of (unicode, unicode).
    """
    # enclitic and proclitic have a unique vocalization: take the first
    # listed form of each
    enclitic_voc = ssconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    proclitic_entry = ssconst.COMP_PREFIX_LIST_TAGS[proclitic]
    proclitic_voc = proclitic_entry["vocalized"][0]

    # strip the last tanwin/haraka, but only when a suffix follows
    if suffix and araby.is_haraka(stop[-1:]):
        stop = stop[:-1]
    # convert fathatan into a single fatha, for the cases where the tanwin
    # is not at the end of the word, e.g. محتوًى
    stop = stop.replace(araby.FATHATAN, araby.FATHA)

    # Add a shadda if the first letter is a sun letter and the proclitic
    # carries the AL definition tag.  The emptiness check keeps stop[0]
    # from raising IndexError when nothing is left of the word.
    if (u'تعريف' in proclitic_entry["tags"] and stop
            and araby.is_sun(stop[0])):
        stop = u''.join([stop[0], araby.SHADDA, stop[1:]])
        # strip the sukun from the lam
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]

    # Generate the word variant for words ending with special letters
    # (teh marbuta, alef maksura, hamza); the variant depends on what
    # follows, for example مدرسة+ي = مدرست+ي
    stop = get_word_variant(stop, suffix + enclitic)

    # Generate the suffix variants (with and without the I3rab mark); the
    # variant depends on the enclitic, for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(
        stop, suffix, enclitic)

    # the vocalized word without the I3rab mark
    word_non_irab_mark = ''.join([proclitic_voc, stop,
                                  suffix_non_irab_mark, enclitic_voc])
    # the fully vocalized word
    word_vocalized = ''.join([proclitic_voc, stop, suffix_voc,
                              enclitic_voc])
    return word_vocalized, word_non_irab_mark