def getSuffixVariant(self, word, suffix, enclitic):
    """
    Compute the form of the suffix to attach to the word.

    Two adjustments are applied:
      - when a non-empty enclitic follows (after araby.stripTashkeel), every
        Teh Marbuta in the suffix is replaced by Teh;
      - when there is no enclitic, the word ends with Alef Maksura, Yeh or
        Alef, and araby.isHaraka(suffix) holds, the suffix becomes empty.
    Otherwise the suffix is returned unchanged.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix; must be a key of
        stem_noun_const.CONJ_SUFFIX_LIST_TAGS.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: the adapted suffix, and the adapted suffix without its last
        haraka when the given suffix carries the tag tested below.
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.stripTashkeel(enclitic)
    weak_endings = (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)

    if bare_enclitic:
        # An enclitic follows: Teh Marbuta can no longer close the word.
        if araby.TEH_MARBUTA in suffix:
            adapted = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
        else:
            adapted = suffix
    elif word[-1:] in weak_endings and araby.isHaraka(suffix):
        # No enclitic and the word ends with a weak letter: drop the mark.
        adapted = u""
    else:
        adapted = suffix

    # The tag table is indexed by the suffix as given, because the adapted
    # suffix may not be present in the table.
    suffix_tags = stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    without_irab = (araby.stripLastHaraka(adapted)
                    if u'متحرك' in suffix_tags else adapted)
    return adapted, without_irab
def vocalize(self, noun, proclitic, suffix, enclitic):
    """
    Join the noun and its affixes, and get the vocalized forms.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix; must be a key of
        stem_noun_const.COMP_PREFIX_LIST_TAGS.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix; must be a key of
        stem_noun_const.COMP_SUFFIX_LIST_TAGS.
    @type enclitic: unicode.
    @return: the vocalized word, and the same word built with the suffix
        variant that has no I'rab short mark.
    @rtype: (unicode, unicode)
    """
    # Enclitics and proclitics have a single vocalization: take the first.
    enclitic_voc = stem_noun_const.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    proclitic_entry = stem_noun_const.COMP_PREFIX_LIST_TAGS[proclitic]
    proclitic_voc = proclitic_entry["vocalized"][0]

    # Strip the final mark (tanwin or haraka) from the noun.
    if araby.isHaraka(noun[-1:]):
        noun = noun[:-1]
    # A Fathatan that is not at the end of the word becomes a plain Fatha.
    noun = noun.replace(araby.FATHATAN, araby.FATHA)

    # Add a Shadda on the first letter when the proclitic carries the
    # definition tag and that letter is a sun letter. The emptiness check
    # avoids an IndexError on noun[0] when nothing is left of the noun.
    if (u'تعريف' in proclitic_entry["tags"]
            and noun and araby.isSun(noun[0])):
        noun = u''.join([noun[0], araby.SHADDA, noun[1:]])
        # Strip the Sukun carried by the last letter of the proclitic.
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]

    # Word variant: the noun ending may change according to what follows it.
    noun = self.getWordVariant(noun, suffix + enclitic)
    # Suffix variant: the suffix may change according to the enclitic.
    suffix_voc, suffix_NonIrabMark = self.getSuffixVariant(noun, suffix, enclitic)
    # Enclitic variant: the enclitic may change according to the suffix.
    enclitic_voc = self.getEncliticVariant(noun, suffix_voc, enclitic_voc)

    # Build both forms: with the full suffix, and with the suffix lacking
    # its I'rab short mark.
    wordNonIrabMark = ''.join([proclitic_voc, noun, suffix_NonIrabMark, enclitic_voc])
    wordVocalized = ''.join([proclitic_voc, noun, suffix_voc, enclitic_voc])
    return wordVocalized, wordNonIrabMark
def lookup(self, text, word_type=''):
    """
    Look up all entries of the dictionary table matching a word form.

    The search depends on word_type:
      - 'verb': match the vocalized form, verbs only.
      - 'noun': match the vocalized form, everything except verbs.
      - 'unknown': the word is not analyzed, match the unvocalized form.
      - '' (or any other value): match the vocalized form without type filter.

    @param text: vocalized word.
    @type text: unicode.
    @param word_type: the word type can take 'verb', 'noun', 'unknown', ''.
    @type word_type: unicode.
    @return: list of the rows yielded by the cursor; an empty list when the
        query fails.
    @rtype: list.
    """
    # Strip the last haraka from the text before searching.
    if araby.isHaraka(text[-1:]):
        text = text[:-1]
    # Homogenize typography: drop every Fatha written before an Alef.
    text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)

    # The table name cannot be bound as a parameter, so it is still
    # interpolated; the looked-up word is bound instead of being pasted
    # into the statement, so quotes in it cannot alter the query.
    # NOTE(review): '?' is the qmark placeholder style (as used by sqlite3);
    # confirm that self.cursor comes from a qmark-style DB-API driver.
    if word_type == 'unknown':
        sql = u"select * FROM %s WHERE unvocalized=?" % (self.tableName,)
    else:
        sql = u"select * FROM %s WHERE vocalized=?" % (self.tableName,)
        if word_type == 'verb':
            sql += " AND word_type='verb' "
        elif word_type == 'noun':
            sql += " AND word_type!='verb' "

    try:
        self.cursor.execute(sql, (text,))
        if self.cursor:
            return list(self.cursor)
        return []
    except Exception:
        # Best-effort lookup: any database failure is reported as no match,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return []
def lookup(self, text, word_type=''):
    """
    Look up all entries of the dictionary table matching a word form.

    The search depends on word_type:
      - 'verb': match the vocalized form, verbs only.
      - 'noun': match the vocalized form, everything except verbs.
      - 'unknown': the word is not analyzed, match the unvocalized form.
      - '' (or any other value): match the vocalized form without type filter.

    @param text: vocalized word.
    @type text: unicode.
    @param word_type: the word type can take 'verb', 'noun', 'unknown', ''.
    @type word_type: unicode.
    @return: list of the rows yielded by the cursor; an empty list when the
        query fails.
    @rtype: list.
    """
    # Strip the last haraka from the text before searching.
    if araby.isHaraka(text[-1:]):
        text = text[:-1]
    # Homogenize typography: drop every Fatha written before an Alef.
    text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)

    # Only identifiers (table and column names) are interpolated; the word
    # itself is bound as a parameter so it cannot alter the statement.
    # NOTE(review): '?' is the qmark placeholder style (as used by sqlite3);
    # confirm that self.cursor comes from a qmark-style DB-API driver.
    column = u"unvocalized" if word_type == 'unknown' else u"vocalized"
    sql = u"select * FROM %s WHERE %s=?" % (self.tableName, column)
    if word_type == 'verb':
        sql += " AND word_type='verb' "
    elif word_type == 'noun':
        sql += " AND word_type!='verb' "

    try:
        self.cursor.execute(sql, (text,))
        if self.cursor:
            return list(self.cursor)
        return []
    except Exception:
        # Best-effort lookup: any database failure is reported as no match,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return []
def getSuffixVariant(self, word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.

    The suffix is kept as is, except in two cases:
      - a non-empty enclitic follows (after araby.stripTashkeel) and the
        suffix contains Teh Marbuta: each Teh Marbuta becomes Teh;
      - no enclitic follows, the word ends with Alef Maksura, Yeh or Alef,
        and araby.isHaraka(suffix) holds: the suffix becomes empty.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix; must be a key of
        stem_noun_const.CONJ_SUFFIX_LIST_TAGS.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix, and vocalized suffix
        without its last haraka when the given suffix carries the tag
        tested below).
    @rtype: (unicode, unicode)
    """
    has_enclitic = bool(araby.stripTashkeel(enclitic))
    ends_with_weak_letter = word[-1:] in (
        araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)

    variant = suffix  # default: the suffix is left untouched
    if has_enclitic and araby.TEH_MARBUTA in suffix:
        variant = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif (not has_enclitic and ends_with_weak_letter
            and araby.isHaraka(suffix)):
        variant = u""

    # Look the tags up with the given suffix: the variant may have been
    # changed above and may not be found in the table.
    tags = stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' not in tags:
        return variant, variant
    return variant, araby.stripLastHaraka(variant)