Example #1
0
    def getSuffixVariant(self, word, suffix, enclitic):
        """
        Compute the form of the suffix to attach to the word.

        Two adjustments are applied, in this order of priority:
          - when the suffix contains TEH_MARBUTA and the enclitic (stripped
            of its tashkeel) is not empty, every TEH_MARBUTA in the suffix
            is replaced by TEH;
          - otherwise, when there is no enclitic, the word ends with
            ALEF_MAKSURA, YEH or ALEF, and araby.isHaraka(suffix) is true,
            the suffix is dropped (empty string).
        In every other case the suffix is returned unchanged.

        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix; used as the key into
            stem_noun_const.CONJ_SUFFIX_LIST_TAGS, so a failed table
            lookup propagates to the caller.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: (suffix variant, suffix variant without the I'rab short mark).
        @rtype: (unicode, unicode)
        """
        bare_enclitic = araby.stripTashkeel(enclitic)
        weak_endings = (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)

        if araby.TEH_MARBUTA in suffix and bare_enclitic:
            variant = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
        elif (not bare_enclitic and word[-1:] in weak_endings
              and araby.isHaraka(suffix)):
            variant = u""
        else:
            # Default: the suffix is attached as given.
            variant = suffix

        # The tags are read with the ORIGINAL suffix: the variant may have
        # been altered above and might no longer be a key of the table.
        suffix_tags = stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
        if u'متحرك' in suffix_tags:
            # Tagged suffixes lose their last haraka in the second form.
            return variant, araby.stripLastHaraka(variant)
        return variant, variant
	def vocalize(self, noun, proclitic,  suffix, enclitic):
		"""
		Join the noun with its affixes and build the vocalized forms.

		@param noun: noun found in dictionary.
		@type noun: unicode.
		@param proclitic: first level prefix; key of
			stem_noun_const.COMP_PREFIX_LIST_TAGS.
		@type proclitic: unicode.
		@param suffix: second level suffix.
		@type suffix: unicode.
		@param enclitic: first level suffix; key of
			stem_noun_const.COMP_SUFFIX_LIST_TAGS.
		@type enclitic: unicode.
		@return: (vocalized word, vocalized word built with the suffix
			form that has no I'rab mark).
		@rtype: (unicode, unicode)
		"""
		# Only the first vocalization listed for each clitic is used.
		enclitic_form = stem_noun_const.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
		prefix_entry = stem_noun_const.COMP_PREFIX_LIST_TAGS[proclitic]
		proclitic_form = prefix_entry["vocalized"][0]

		# Drop a final haraka (tanwin included) from the stem.
		stem = noun
		if araby.isHaraka(stem[-1:]):
			stem = stem[:-1]
		# A fathatan that is not in final position is reduced to a plain fatha.
		stem = stem.replace(araby.FATHATAN, araby.FATHA)

		# With a definition-tagged proclitic and a sun letter at the start of
		# the stem, the first letter takes a shadda.
		if u'تعريف' in prefix_entry["tags"] and araby.isSun(stem[0]):
			stem = u''.join((stem[0], araby.SHADDA, stem[1:]))
			# ... and the proclitic loses its trailing sukun.
			if proclitic_form.endswith(araby.SUKUN):
				proclitic_form = proclitic_form[:-1]

		# Stem variant, chosen from the combined suffix and enclitic.
		stem = self.getWordVariant(stem, suffix + enclitic)

		# Suffix variants: the full one and the one without the I'rab mark.
		suffix_form, suffix_bare = self.getSuffixVariant(stem, suffix, enclitic)

		# Enclitic variant, chosen from the stem and the suffix variant.
		enclitic_form = self.getEncliticVariant(stem, suffix_form, enclitic_form)

		without_irab = ''.join([proclitic_form, stem, suffix_bare, enclitic_form])
		fully_vocalized = ''.join([proclitic_form, stem, suffix_form, enclitic_form])
		return fully_vocalized, without_irab
Example #3
0
	def vocalize(self, noun, proclitic,  suffix, enclitic):
		"""
		Assemble proclitic + noun + suffix + enclitic into vocalized words.

		@param noun: noun found in dictionary.
		@type noun: unicode.
		@param proclitic: first level prefix; looked up in
			stem_noun_const.COMP_PREFIX_LIST_TAGS.
		@type proclitic: unicode.
		@param suffix: second level suffix.
		@type suffix: unicode.
		@param enclitic: first level suffix; looked up in
			stem_noun_const.COMP_SUFFIX_LIST_TAGS.
		@type enclitic: unicode.
		@return: a pair: the vocalized word, and the same word built with
			the suffix form that carries no I'rab mark.
		@rtype: (unicode, unicode)
		"""
		# Each clitic contributes the first entry of its "vocalized" list.
		voc_enclitic = stem_noun_const.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
		proclitic_info = stem_noun_const.COMP_PREFIX_LIST_TAGS[proclitic]
		voc_proclitic = proclitic_info["vocalized"][0]

		# Remove the last mark of the noun when it is a haraka or tanwin.
		base = noun[:-1] if araby.isHaraka(noun[-1:]) else noun
		# Any remaining (non-final) fathatan becomes a single fatha.
		base = base.replace(araby.FATHATAN, araby.FATHA)

		# Sun letter after a definition-tagged proclitic: insert a shadda
		# after the first letter and strip the sukun ending the proclitic.
		is_definite = u'تعريف' in proclitic_info["tags"]
		if is_definite and araby.isSun(base[0]):
			base = u''.join([base[:1], araby.SHADDA, base[1:]])
			if voc_proclitic.endswith(araby.SUKUN):
				voc_proclitic = voc_proclitic[:-1]

		# The word variant depends on the suffix followed by the enclitic.
		base = self.getWordVariant(base, suffix + enclitic)

		# The suffix variant comes in two forms: with and without I'rab mark.
		voc_suffix, plain_suffix = self.getSuffixVariant(base, suffix, enclitic)

		# The enclitic variant depends on the word and on the suffix variant.
		voc_enclitic = self.getEncliticVariant(base, voc_suffix, voc_enclitic)

		word_no_irab = ''.join([voc_proclitic, base, plain_suffix, voc_enclitic])
		word_vocalized = ''.join([voc_proclitic, base, voc_suffix, voc_enclitic])
		return word_vocalized, word_no_irab
    def lookup(self, text, word_type=''):
        """
        Look up the dictionary rows matching a word, according to word_type:
            - 'verb': match the vocalized form, verbs only.
            - 'noun': match the vocalized form, every type except verbs.
            - 'unknown': match the unvocalized form.
            - '': match the vocalized form, whatever the type.

        Before the query is built, a trailing haraka is removed from the
        text and every FATHA followed by ALEF is reduced to a bare ALEF.

        @param text: word to search for.
        @type text: unicode.
        @param word_type: one of 'verb', 'noun', 'unknown', ''.
        @type word_type: unicode.
        @return: list of the rows yielded by the cursor; an empty list when
            the query fails.
        @rtype: list.
        """
        # Strip the last haraka from the text to ensure the search.
        if araby.isHaraka(text[-1:]):
            text = text[:-1]
        # Homogenize with the dictionary typography: no fatha before alef.
        text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)

        # The value is embedded in a quoted SQL literal, so embedded single
        # quotes are doubled; otherwise a word containing "'" would break
        # the statement (or inject SQL).
        # NOTE(review): driver placeholders would be preferable, but the
        # paramstyle of self.cursor is not visible from here -- confirm.
        quoted = text.replace(u"'", u"''")

        if word_type == 'unknown':
            sql = u"select * FROM %s WHERE unvocalized='%s'" % (self.tableName,
                                                                quoted)
        else:
            sql = u"select * FROM %s WHERE vocalized='%s'" % (self.tableName,
                                                              quoted)
            if word_type == 'verb':
                sql += " AND word_type='verb' "
            elif word_type == 'noun':
                sql += " AND word_type!='verb' "

        try:
            self.cursor.execute(sql)
            return list(self.cursor)
        except Exception:
            # Best effort: a failed query yields no entries. Unlike a bare
            # "except:", KeyboardInterrupt and SystemExit still propagate.
            return []
	def lookup(self,text, word_type=''):
		"""
		Look up the dictionary rows matching a word, according to word_type:
			- 'verb': match the vocalized form, verbs only.
			- 'noun': match the vocalized form, every type except verbs.
			- 'unknown': match the unvocalized form.
			- '': match the vocalized form, whatever the type.

		Before the query is built, a trailing haraka is removed from the
		text and every FATHA followed by ALEF is reduced to a bare ALEF.

		@param text: word to search for.
		@type text: unicode.
		@param word_type: one of 'verb', 'noun', 'unknown', ''.
		@type word_type: unicode.
		@return: list of the rows yielded by the cursor; an empty list when
			the query fails.
		@rtype: list.
		"""
		# Strip the last haraka from the text to ensure the search.
		if araby.isHaraka(text[-1:]):
			text = text[:-1]
		# Homogenize with the dictionary typography: no fatha before alef.
		text = re.sub(araby.FATHA + araby.ALEF, araby.ALEF, text)

		# The value is embedded in a quoted SQL literal, so embedded single
		# quotes are doubled; otherwise a word containing "'" would break
		# the statement (or inject SQL).
		# NOTE(review): driver placeholders would be preferable, but the
		# paramstyle of self.cursor is not visible from here -- confirm.
		quoted = text.replace(u"'", u"''")

		if word_type == 'unknown':
			sql = u"select * FROM %s WHERE unvocalized='%s'" % (self.tableName, quoted)
		else:
			sql = u"select * FROM %s WHERE vocalized='%s'" % (self.tableName, quoted)
			if word_type == 'verb':
				sql += " AND word_type='verb' "
			elif word_type == 'noun':
				sql += " AND word_type!='verb' "

		try:
			self.cursor.execute(sql)
			return list(self.cursor)
		except Exception:
			# Best effort: a failed query yields no entries. Unlike a bare
			# "except:", KeyboardInterrupt and SystemExit still propagate.
			return []
	def getSuffixVariant(self, word, suffix, enclitic):
		"""
		Return the suffix form to join to the word, in two flavours.

		The suffix is adapted as follows:
			- if it contains TEH_MARBUTA and the enclitic, once its tashkeel
			  is stripped, is not empty, each TEH_MARBUTA becomes TEH;
			- else, if there is no enclitic, the word ends with ALEF_MAKSURA,
			  YEH or ALEF, and araby.isHaraka(suffix) holds, the suffix is
			  replaced by the empty string;
			- otherwise it is kept as given.

		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: second level suffix; it indexes
			stem_noun_const.CONJ_SUFFIX_LIST_TAGS, and a failed lookup
			propagates to the caller.
		@type suffix: unicode.
		@param enclitic: first level suffix.
		@type enclitic: unicode.
		@return: (adapted suffix, adapted suffix without I'rab short mark).
		@rtype: (unicode, unicode)
		"""
		enclitic_letters = araby.stripTashkeel(enclitic)
		adapted = suffix
		if araby.TEH_MARBUTA in suffix and enclitic_letters:
			adapted = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
		elif not enclitic_letters:
			ends_weak = word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)
			if ends_weak and araby.isHaraka(suffix):
				adapted = u""

		# Look the tags up under the suffix as received: the adapted form
		# may differ from it and may be absent from the table.
		tags = stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
		if u'متحرك' in tags:
			without_mark = araby.stripLastHaraka(adapted)
		else:
			without_mark = adapted
		return adapted, without_mark