Example #1
0
    def getSuffixVariant(self, word, suffix, enclitic):
        """
		Get the suffix variant to be joined to the word.
		For example: word = مدرس, suffix=ة, encletic=ي. The suffix is converted to Teh.
		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: second level suffix.
		@type suffix: unicode.
		@param enclitic: first level suffix.
		@type enclitic: unicode.		
		@return: variant of suffixes  (vocalized suffix and vocalized suffix without I'rab short mark).
		@rtype: (unicode, unicode)
		"""
        enclitic_nm = araby.stripTashkeel(enclitic)
        newSuffix = suffix
        #default value
        #if the word ends by a haraka
        if suffix.find(araby.TEH_MARBUTA) >= 0 and len(enclitic_nm) > 0:
            newSuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
        elif not enclitic_nm and word[-1:] in (
                araby.ALEF_MAKSURA, araby.YEH,
                araby.ALEF) and araby.isHaraka(suffix):
            newSuffix = u""
        #gererate the suffix without I'rab short mark
        # here we lookup with given suffix because the new suffix is changed and can be not found in table
        if u'متحرك' in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
            suffixNonIrabMark = araby.stripLastHaraka(newSuffix)
        else:
            suffixNonIrabMark = newSuffix
        return newSuffix, suffixNonIrabMark
Example #2
0
    def getWordVariant(self, word, suffix):
        """
		Get the word variant to be joined to the suffix.
		For example: word = ةمدرس, suffix=ي. The word is converted to مدرست.
		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: suffix ( firts or second level).
		@type suffix: unicode.
		@return: variant of word.
		@rtype: unicode.
		"""
        word_stem = word
        # print word.encode('utf8');
        #HARAKAT=(FATHA,DAMMA,KASRA,SUKUN, DAMMA, DAMMATAN, KASRATAN, FATHATAN);
        suffix_nm = araby.stripTashkeel(suffix)
        #if the word ends by a haraka
        word_stem = araby.stripLastHaraka(word_stem)

        if word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm in (
                araby.ALEF + araby.TEH, araby.YEH + araby.TEH_MARBUTA,
                araby.YEH, araby.YEH + araby.ALEF + araby.TEH):
            word_stem = word_stem[:-1]
        elif word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm != u"":
            word_stem = word_stem[:-1] + araby.TEH
        elif word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
            word_stem = word_stem[:-1] + araby.YEH
        elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
            if suffix.startswith(araby.DAMMA):
                word_stem = word_stem[:-1] + araby.WAW_HAMZA
            elif suffix.startswith(araby.KASRA):
                word_stem = word_stem[:-1] + araby.YEH_HAMZA

        return word_stem
	def getWordVariant(self, word, suffix):
		"""
		Get the word variant to be joined to the suffix.
		For example: word = مدرسة, suffix=ي. The word is converted to مدرست.
		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: suffix ( firts or second level).
		@type suffix: unicode.
		@return: variant of word.
		@rtype: unicode.
		"""
		unvoc_suffix = araby.stripTashkeel(suffix)
		# work on the word with its final haraka removed
		stem = araby.stripLastHaraka(word)

		# plural/dual endings: a final Teh Marbuta is simply dropped
		if stem.endswith(araby.TEH_MARBUTA) and unvoc_suffix in (
				araby.ALEF + araby.TEH,
				araby.YEH + araby.TEH_MARBUTA,
				araby.YEH,
				araby.YEH + araby.ALEF + araby.TEH):
			return stem[:-1]

		if unvoc_suffix:
			if stem.endswith(araby.TEH_MARBUTA):
				# any other suffix converts Teh Marbuta to Teh
				stem = stem[:-1] + araby.TEH
			elif stem.endswith(araby.ALEF_MAKSURA):
				# Alef Maksura becomes Yeh before a suffix
				stem = stem[:-1] + araby.YEH
			elif stem.endswith(araby.HAMZA):
				# the Hamza seat follows the suffix's first short vowel
				if suffix.startswith(araby.DAMMA):
					stem = stem[:-1] + araby.WAW_HAMZA
				elif suffix.startswith(araby.KASRA):
					stem = stem[:-1] + araby.YEH_HAMZA

		return stem
	def getSuffixVariant(self, word, suffix, enclitic):
		"""
		Get the suffix variant to be joined to the word.
		For example: word = مدرس, suffix = ة, enclitic = ي:
		the Teh Marbuta of the suffix is converted to Teh.
		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: second level suffix.
		@type suffix: unicode.
		@param enclitic: first level suffix.
		@type enclitic: unicode.
		@return: variant of suffixes (vocalized suffix and vocalized
		suffix without I'rab short mark).
		@rtype: (unicode, unicode)
		"""
		enclitic_nm = araby.stripTashkeel(enclitic)
		newSuffix = suffix  # default value
		# Teh Marbuta followed by an enclitic becomes Teh.
		# str.replace suffices: TEH_MARBUTA is a single literal
		# character, so the original re.sub was unnecessary.
		if araby.TEH_MARBUTA in suffix and enclitic_nm:
			newSuffix = suffix.replace(araby.TEH_MARBUTA, araby.TEH)
		# a lone haraka suffix is dropped after a weak final letter
		elif not enclitic_nm and word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF) and araby.isHaraka(suffix):
			newSuffix = u""
		# generate the suffix without the I'rab short mark; the table is
		# looked up with the ORIGINAL suffix because the new suffix is
		# changed and may be absent from the table
		if u'متحرك' in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
			suffixNonIrabMark = araby.stripLastHaraka(newSuffix)
		else:
			suffixNonIrabMark = newSuffix
		return newSuffix, suffixNonIrabMark
	def tashkeel(self,inputtext,suggestion=False, format='text'):
		"""
		Vocalize the text and optionally collect per-word suggestions
		so the user can improve the tashkeel.
		@param inputtext: input text.
		@type inputtext: unicode.
		@param suggestion: if True, return a per-word suggestion list
		instead of the plain vocalized text.
		@type suggestion: bool.
		@param format: display format forwarded to self.display().
		@type format: str.
		@return: vocalized text, or a list of {'chosen', 'suggest'}
		dicts when suggestion is True.
		@rtype: unicode or list of dict.
		"""
		inputtext = self.preTashkeel(inputtext);
		# print "PreTashkeel", inputtext.encode('utf8');
		# The statistical tashkeel must return a text.
		#comment this after tests
		if self.getEnabledStatTashkeel():
			inputtext = self.statTashkeel(inputtext);
	
		#split text into phrases to treat one phrase at a time
		texts=self.analyzer.splitIntoPhrases(inputtext);
		# texts=[inputtext,]
		vocalized_text=u"";
		previous=None;
		outputSuggestList=[]
		ChosenList=[]	
		suggestsList=[]	
		for text in texts:
			
			#morphological analysis of text
			detailled_syntax, synodelist = self.fullStemmer(text);

			# calculate scores to enable choosing tashkeel by scoring
			# if self.enabledSyntaxicAnalysis and self.enabledSemanticAnalysis:
				# detailled_syntax = self.anasem.calculateScores(detailled_syntax);

			previous = None;
			nextNode = None;
			preNode  = None;
			for wordCasesList in detailled_syntax:

				#wordCasesList = self.anasynt.exclode_cases(wordCasesList)
				currentChosen = self.choose_tashkeel(wordCasesList,previous,preNode, nextNode);
				# adjust tanwin case
				# if previous and previous.canHaveTanwin() and not self.anasynt.isRelated(previous, currentChosen):
					# #vocalized_text+="1";
					# ChosenList[len(ChosenList)-1].ajustTanwin(); 
				# adjust relation between words
				# if the actual word is transparent don't change the previous
				# add this to Syntactic Analyser
				if not currentChosen.isTransparent():
					previous = currentChosen;
				ChosenList.append(currentChosen);

				# create a suggestion list from every candidate case
				suggest=[];
				for item in wordCasesList:
					# ITEM IS A stemmedSynWord instance
					voc=item.getVocalized();
					suggest.append(voc);
					# if item.canHaveTanwin():
						# # this can generate new forms carrying tanwin
						# # in some cases it may not produce anything new
						# # compare it with the previous word with tanwin, then decide to add it
						# item.ajustTanwin();
						# vocTnwn = item.getVocalized()
						# if vocTnwn!=voc:
							# suggest.append(vocTnwn);
				suggest.sort();
				suggestsList.append(suggest);
		outputSuggestList=[]
		#create texts from chosen cases
		for i in range(len(ChosenList)):
			word = ChosenList[i].getVocalized();
			# omit the last haraka if the option LastMark is False
			if not self.getEnabledLastMark():
				word = araby.stripLastHaraka(word);
			vocalized_text=u" ".join([vocalized_text,self.display(word,format)]);
			outputSuggestList.append({'chosen':word,'suggest':u";".join(suggestsList[i])});
		
		# correct the resulting text to adjust cases where two
		# vowelless consonants meet
		if self.getEnabledAjustVocalization():
			vocalized_text = self.ajustVocalizedResult(vocalized_text);
		if suggestion:
			outputSuggestList = self.ajustVocalizedSuggestionResult(outputSuggestList);
			return outputSuggestList;
		else:
			return vocalized_text;
Example #6
0
def extract(word):
    """
    """
    #print word.encode('utf8');
    if araby.isArabicword(word):
        print araby.stripLastHaraka(word).encode('utf8');
def extract(word):
    """
	"""
    #print word.encode('utf8');
    if araby.isArabicword(word):
        print araby.stripLastHaraka(word).encode('utf8')
Example #8
0
    def tashkeel(self, inputtext, suggestion=False, format='text'):
        """
		Vocalize the text and give suggestion to improve tashkeel by user.
		@param text: input text.
		@type text: unicode.
		@return: vocalized text.
		rtype: dict of dict or text.
		"""
        inputtext = self.preTashkeel(inputtext)
        # print "PreTashkeel", inputtext.encode('utf8');
        # The statistical tashkeel must return a text.
        #comment this after tests
        if self.getEnabledStatTashkeel():
            inputtext = self.statTashkeel(inputtext)

        #split texts into phrases to treat one phrase in time
        texts = self.analyzer.splitIntoPhrases(inputtext)
        # texts=[inputtext,]
        vocalized_text = u""
        previous = None
        outputSuggestList = []
        ChosenList = []
        suggestsList = []
        for text in texts:

            #morpholigical analysis of text
            detailled_syntax, synodelist = self.fullStemmer(text)

            # calculate scores to enalbe chosing tashkeel by scoring
            # if self.enabledSyntaxicAnalysis and self.enabledSemanticAnalysis:
            # detailled_syntax = self.anasem.calculateScores(detailled_syntax);

            previous = None
            nextNode = None
            preNode = None
            for wordCasesList in detailled_syntax:

                #wordCasesList = self.anasynt.exclode_cases(wordCasesList)
                currentChosen = self.choose_tashkeel(wordCasesList, previous,
                                                     preNode, nextNode)
                # ajust tanwin case
                # if previous and previous.canHaveTanwin() and not self.anasynt.isRelated(previous, currentChosen):
                # #vocalized_text+="1";
                # ChosenList[len(ChosenList)-1].ajustTanwin();
                # o ajust relation between words
                # if the actual word is transparent don't change the previous
                # add this to Sytaxic Analyser
                if not currentChosen.isTransparent():
                    previous = currentChosen
                ChosenList.append(currentChosen)

                # create a suggest list
                suggest = []
                for item in wordCasesList:
                    # ITEM IS A stemmedSynWord instance
                    voc = item.getVocalized()
                    suggest.append(voc)
                    # if item.canHaveTanwin():
                    # # يمكن لهذا أن يولد صيغا جديدة بها تنوي
                    # # في بعض الحالات قد لا يكون شيئا جديدا
                    # # نقارنه مع الكلمة السابقة منوّنة ومن ثمّ نقرر إضافتها أولا
                    # item.ajustTanwin();
                    # vocTnwn = item.getVocalized()
                    # if vocTnwn!=voc:
                    # suggest.append(vocTnwn);
                suggest.sort()
                suggestsList.append(suggest)
        outputSuggestList = []
        #create texts from chosen cases
        for i in range(len(ChosenList)):
            word = ChosenList[i].getVocalized()
            # omit the last haraka if the option LastMark is False
            if not self.getEnabledLastMark():
                word = araby.stripLastHaraka(word)
            vocalized_text = u" ".join(
                [vocalized_text, self.display(word, format)])
            outputSuggestList.append({
                'chosen': word,
                'suggest': u";".join(suggestsList[i])
            })

        # correct the resulted text to ajust some case of consonant neighbor
        #معالجة حالات التقاء الساكنين
        if self.getEnabledAjustVocalization():
            vocalized_text = self.ajustVocalizedResult(vocalized_text)
        if suggestion:
            outputSuggestList = self.ajustVocalizedSuggestionResult(
                outputSuggestList)
            return outputSuggestList
        else:
            return vocalized_text
Example #9
0
def test():
	"""
	Command-line driver: vocalize a file or an inline text line by
	line and, when compare is set, print per-line accuracy statistics
	against the gold vocalization.
	"""
	filename, text,  stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs()
	#filename="samples/randomtext.txt"	
	if not text and not filename:
		usage()
		sys.exit(0)
		
	if not text:
		try:
			myfile=open(filename)
		except:
			print " Can't Open the given File ", filename;
			sys.exit();
	else:
		lines = text.split('\n');
	# all things are well, import library
	import core.adaat 
	import pyarabic.araby as araby

	counter=1;
	# a falsy limit means "practically unlimited"
	if not limit : 
		limit=	100000000
	if not stripTashkeel: 
		vocalizer=ArabicVocalizer.TashkeelClass();
		if ignore : 
			vocalizer.disableLastMark();
		if disableSemantic:
			vocalizer.disableSemanticAnalysis();
		if disableSyntax:
			vocalizer.disableSyntaxicAnalysis();
		if disableStat:
			vocalizer.disableStatTashkeel();

	#vocalizer.disableShowCollocationMark();
	#print "show delimiter", vocalizer.collo.showDelimiter;
	#nolimit = True;
	nolimit = False;
	# fetch the first line, from the file or from the inline text
	if not text:
		line=(myfile.readline()).decode('utf8');
	else:
		if len(lines)>0:
			line= lines[0];
	# global counters for the comparison statistics
	correct=0;
	incorrect=0;
	total=0;
	totLetters =0;
	LettersError =0
	WLMIncorrect =0;
	if compare:
		#display the stats header for the per-line report
		print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
		
		# print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"
	
	while line and (nolimit or counter<=limit):
		if not line.startswith('#'):
			# lineIncorrect = 0;
			lineCorrect   = 0;
			lineWLMIncorrect =0;
			if stripTashkeel:
				result = araby.stripTashkeel(line);
			else:	#vocalize line by line
				if compare:
					# keep the gold line, feed the stripped one
					vocalizedLine = line;
					line = araby.stripTashkeel(line)
				result=vocalizer.tashkeel(line);
				#compare resultLine and vocalizedLine
				if compare:
					list1=vocalizer.analyzer.tokenize(vocalizedLine);
					list2=vocalizer.analyzer.tokenize(result);
					#print u":".join(list1).encode('utf8');
					#print u":".join(list2).encode('utf8');
					total+=len(list1);
					# NOTE(review): lineTotal is only bound on this path;
					# with stripTashkeel and compare both set, the later
					# "if lineTotal" would raise NameError — confirm.
					lineTotal = len(list1);
					if len(list1)!=len(list2):
						print "lists haven't the same length";
					else:
						for i in range(len(list1)):
							# negative similarity encodes the letter error count
							simi = araby.vocalizedSimilarity(list1[i],list2[i]);
							if simi<0:
								LettersError+= -simi;
								incorrect   +=1;
								# lineIncorrect += 1;
								# evaluation without last haraka
								simi2 = araby.vocalizedSimilarity(araby.stripLastHaraka(list1[i]),araby.stripLastHaraka(list2[i]));
								if simi2<0: 
									WLMIncorrect    +=1;
									lineWLMIncorrect+=1;								

							else:
								correct+=1;
								lineCorrect += 1;
					
			#compare resultLine and vocalizedLine
			if reducedTashkeel:
				result= araby.reduceTashkeel(result)
			# print result.encode('utf8');
			counter+=1;

			#display stats for every line
			# NOTE(review): divides by total — first compared line must
			# have at least one token or this raises ZeroDivisionError.
			if compare:
				print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%(
						counter-1,#id
						round(correct*100.00/total,2),#fully Correct
						round((total-WLMIncorrect)*100.00/total,2),#Strip Correct
						incorrect,#fully WER
						WLMIncorrect,#Strip WER
						LettersError,#LER
						total,#Total
						),
				if lineTotal:
					print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal,2),#line Fully correct
					print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal,2),#line Strip correct
						
			print result.encode('utf8');
		#get the next line
		if not text:
			line=(myfile.readline()).decode('utf8');
		else:
			if counter<len(lines):
				line= lines[counter];
			else:
				line =None;
Example #10
0
def test():
    filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs(
    )
    #filename="samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)

    if not text:
        try:
            myfile = open(filename)
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby

    counter = 1
    if not limit:
        limit = 100000000
    if not stripTashkeel:
        vocalizer = ArabicVocalizer.TashkeelClass()
        if ignore:
            vocalizer.disableLastMark()
        if disableSemantic:
            vocalizer.disableSemanticAnalysis()
        if disableSyntax:
            vocalizer.disableSyntaxicAnalysis()
        if disableStat:
            vocalizer.disableStatTashkeel()

    #vocalizer.disableShowCollocationMark();
    #print "show delimiter", vocalizer.collo.showDelimiter;
    #nolimit = True;
    nolimit = False
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    if compare:
        #dispaly stats for the current line
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"

        # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"

    while line and (nolimit or counter <= limit):
        if not line.startswith('#'):
            # lineIncorrect = 0;
            lineCorrect = 0
            lineWLMIncorrect = 0
            if stripTashkeel:
                result = araby.stripTashkeel(line)
            else:  #vocalize line by line
                if compare:
                    vocalizedLine = line
                    line = araby.stripTashkeel(line)
                result = vocalizer.tashkeel(line)
                #compare resultLine and vocalizedLine
                if compare:
                    list1 = vocalizer.analyzer.tokenize(vocalizedLine)
                    list2 = vocalizer.analyzer.tokenize(result)
                    #print u":".join(list1).encode('utf8');
                    #print u":".join(list2).encode('utf8');
                    total += len(list1)
                    lineTotal = len(list1)
                    if len(list1) != len(list2):
                        print "lists haven't the same length"
                    else:
                        for i in range(len(list1)):
                            simi = araby.vocalizedSimilarity(
                                list1[i], list2[i])
                            if simi < 0:
                                LettersError += -simi
                                incorrect += 1
                                # lineIncorrect += 1;
                                # evaluation without last haraka
                                simi2 = araby.vocalizedSimilarity(
                                    araby.stripLastHaraka(list1[i]),
                                    araby.stripLastHaraka(list2[i]))
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1

                            else:
                                correct += 1
                                lineCorrect += 1

            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8');
            counter += 1

            #display stat for every line
            if compare:
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  #id
                    round(correct * 100.00 / total, 2),  #fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  #Strip Correct
                    incorrect,  #fully WER
                    WLMIncorrect,  #Strip WER
                    LettersError,  #LER
                    total,  #Total
                ),
                if lineTotal:
                    print "%0.2f%%\t" % round(lineCorrect * 100.00 / lineTotal,
                                              2),  #line Fully correct
                    print "%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal,
                        2),  #line Strip correct

            print result.encode('utf8')
        #get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None