def getSuffixVariant(self, word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي.
    The suffix Teh Marbuta is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
        suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.stripTashkeel(enclitic)
    # Decide how the suffix must be rewritten before joining.
    if enclitic_nm and araby.TEH_MARBUTA in suffix:
        # An enclitic follows: Teh Marbuta opens into Teh.
        new_suffix = suffix.replace(araby.TEH_MARBUTA, araby.TEH)
    elif (not enclitic_nm and araby.isHaraka(suffix)
          and word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)):
        # No enclitic and the word ends with a weak letter:
        # the bare haraka suffix is dropped.
        new_suffix = u""
    else:
        new_suffix = suffix
    # Generate the suffix without the I'rab short mark.
    # The lookup uses the GIVEN suffix: the rewritten one may be
    # absent from the table.
    if u'متحرك' in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        return new_suffix, araby.stripLastHaraka(new_suffix)
    return new_suffix, new_suffix
def getWordVariant(self, word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted
    to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.stripTashkeel(suffix)
    # Drop the final haraka of the word before applying joining rules.
    stem = araby.stripLastHaraka(word)
    if stem.endswith(araby.TEH_MARBUTA):
        # Suffixes that absorb the Teh Marbuta completely.
        absorbing = (araby.ALEF + araby.TEH,
                     araby.YEH + araby.TEH_MARBUTA,
                     araby.YEH,
                     araby.YEH + araby.ALEF + araby.TEH)
        if bare_suffix in absorbing:
            stem = stem[:-1]
        elif bare_suffix:
            # Any other non-empty suffix opens Teh Marbuta into Teh.
            stem = stem[:-1] + araby.TEH
    elif bare_suffix:
        if stem.endswith(araby.ALEF_MAKSURA):
            stem = stem[:-1] + araby.YEH
        elif stem.endswith(araby.HAMZA):
            # The hamza seat depends on the suffix's first haraka.
            if suffix.startswith(araby.DAMMA):
                stem = stem[:-1] + araby.WAW_HAMZA
            elif suffix.startswith(araby.KASRA):
                stem = stem[:-1] + araby.YEH_HAMZA
    return stem
def getWordVariant(self, word, suffix):
    """
    Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix = ي. The word is converted
    to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    suffix_nm = araby.stripTashkeel(suffix)
    # strip the ending haraka from the word before joining
    word_stem = araby.stripLastHaraka(word)
    if word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm in (
            araby.ALEF + araby.TEH, araby.YEH + araby.TEH_MARBUTA,
            araby.YEH, araby.YEH + araby.ALEF + araby.TEH):
        # these suffixes absorb the Teh Marbuta entirely
        word_stem = word_stem[:-1]
    elif word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm != u"":
        # any other non-empty suffix opens the Teh Marbuta into Teh
        word_stem = word_stem[:-1] + araby.TEH
    elif word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm != u"":
        word_stem = word_stem[:-1] + araby.YEH
    elif word_stem.endswith(araby.HAMZA) and suffix_nm != u"":
        # the hamza seat follows the first haraka of the suffix
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
    return word_stem
def getSuffixVariant(self, word, suffix, enclitic):
    """
    Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix = ة, enclitic = ي.
    The suffix Teh Marbuta is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
        suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.stripTashkeel(enclitic)
    newSuffix = suffix  # default value
    if araby.TEH_MARBUTA in suffix and enclitic_nm:
        # an enclitic follows: Teh Marbuta opens into Teh
        # (str.replace suffices: the pattern is a single literal char)
        newSuffix = suffix.replace(araby.TEH_MARBUTA, araby.TEH)
    elif not enclitic_nm and word[-1:] in (
            araby.ALEF_MAKSURA, araby.YEH,
            araby.ALEF) and araby.isHaraka(suffix):
        # no enclitic and the word ends with a weak letter:
        # the bare haraka suffix is dropped
        newSuffix = u""
    # generate the suffix without the I'rab short mark.
    # lookup with the GIVEN suffix: the rewritten one may be absent
    # from the table.
    if u'متحرك' in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffixNonIrabMark = araby.stripLastHaraka(newSuffix)
    else:
        suffixNonIrabMark = newSuffix
    return newSuffix, suffixNonIrabMark
def tashkeel(self, inputtext, suggestion=False, format='text'):
    """
    Vocalize the text and optionally give suggestions so the user can
    improve the tashkeel.
    @param inputtext: input text.
    @type inputtext: unicode.
    @param suggestion: if True, return per-word suggestion dicts
        instead of the vocalized text.
    @type suggestion: bool.
    @param format: display format passed to self.display().
    @type format: string.
    @return: vocalized text, or a list of {'chosen', 'suggest'} dicts
        when suggestion is True.
    @rtype: unicode or list of dict.
    """
    # pre-processing pass before the real tashkeel
    inputtext = self.preTashkeel(inputtext)
    # The statistical tashkeel must return a text.
    if self.getEnabledStatTashkeel():
        inputtext = self.statTashkeel(inputtext)
    # split the text into phrases and treat one phrase at a time
    texts = self.analyzer.splitIntoPhrases(inputtext)
    vocalized_text = u""
    previous = None
    ChosenList = []      # chosen case per word, across all phrases
    suggestsList = []    # suggestion list per word, parallel to ChosenList
    for text in texts:
        # morphological analysis of the phrase
        detailled_syntax, synodelist = self.fullStemmer(text)
        previous = None
        nextNode = None
        preNode = None
        for wordCasesList in detailled_syntax:
            currentChosen = self.choose_tashkeel(wordCasesList, previous,
                                                 preNode, nextNode)
            # if the actual word is transparent don't change the previous
            if not currentChosen.isTransparent():
                previous = currentChosen
            ChosenList.append(currentChosen)
            # build the suggestion list from every analysed case
            suggest = []
            for item in wordCasesList:
                # item is a stemmedSynWord instance
                voc = item.getVocalized()
                suggest.append(voc)
            suggest.sort()
            suggestsList.append(suggest)
    outputSuggestList = []
    # create the output text from the chosen cases
    for i in range(len(ChosenList)):
        word = ChosenList[i].getVocalized()
        # omit the last haraka if the LastMark option is disabled
        if not self.getEnabledLastMark():
            word = araby.stripLastHaraka(word)
        # NOTE(review): this join leaves a leading space before the
        # first word, preserved from the original implementation
        vocalized_text = u" ".join([vocalized_text,
                                    self.display(word, format)])
        outputSuggestList.append({'chosen': word,
                                  'suggest': u";".join(suggestsList[i])})
    # correct the result for some cases of adjacent consonants
    # (handling of two successive sukun-bearing consonants)
    if self.getEnabledAjustVocalization():
        vocalized_text = self.ajustVocalizedResult(vocalized_text)
    if suggestion:
        outputSuggestList = self.ajustVocalizedSuggestionResult(
            outputSuggestList)
        return outputSuggestList
    else:
        return vocalized_text
def extract(word):
    """
    Print *word* stripped of its final haraka, when it is an Arabic word.
    @param word: word to inspect.
    @type word: unicode.
    """
    if araby.isArabicword(word):
        # parenthesized single-argument print behaves identically under
        # Python 2 and is forward-compatible with Python 3
        print(araby.stripLastHaraka(word).encode('utf8'))
def extract(word):
    """Print the given word without its last haraka if it is Arabic."""
    # guard clause: nothing to do for non-Arabic words
    if not araby.isArabicword(word):
        return
    print(araby.stripLastHaraka(word).encode('utf8'))
def tashkeel(self, inputtext, suggestion=False, format='text'):
    """
    Vocalize the text and give suggestions so the user can improve
    the tashkeel.
    @param inputtext: input text.
    @type inputtext: unicode.
    @param suggestion: if True, return per-word suggestion dicts
        instead of the vocalized text.
    @type suggestion: bool.
    @param format: display format passed to self.display().
    @type format: string.
    @return: vocalized text, or a list of {'chosen', 'suggest'} dicts
        when suggestion is True.
    @rtype: unicode or list of dict.
    """
    # pre-processing pass before the real tashkeel
    inputtext = self.preTashkeel(inputtext)
    # print "PreTashkeel", inputtext.encode('utf8');
    # The statistical tashkeel must return a text.
    #comment this after tests
    if self.getEnabledStatTashkeel():
        inputtext = self.statTashkeel(inputtext)
    #split texts into phrases to treat one phrase in time
    texts = self.analyzer.splitIntoPhrases(inputtext)
    # texts=[inputtext,]
    vocalized_text = u""
    previous = None
    outputSuggestList = []
    ChosenList = []      # chosen case per word, across all phrases
    suggestsList = []    # suggestion list per word, parallel to ChosenList
    for text in texts:
        #morpholigical analysis of text
        detailled_syntax, synodelist = self.fullStemmer(text)
        # calculate scores to enalbe chosing tashkeel by scoring
        # if self.enabledSyntaxicAnalysis and self.enabledSemanticAnalysis:
        #     detailled_syntax = self.anasem.calculateScores(detailled_syntax);
        previous = None
        nextNode = None
        preNode = None
        for wordCasesList in detailled_syntax:
            #wordCasesList = self.anasynt.exclode_cases(wordCasesList)
            currentChosen = self.choose_tashkeel(wordCasesList, previous,
                                                 preNode, nextNode)
            # ajust tanwin case
            # if previous and previous.canHaveTanwin() and not self.anasynt.isRelated(previous, currentChosen):
            #     #vocalized_text+="1";
            #     ChosenList[len(ChosenList)-1].ajustTanwin();
            # o ajust relation between words
            # if the actual word is transparent don't change the previous
            # add this to Sytaxic Analyser
            if not currentChosen.isTransparent():
                previous = currentChosen
            ChosenList.append(currentChosen)
            # create a suggest list from every analysed case of the word
            suggest = []
            for item in wordCasesList:
                # ITEM IS A stemmedSynWord instance
                voc = item.getVocalized()
                suggest.append(voc)
                # if item.canHaveTanwin():
                #     # tanwin can generate new forms; in some cases nothing
                #     # new: compare with the previous word with tanwin,
                #     # then decide whether to add it first
                #     item.ajustTanwin();
                #     vocTnwn = item.getVocalized()
                #     if vocTnwn!=voc:
                #         suggest.append(vocTnwn);
            suggest.sort()
            suggestsList.append(suggest)
    outputSuggestList = []
    #create texts from chosen cases
    for i in range(len(ChosenList)):
        word = ChosenList[i].getVocalized()
        # omit the last haraka if the option LastMark is False
        if not self.getEnabledLastMark():
            word = araby.stripLastHaraka(word)
        # NOTE(review): the join leaves a leading space before the first word
        vocalized_text = u" ".join(
            [vocalized_text, self.display(word, format)])
        outputSuggestList.append({
            'chosen': word,
            'suggest': u";".join(suggestsList[i])
        })
    # correct the resulted text to ajust some case of consonant neighbor
    # (handling of two successive sukun-bearing consonants)
    if self.getEnabledAjustVocalization():
        vocalized_text = self.ajustVocalizedResult(vocalized_text)
    if suggestion:
        outputSuggestList = self.ajustVocalizedSuggestionResult(
            outputSuggestList)
        return outputSuggestList
    else:
        return vocalized_text
def test(): filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs() #filename="samples/randomtext.txt" if not text and not filename: usage() sys.exit(0) if not text: try: myfile=open(filename) except: print " Can't Open the given File ", filename; sys.exit(); else: lines = text.split('\n'); # all things are well, import library import core.adaat import pyarabic.araby as araby counter=1; if not limit : limit= 100000000 if not stripTashkeel: vocalizer=ArabicVocalizer.TashkeelClass(); if ignore : vocalizer.disableLastMark(); if disableSemantic: vocalizer.disableSemanticAnalysis(); if disableSyntax: vocalizer.disableSyntaxicAnalysis(); if disableStat: vocalizer.disableStatTashkeel(); #vocalizer.disableShowCollocationMark(); #print "show delimiter", vocalizer.collo.showDelimiter; #nolimit = True; nolimit = False; if not text: line=(myfile.readline()).decode('utf8'); else: if len(lines)>0: line= lines[0]; correct=0; incorrect=0; total=0; totLetters =0; LettersError =0 WLMIncorrect =0; if compare: #dispaly stats for the current line print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct" # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal" while line and (nolimit or counter<=limit): if not line.startswith('#'): # lineIncorrect = 0; lineCorrect = 0; lineWLMIncorrect =0; if stripTashkeel: result = araby.stripTashkeel(line); else: #vocalize line by line if compare: vocalizedLine = line; line = araby.stripTashkeel(line) result=vocalizer.tashkeel(line); #compare resultLine and vocalizedLine if compare: list1=vocalizer.analyzer.tokenize(vocalizedLine); list2=vocalizer.analyzer.tokenize(result); #print u":".join(list1).encode('utf8'); #print u":".join(list2).encode('utf8'); total+=len(list1); lineTotal = len(list1); if len(list1)!=len(list2): print "lists haven't the same length"; else: for i in 
range(len(list1)): simi = araby.vocalizedSimilarity(list1[i],list2[i]); if simi<0: LettersError+= -simi; incorrect +=1; # lineIncorrect += 1; # evaluation without last haraka simi2 = araby.vocalizedSimilarity(araby.stripLastHaraka(list1[i]),araby.stripLastHaraka(list2[i])); if simi2<0: WLMIncorrect +=1; lineWLMIncorrect+=1; else: correct+=1; lineCorrect += 1; #compare resultLine and vocalizedLine if reducedTashkeel: result= araby.reduceTashkeel(result) # print result.encode('utf8'); counter+=1; #display stat for every line if compare: print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%( counter-1,#id round(correct*100.00/total,2),#fully Correct round((total-WLMIncorrect)*100.00/total,2),#Strip Correct incorrect,#fully WER WLMIncorrect,#Strip WER LettersError,#LER total,#Total ), if lineTotal: print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal,2),#line Fully correct print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal,2),#line Strip correct print result.encode('utf8'); #get the next line if not text: line=(myfile.readline()).decode('utf8'); else: if counter<len(lines): line= lines[counter]; else: line =None;
def test():
    """
    Command-line driver: vocalize a file or an inline text line by line,
    optionally stripping or reducing tashkeel, and optionally comparing
    the result against the original vocalization (WER/LER statistics).
    """
    filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs(
    )
    #filename="samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)
    if not text:
        try:
            myfile = open(filename)
        # NOTE(review): bare except also hides errors other than IOError
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby
    counter = 1
    if not limit:
        # effectively no limit when none was requested
        limit = 100000000
    if not stripTashkeel:
        vocalizer = ArabicVocalizer.TashkeelClass()
        if ignore:
            vocalizer.disableLastMark()
        if disableSemantic:
            vocalizer.disableSemanticAnalysis()
        if disableSyntax:
            vocalizer.disableSyntaxicAnalysis()
        if disableStat:
            vocalizer.disableStatTashkeel()
        #vocalizer.disableShowCollocationMark();
        #print "show delimiter", vocalizer.collo.showDelimiter;
    #nolimit = True;
    nolimit = False
    # read the first line to process
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    # global and per-line counters for the comparison statistics
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    if compare:
        #display the statistics header
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
        # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"
    while line and (nolimit or counter <= limit):
        # lines starting with '#' are treated as comments and skipped
        if not line.startswith('#'):
            # lineIncorrect = 0;
            lineCorrect = 0
            lineWLMIncorrect = 0
            if stripTashkeel:
                result = araby.stripTashkeel(line)
            else:
                #vocalize line by line
                if compare:
                    # keep the original vocalization as the reference
                    vocalizedLine = line
                    line = araby.stripTashkeel(line)
                result = vocalizer.tashkeel(line)
                #compare resultLine and vocalizedLine
                if compare:
                    list1 = vocalizer.analyzer.tokenize(vocalizedLine)
                    list2 = vocalizer.analyzer.tokenize(result)
                    #print u":".join(list1).encode('utf8');
                    #print u":".join(list2).encode('utf8');
                    total += len(list1)
                    lineTotal = len(list1)
                    if len(list1) != len(list2):
                        print "lists haven't the same length"
                    else:
                        for i in range(len(list1)):
                            simi = araby.vocalizedSimilarity(
                                list1[i], list2[i])
                            if simi < 0:
                                # negative similarity counts letter errors
                                LettersError += -simi
                                incorrect += 1
                                # lineIncorrect += 1;
                                # evaluation without last haraka
                                simi2 = araby.vocalizedSimilarity(
                                    araby.stripLastHaraka(list1[i]),
                                    araby.stripLastHaraka(list2[i]))
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1
            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8');
            counter += 1
            #display stat for every line
            if compare:
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  #id
                    round(correct * 100.00 / total, 2),  #fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  #Strip Correct
                    incorrect,  #fully WER
                    WLMIncorrect,  #Strip WER
                    LettersError,  #LER
                    total,  #Total
                ),
                if lineTotal:
                    print "%0.2f%%\t" % round(lineCorrect * 100.00 /
                                              lineTotal,
                                              2),  #line Fully correct
                    print "%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal,
                        2),  #line Strip correct
            print result.encode('utf8')
        #get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None