def test():
    """Vocalize (or strip) Arabic text line by line and print each result.

    Options are taken from ``grabargs()``.  The input is either the ``text``
    option, split on newlines, or the file named by ``filename`` whose lines
    are decoded as UTF-8.  Lines starting with ``#`` are skipped and do not
    count towards ``limit``; at most ``limit`` lines are processed
    (100000000 when no limit is given).  Processing stops at the first empty
    line, which for a file means end of file.

    With ``stripTashkeel`` every line goes through ``araby.stripTashkeel``;
    otherwise it is vocalized by ``ArabicVocalizer.TashkeelClass``.  With
    ``compare`` the input line is used as the reference: it is stripped,
    vocalized again and compared token by token with the reference, and a
    tab-separated statistics row is written in front of each result.

    This is Python 2 code: file lines are byte strings decoded explicitly
    and results are printed as UTF-8 encoded byte strings.

    Fixes over the previous version:
      * text mode no longer skips the second line nor loops forever on a
        ``#`` line (the old code indexed ``lines`` with the processed-line
        counter);
      * percentages are reported as 0.00 instead of raising
        ZeroDivisionError when no token has been counted yet;
      * ``lineTotal`` is always initialised, so ``stripTashkeel`` together
        with ``compare`` no longer fails on an unbound name;
      * only ``IOError`` is caught when opening the file, and the file is
        closed when the run ends.

    NOTE(review): a second ``def test()`` with the same logic appears later
    in this source; if both live in one module the later one wins.
    """
    (filename, text, stripTashkeel, reducedTashkeel, disableSyntax,
     disableSemantic, disableStat, ignore, limit, compare) = grabargs()
    if not text and not filename:
        usage()
        sys.exit(0)

    def percent(part, whole):
        """Return part/whole as a percentage rounded to 2 places (0.0 if whole is 0)."""
        if not whole:
            return 0.0
        return round(part * 100.00 / whole, 2)

    myfile = None
    if text:
        lines = iter(text.split('\n'))
    else:
        try:
            myfile = open(filename)
        except IOError:
            print(" Can't Open the given File  %s" % filename)
            sys.exit()
        # readline() returns '' only at end of file, which ends the iterator.
        lines = (raw.decode('utf8') for raw in iter(myfile.readline, ''))

    try:
        # all things are well, import library
        import core.adaat
        import pyarabic.araby as araby

        if not limit:
            limit = 100000000

        vocalizer = None
        if not stripTashkeel:
            vocalizer = ArabicVocalizer.TashkeelClass()
            if ignore:
                vocalizer.disableLastMark()
            if disableSemantic:
                vocalizer.disableSemanticAnalysis()
            if disableSyntax:
                vocalizer.disableSyntaxicAnalysis()
            if disableStat:
                vocalizer.disableStatTashkeel()

        # Running totals over all processed lines.
        correct = 0
        incorrect = 0
        total = 0
        LettersError = 0
        WLMIncorrect = 0  # words still wrong once the last haraka is ignored

        if compare:
            # header of the per-line statistics rows
            print("id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct")

        counter = 1  # 1-based number of the next processed (non-comment) line
        for line in lines:
            if not line or counter > limit:
                break
            if line.startswith('#'):
                continue

            lineCorrect = 0
            lineWLMIncorrect = 0
            lineTotal = 0
            if stripTashkeel:
                result = araby.stripTashkeel(line)
            else:
                # vocalize line by line
                if compare:
                    # keep the vocalized input as reference, vocalize its stripped form
                    vocalizedLine = line
                    line = araby.stripTashkeel(line)
                result = vocalizer.tashkeel(line)
                if compare:
                    list1 = vocalizer.analyzer.tokenize(vocalizedLine)
                    list2 = vocalizer.analyzer.tokenize(result)
                    lineTotal = len(list1)
                    total += lineTotal
                    if len(list1) != len(list2):
                        print("lists haven't the same length")
                    else:
                        for ref_word, out_word in zip(list1, list2):
                            simi = araby.vocalizedSimilarity(ref_word, out_word)
                            if simi < 0:
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalizedSimilarity(
                                    araby.stripLastHaraka(ref_word),
                                    araby.stripLastHaraka(out_word))
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1

            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            counter += 1

            # display stat for every line; every piece ends with a tab, so
            # writing them directly matches the old trailing-comma prints.
            if compare:
                sys.stdout.write("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  # id
                    percent(correct, total),  # fully Correct
                    percent(total - WLMIncorrect, total),  # Strip Correct
                    incorrect,  # fully WER
                    WLMIncorrect,  # Strip WER
                    LettersError,  # LER
                    total,  # Total
                ))
                if lineTotal:
                    # line Fully correct
                    sys.stdout.write("%0.2f%%\t" % percent(lineCorrect, lineTotal))
                    # line Strip correct
                    sys.stdout.write("%0.2f%%\t" % percent(
                        lineTotal - lineWLMIncorrect, lineTotal))
            print(result.encode('utf8'))
    finally:
        if myfile is not None:
            myfile.close()
import tashkeel

if __name__ == '__main__':
    # Script entry point: checks pyarabic.number.preTashkeelNumber against a
    # vocalized UTF-8 file.  For each line the tashkeel is stripped, the tokens
    # are passed through preTashkeelNumber and re-joined with spaces; when that
    # changed the line, the outcome is compared with the original line (shadda
    # stripped).  Mismatches are printed as tab-separated rows and the final
    # line prints "correct total percentage".
    #
    # Fixes over the previous version: a file that cannot be opened now stops
    # the script right after the message (it used to fall through and fail on
    # the unbound ``myfile``), the file is closed at the end, and the final
    # percentage is 0.0 instead of a ZeroDivisionError when no line was changed.
    (filename, disableSyntax, disableSemantic, disableStat, ignore, limit,
     compare) = grabargs()
    #filename="samples/randomtext.txt"
    try:
        myfile = open(filename)
    except IOError:
        print(" Can't Open the given File  %s" % filename)
        raise SystemExit(1)

    if not limit:
        limit = 100000000
    correct = 0
    total = 0
    try:
        # readline() returns '' only at end of file, which ends the iterator;
        # counter is the 1-based line number and every line counts towards limit.
        for counter, raw_line in enumerate(iter(myfile.readline, ''), 1):
            if counter > limit:
                break
            line = raw_line.decode('utf8')
            unvocline = araby.stripTashkeel(line)
            vocalized = pyarabic.number.preTashkeelNumber(
                araby.tokenize(unvocline))
            vocalized = u' '.join(vocalized)
            if vocalized == unvocline:
                # nothing was vocalized on this line, so it is not scored
                continue
            total += 1
            sim = araby.vocalizedSimilarity(vocalized, araby.stripShadda(line))
            if sim >= 0:
                correct += 1
            else:
                print(u"\t".join([str(sim), str(counter), str(len(vocalized)),
                                  str(len(line)), vocalized,
                                  line]).encode('utf8'))
    finally:
        myfile.close()

    ratio = round(correct * 100.00 / total, 2) if total else 0.0
    print("%s %s %s" % (correct, total, ratio))
def test():
    """Vocalize (or strip) Arabic text line by line and print each result.

    Options are taken from ``grabargs()``.  The input is either the ``text``
    option, split on newlines, or the file named by ``filename`` whose lines
    are decoded as UTF-8.  Lines starting with ``#`` are skipped and do not
    count towards ``limit``; at most ``limit`` lines are processed
    (100000000 when no limit is given).  Processing stops at the first empty
    line, which for a file means end of file.

    With ``stripTashkeel`` every line goes through ``araby.stripTashkeel``;
    otherwise it is vocalized by ``ArabicVocalizer.TashkeelClass``.  With
    ``compare`` the input line is used as the reference: it is stripped,
    vocalized again and compared token by token with the reference, and a
    tab-separated statistics row is written in front of each result.

    This is Python 2 code: file lines are byte strings decoded explicitly
    and results are printed as UTF-8 encoded byte strings.

    Fixes over the previous version:
      * text mode no longer skips the second line nor loops forever on a
        ``#`` line (the old code indexed ``lines`` with the processed-line
        counter);
      * percentages are reported as 0.00 instead of raising
        ZeroDivisionError when no token has been counted yet;
      * ``lineTotal`` is always initialised, so ``stripTashkeel`` together
        with ``compare`` no longer fails on an unbound name;
      * only ``IOError`` is caught when opening the file, and the file is
        closed when the run ends.

    NOTE(review): an earlier ``def test()`` with the same logic appears in
    this source; if both live in one module this later one is the one used.
    """
    (filename, text, stripTashkeel, reducedTashkeel, disableSyntax,
     disableSemantic, disableStat, ignore, limit, compare) = grabargs()
    if not text and not filename:
        usage()
        sys.exit(0)

    def percent(part, whole):
        """Return part/whole as a percentage rounded to 2 places (0.0 if whole is 0)."""
        if not whole:
            return 0.0
        return round(part * 100.00 / whole, 2)

    myfile = None
    if text:
        lines = iter(text.split('\n'))
    else:
        try:
            myfile = open(filename)
        except IOError:
            print(" Can't Open the given File  %s" % filename)
            sys.exit()
        # readline() returns '' only at end of file, which ends the iterator.
        lines = (raw.decode('utf8') for raw in iter(myfile.readline, ''))

    try:
        # all things are well, import library
        import core.adaat
        import pyarabic.araby as araby

        if not limit:
            limit = 100000000

        vocalizer = None
        if not stripTashkeel:
            vocalizer = ArabicVocalizer.TashkeelClass()
            if ignore:
                vocalizer.disableLastMark()
            if disableSemantic:
                vocalizer.disableSemanticAnalysis()
            if disableSyntax:
                vocalizer.disableSyntaxicAnalysis()
            if disableStat:
                vocalizer.disableStatTashkeel()

        # Running totals over all processed lines.
        correct = 0
        incorrect = 0
        total = 0
        LettersError = 0
        WLMIncorrect = 0  # words still wrong once the last haraka is ignored

        if compare:
            # header of the per-line statistics rows
            print("id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct")

        counter = 1  # 1-based number of the next processed (non-comment) line
        for line in lines:
            if not line or counter > limit:
                break
            if line.startswith('#'):
                continue

            lineCorrect = 0
            lineWLMIncorrect = 0
            lineTotal = 0
            if stripTashkeel:
                result = araby.stripTashkeel(line)
            else:
                # vocalize line by line
                if compare:
                    # keep the vocalized input as reference, vocalize its stripped form
                    vocalizedLine = line
                    line = araby.stripTashkeel(line)
                result = vocalizer.tashkeel(line)
                if compare:
                    list1 = vocalizer.analyzer.tokenize(vocalizedLine)
                    list2 = vocalizer.analyzer.tokenize(result)
                    lineTotal = len(list1)
                    total += lineTotal
                    if len(list1) != len(list2):
                        print("lists haven't the same length")
                    else:
                        for ref_word, out_word in zip(list1, list2):
                            simi = araby.vocalizedSimilarity(ref_word, out_word)
                            if simi < 0:
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalizedSimilarity(
                                    araby.stripLastHaraka(ref_word),
                                    araby.stripLastHaraka(out_word))
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1

            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            counter += 1

            # display stat for every line; every piece ends with a tab, so
            # writing them directly matches the old trailing-comma prints.
            if compare:
                sys.stdout.write("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  # id
                    percent(correct, total),  # fully Correct
                    percent(total - WLMIncorrect, total),  # Strip Correct
                    incorrect,  # fully WER
                    WLMIncorrect,  # Strip WER
                    LettersError,  # LER
                    total,  # Total
                ))
                if lineTotal:
                    # line Fully correct
                    sys.stdout.write("%0.2f%%\t" % percent(lineCorrect, lineTotal))
                    # line Strip correct
                    sys.stdout.write("%0.2f%%\t" % percent(
                        lineTotal - lineWLMIncorrect, lineTotal))
            print(result.encode('utf8'))
    finally:
        if myfile is not None:
            myfile.close()