def reducedTashkeelText(text):
    """
    Return the given vocalized text with its harakat/vocalization reduced.

    The work is delegated entirely to araby.reduceTashkeel.
    @param text: a vocalized text.
    @type text: unicode.
    @return: the text with reduced vocalization.
    @rtype: unicode
    """
    reduced_text = araby.reduceTashkeel(text)
    return reduced_text
def reducedTashkeelText(text):
    """
    Reduce the harakat (vocalization marks) carried by a vocalized text.

    This is a thin pass-through to araby.reduceTashkeel.
    @param text: the vocalized text to process.
    @type text: unicode.
    @return: the same text with its vocalization reduced.
    @rtype: unicode
    """
    result = araby.reduceTashkeel(text)
    return result
def test(): options = grabargs() filename = options['fname'] outfilename = options['ofname'] text = options['text'] strip_tashkeel = options['strip_tashkeel'] nocache = options['nocache'] reducedTashkeel = options['reducedTashkeel'] disableSyntax = options['disableSyntax'] disableSemantic = options['disableSemantic'] disableStat = options['disableStatistic'] ignore = options['ignore'] limit = options['limit'] compare = options['compare'] progress = options['progress'] enable_syn_train = options['train'] # filename = "samples/randomtext.txt" if not text and not filename: usage() sys.exit(0) if not text: try: myfile = open(filename) print("input file:", filename) if not outfilename: outfilename = filename + " (Tashkeel).txt" print("output file:", outfilename) outfile = open(outfilename, "w") except: print(" Can't Open the given File ", filename) sys.exit() else: lines = text.split('\n') # all things are well, import library import core.adaat import pyarabic.araby as araby counter = 1 if not limit: limit = 100000000 if not strip_tashkeel: vocalizer = ArabicVocalizer.TashkeelClass() if nocache: vocalizer.disable_cache() # print "nocache" if ignore: vocalizer.disable_last_mark() if disableSemantic: vocalizer.disable_semantic_analysis() if disableSyntax: vocalizer.disable_syntaxic_analysis() if disableStat: vocalizer.disable_stat_tashkeel() if enable_syn_train: vocalizer.enable_syn_train() # print "mishkal-console, vocalizer.anasynt.syntax_train_enabled", vocalizer.anasynt.syntax_train_enabled # vocalizer.disableShowCollocationMark() # print "show delimiter", vocalizer.collo.showDelimiter # nolimit = True nolimit = False if not text: line = (myfile.readline()).decode('utf8') else: if len(lines) > 0: line = lines[0] correct = 0 incorrect = 0 total = 0 totLetters = 0 LettersError = 0 WLMIncorrect = 0 percent = 0 if compare: # dispaly stats for the current line print( "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip 
correct\tLine" ) while line and (nolimit or counter <= limit): if not line.startswith('# '): line = line.strip() lineCorrect = 0 lineWLMIncorrect = 0 if strip_tashkeel: result = araby.strip_tashkeel(line) else: # vocalize line by line if not compare: result = vocalizer.tashkeel(line) if compare: inputVocalizedLine = line inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine) inputUnvocalizedLine = araby.strip_tashkeel(line) vocalized_dict = vocalizer.tashkeel_ouput_html_suggest( inputUnvocalizedLine) # stemmer = tashaphyne.stemming.ArabicLightStemmer() # ~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine) # ~inputlist = [] # ~for txt in texts: # ~inputlist += vocalizer.analyzer.text_tokenize(txt) outputlist = [x.get("chosen", '') for x in vocalized_dict] result = u" ".join(outputlist) outputlistsemi = [ x.get("semi", '') for x in vocalized_dict ] total += len(inputlist) lineTotal = len(inputlist) if len(inputlist) != len(outputlist): print("lists haven't the same length") print(len(inputlist), len(outputlist)) print(u"# ".join(inputlist).encode('utf8')) print(u"# ".join(outputlist).encode('utf8')) else: for inword, outword, outsemiword in zip( inputlist, outputlist, outputlistsemi): simi = araby.vocalized_similarity(inword, outword) if simi < 0: LettersError += -simi incorrect += 1 # evaluation without last haraka simi2 = araby.vocalized_similarity( inword, outsemiword) if simi2 < 0: WLMIncorrect += 1 lineWLMIncorrect += 1 else: correct += 1 lineCorrect += 1 # compare resultLine and vocalizedLine if reducedTashkeel: result = araby.reduceTashkeel(result) # print result.encode('utf8') counter += 1 # display stat for every line if compare: print("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % ( counter - 1, # id round(correct * 100.00 / total, 2), # fully Correct round((total - WLMIncorrect) * 100.00 / total, 2), # Strip Correct incorrect, # fully WER WLMIncorrect, # Strip WER LettersError, # LER total # Total )) if lineTotal: print("%0.2f%%\t" % 
round(lineCorrect * 100.00 / lineTotal, 2) ) # line Fully correct print("%0.2f%%\t" % round( (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal, 2) ) # line Strip correct # ~ print result.strip('\n').encode('utf8'), if text: print result.strip('\n').encode('utf8'), else: result_line = result.encode('utf8') print result_line # add line and new line to output file outfile.write(result_line) outfile.write("\n") if progress and not nolimit: # ~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent sys.stderr.write( "\r[%d%%]%d/%d lines Full %0.2f Strip %0.2f " % ( counter * 100 / limit, counter, limit, round(correct * 100.00 / total, 2), # fully Correct round((total - WLMIncorrect) * 100.00 / total, 2) # Strip Correct )) # ~sys.stderr.write("treatment of "+line.encode('utf8')) sys.stderr.flush() # get the next line if not text: line = (myfile.readline()).decode('utf8') else: if counter < len(lines): line = lines[counter] else: line = None else: print("Done")
def test():
    """
    Console driver: strip, reduce, vocalize, compare or evaluate a text or a
    file line by line.

    Arguments are read from grabargs(). The input is args.text when it is
    set, otherwise the file args.filename. In file mode every result line is
    written to the output file (args.outfile, which defaults to
    "<filename>.Tashkeel.txt"). When args.compareto is given, each input line
    is compared with the matching line of that file through the
    Tashkeel_console object instead of being vocalized. With args.evaluation
    the input line is stripped, vocalized again and compared with itself.

    Exits through sys.exit() when neither text nor file name is given, or
    when a file cannot be opened.
    """
    args = grabargs()
    filename = args.filename
    filename2 = args.compareto  # reference file, used for comparison
    compare = bool(filename2)
    outfilename = args.outfile
    text = args.text
    if not text and not filename:
        print('Try: mishkal-console.py -h')
        sys.exit(0)

    # tashkeel command: anything other than "strip"/"reduce" means vocalize
    command = args.command
    strip_tashkeel = command == "strip"
    reducedTashkeel = command == "reduce"

    # general options
    limit = args.limit
    progress = args.progress
    verbose = args.verbose
    # vocalizer options
    ignore = args.ignore
    cache = args.cache
    disableSyntax = args.syntax
    disableSemantic = args.semantic
    disableStat = args.stat
    enable_syn_train = args.train
    evaluation = args.evaluation

    myfile = None
    outfile = None
    myfile2 = None
    lines = []
    try:
        # Open files
        if not text:
            try:
                myfile = open(filename, encoding='utf8')
                print("input file:", filename)
                if not outfilename:
                    outfilename = filename + ".Tashkeel.txt"
                print("output file:", outfilename)
                # Written as UTF-8 like the input, so the result does not
                # depend on the platform's default encoding.
                outfile = open(outfilename, "w", encoding='utf8')
            except OSError:
                print(" Can't Open the given File ", filename)
                sys.exit()
        else:
            lines = text.strip().split('\n')

        if compare:
            try:
                myfile2 = open(filename2, encoding='utf8')
                print("input file2:", filename2)
            except OSError:
                print(" Can't Open the given File ", filename2)
                sys.exit()

        # all things are well, build the console helper
        myconsole = tashkeel_console.Tashkeel_console()
        if not limit:
            # count lines in file if filename, otherwise count lines in text
            if filename:
                with open(filename, encoding='utf8') as f:
                    limit = sum(1 for _ in f)
            else:
                limit = len(lines)
        # Give the console the effective limit (after the automatic count)
        # so that it matches the bound used by the loop below.
        myconsole.limit = limit

        if not strip_tashkeel:
            vocalizer = ArabicVocalizer.TashkeelClass()
            if cache:
                vocalizer.enable_cache()
                sys.stderr.write(" Mishkal use a cache")
            if ignore:
                vocalizer.disable_last_mark()
            if disableSemantic:
                vocalizer.disable_semantic_analysis()
            if disableSyntax:
                vocalizer.disable_syntaxic_analysis()
            if disableStat:
                vocalizer.disable_stat_tashkeel()
            if enable_syn_train:
                vocalizer.enable_syn_train()
            # if verbose option, then activate logger in ArabicVocalizer
            if verbose:
                vocalizer.enable_verbose()

        # Text mode walks `lines` with its own index, so fetching the next
        # line does not depend on the starting value of myconsole.counter.
        text_index = 0
        if myfile is not None:
            line = myfile.readline()
        else:
            line = lines[0]  # str.split always yields at least one item

        # first reference line to compare with
        if compare:
            line_base = myfile2.readline().strip()

        if evaluation:
            myconsole.header()

        while line and myconsole.counter <= limit:
            line = line.strip()
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            elif compare:
                myconsole.compare(line_base, line)
                myconsole.display_line_stat()
                result = line
                print("base :", line_base)
                print("input:", line)
            elif not evaluation:
                # vocalize line by line
                result = vocalizer.tashkeel(line)
                myconsole.total += len(araby.tokenize(line))
            else:
                # evaluation: strip the line, vocalize it again and compare
                # the suggestion with the original line
                inputUnvocalizedLine = araby.strip_tashkeel(line)
                vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                    inputUnvocalizedLine)
                outputlist = [x.get("chosen", '') for x in vocalized_dict]
                result = u" ".join(outputlist)
                myconsole.compare(line, vocalized_dict)
                # display stat for every line
                myconsole.display_line_stat()

            if reducedTashkeel:
                result = araby.reduceTashkeel(result)

            if text:
                print(result.strip('\n'), end='')
            else:
                result_line = result
                if verbose:
                    print(result_line)
                # add line and new line to output file
                outfile.write(result_line)
                outfile.write("\n")

            if progress:
                # show progress bar
                myconsole.progress(compare)
            myconsole.counter += 1

            # get the next line
            if myfile is not None:
                line = myfile.readline()
            else:
                text_index += 1
                line = lines[text_index] if text_index < len(lines) else None
            # get the next line to compare
            if compare:
                line_base = myfile2.readline().strip()

        if progress:
            myconsole.footer()
    finally:
        # Release the file handles on every exit path, including sys.exit().
        for handle in (myfile, outfile, myfile2):
            if handle is not None:
                handle.close()
def test(): filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs() #filename="samples/randomtext.txt" if not text and not filename: usage() sys.exit(0) if not text: try: myfile=open(filename) except: print " Can't Open the given File ", filename; sys.exit(); else: lines = text.split('\n'); # all things are well, import library import core.adaat import pyarabic.araby as araby counter=1; if not limit : limit= 100000000 if not stripTashkeel: vocalizer=ArabicVocalizer.TashkeelClass(); if ignore : vocalizer.disableLastMark(); if disableSemantic: vocalizer.disableSemanticAnalysis(); if disableSyntax: vocalizer.disableSyntaxicAnalysis(); if disableStat: vocalizer.disableStatTashkeel(); #vocalizer.disableShowCollocationMark(); #print "show delimiter", vocalizer.collo.showDelimiter; #nolimit = True; nolimit = False; if not text: line=(myfile.readline()).decode('utf8'); else: if len(lines)>0: line= lines[0]; correct=0; incorrect=0; total=0; totLetters =0; LettersError =0 WLMIncorrect =0; if compare: #dispaly stats for the current line print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct" # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal" while line and (nolimit or counter<=limit): if not line.startswith('#'): # lineIncorrect = 0; lineCorrect = 0; lineWLMIncorrect =0; if stripTashkeel: result = araby.stripTashkeel(line); else: #vocalize line by line if compare: vocalizedLine = line; line = araby.stripTashkeel(line) result=vocalizer.tashkeel(line); #compare resultLine and vocalizedLine if compare: list1=vocalizer.analyzer.tokenize(vocalizedLine); list2=vocalizer.analyzer.tokenize(result); #print u":".join(list1).encode('utf8'); #print u":".join(list2).encode('utf8'); total+=len(list1); lineTotal = len(list1); if len(list1)!=len(list2): print "lists haven't the same length"; else: for i in 
range(len(list1)): simi = araby.vocalizedSimilarity(list1[i],list2[i]); if simi<0: LettersError+= -simi; incorrect +=1; # lineIncorrect += 1; # evaluation without last haraka simi2 = araby.vocalizedSimilarity(araby.stripLastHaraka(list1[i]),araby.stripLastHaraka(list2[i])); if simi2<0: WLMIncorrect +=1; lineWLMIncorrect+=1; else: correct+=1; lineCorrect += 1; #compare resultLine and vocalizedLine if reducedTashkeel: result= araby.reduceTashkeel(result) # print result.encode('utf8'); counter+=1; #display stat for every line if compare: print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%( counter-1,#id round(correct*100.00/total,2),#fully Correct round((total-WLMIncorrect)*100.00/total,2),#Strip Correct incorrect,#fully WER WLMIncorrect,#Strip WER LettersError,#LER total,#Total ), if lineTotal: print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal,2),#line Fully correct print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal,2),#line Strip correct print result.encode('utf8'); #get the next line if not text: line=(myfile.readline()).decode('utf8'); else: if counter<len(lines): line= lines[counter]; else: line =None;
def test(): options = grabargs() filename = options['fname'] text = options['text'] strip_tashkeel = options['strip_tashkeel'] nocache = options['nocache'] reducedTashkeel = options['reducedTashkeel'] disableSyntax = options['disableSyntax'] disableSemantic = options['disableSemantic'] disableStat = options['disableStatistic'] ignore = options['ignore'] limit = options['limit'] compare = options['compare'] progress = options['progress'] #filename = "samples/randomtext.txt" if not text and not filename: usage() sys.exit(0) if not text: try: myfile = open(filename) except: print " Can't Open the given File ", filename sys.exit() else: lines = text.split('\n') # all things are well, import library import core.adaat import pyarabic.araby as araby counter = 1 if not limit : limit = 100000000 if not strip_tashkeel: vocalizer = ArabicVocalizer.TashkeelClass() if nocache : vocalizer.disable_cache() print "nocache" if ignore : vocalizer.disable_last_mark() if disableSemantic: vocalizer.disable_semantic_analysis() if disableSyntax: vocalizer.disable_syntaxic_analysis() if disableStat: vocalizer.disable_stat_tashkeel() #vocalizer.disableShowCollocationMark() #print "show delimiter", vocalizer.collo.showDelimiter #nolimit = True nolimit = False if not text: line = (myfile.readline()).decode('utf8') else: if len(lines)>0: line = lines[0] correct = 0 incorrect = 0 total = 0 totLetters = 0 LettersError = 0 WLMIncorrect = 0 percent = 0 if compare: #dispaly stats for the current line print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct" while line and (nolimit or counter <= limit): if progress and not nolimit: #~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent sys.stderr.write("\r[%d%%]%d/%d lines" %(counter * 100/ limit, counter, limit)) #~sys.stderr.write("treatment of "+line.encode('utf8')) sys.stderr.flush() if not line.startswith('#'): line = line.strip() lineCorrect = 0 
lineWLMIncorrect = 0 if strip_tashkeel: result = araby.strip_tashkeel(line) else: #vocalize line by line if not compare: result = vocalizer.tashkeel(line) if compare: inputVocalizedLine = line inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine) inputUnvocalizedLine = araby.strip_tashkeel(line) vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(inputUnvocalizedLine) #stemmer=tashaphyne.stemming.ArabicLightStemmer() #~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine) #~inputlist =[] #~for txt in texts: #~inputlist += vocalizer.analyzer.text_tokenize(txt) outputlist = [x.get("chosen",'') for x in vocalized_dict] result = u" ".join(outputlist) outputlistsemi = [x.get("semi",'') for x in vocalized_dict] total += len(inputlist) lineTotal = len(inputlist) if len(inputlist) != len(outputlist): print "lists haven't the same length" print len(inputlist), len(outputlist) print u"#".join(inputlist).encode('utf8') print u"#".join(outputlist).encode('utf8') else: for inword, outword, outsemiword in zip(inputlist, outputlist, outputlistsemi): simi = araby.vocalized_similarity(inword, outword) if simi<0: LettersError += -simi incorrect += 1 # evaluation without last haraka simi2 = araby.vocalized_similarity(inword, outsemiword) if simi2<0: WLMIncorrect += 1 lineWLMIncorrect += 1 else: correct += 1 lineCorrect += 1 #compare resultLine and vocalizedLine if reducedTashkeel: result = araby.reduceTashkeel(result) # print result.encode('utf8') counter += 1 #display stat for every line if compare: print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%( counter-1, #id round(correct*100.00/total, 2), #fully Correct round((total-WLMIncorrect)*100.00/total, 2), #Strip Correct incorrect, #fully WER WLMIncorrect, #Strip WER LettersError, #LER total, #Total ), if lineTotal: print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal, 2), #line Fully correct print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal, 2), #line Strip correct print result.encode('utf8') 
#get the next line if not text: line = (myfile.readline()).decode('utf8') else: if counter<len(lines): line = lines[counter] else: line = None
def test(): filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs( ) #filename="samples/randomtext.txt" if not text and not filename: usage() sys.exit(0) if not text: try: myfile = open(filename) except: print " Can't Open the given File ", filename sys.exit() else: lines = text.split('\n') # all things are well, import library import core.adaat import pyarabic.araby as araby counter = 1 if not limit: limit = 100000000 if not stripTashkeel: vocalizer = ArabicVocalizer.TashkeelClass() if ignore: vocalizer.disableLastMark() if disableSemantic: vocalizer.disableSemanticAnalysis() if disableSyntax: vocalizer.disableSyntaxicAnalysis() if disableStat: vocalizer.disableStatTashkeel() #vocalizer.disableShowCollocationMark(); #print "show delimiter", vocalizer.collo.showDelimiter; #nolimit = True; nolimit = False if not text: line = (myfile.readline()).decode('utf8') else: if len(lines) > 0: line = lines[0] correct = 0 incorrect = 0 total = 0 totLetters = 0 LettersError = 0 WLMIncorrect = 0 if compare: #dispaly stats for the current line print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct" # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal" while line and (nolimit or counter <= limit): if not line.startswith('#'): # lineIncorrect = 0; lineCorrect = 0 lineWLMIncorrect = 0 if stripTashkeel: result = araby.stripTashkeel(line) else: #vocalize line by line if compare: vocalizedLine = line line = araby.stripTashkeel(line) result = vocalizer.tashkeel(line) #compare resultLine and vocalizedLine if compare: list1 = vocalizer.analyzer.tokenize(vocalizedLine) list2 = vocalizer.analyzer.tokenize(result) #print u":".join(list1).encode('utf8'); #print u":".join(list2).encode('utf8'); total += len(list1) lineTotal = len(list1) if len(list1) != len(list2): print "lists haven't the same length" else: 
for i in range(len(list1)): simi = araby.vocalizedSimilarity( list1[i], list2[i]) if simi < 0: LettersError += -simi incorrect += 1 # lineIncorrect += 1; # evaluation without last haraka simi2 = araby.vocalizedSimilarity( araby.stripLastHaraka(list1[i]), araby.stripLastHaraka(list2[i])) if simi2 < 0: WLMIncorrect += 1 lineWLMIncorrect += 1 else: correct += 1 lineCorrect += 1 #compare resultLine and vocalizedLine if reducedTashkeel: result = araby.reduceTashkeel(result) # print result.encode('utf8'); counter += 1 #display stat for every line if compare: print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % ( counter - 1, #id round(correct * 100.00 / total, 2), #fully Correct round((total - WLMIncorrect) * 100.00 / total, 2), #Strip Correct incorrect, #fully WER WLMIncorrect, #Strip WER LettersError, #LER total, #Total ), if lineTotal: print "%0.2f%%\t" % round(lineCorrect * 100.00 / lineTotal, 2), #line Fully correct print "%0.2f%%\t" % round( (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal, 2), #line Strip correct print result.encode('utf8') #get the next line if not text: line = (myfile.readline()).decode('utf8') else: if counter < len(lines): line = lines[counter] else: line = None