def run(self):
    """Serve vocalization requests over TCP until the server stops running.

    A listening socket is bound to ``(self.ip, self.port)`` and registered
    with ``self.addSocket``.  The loop then waits in ``select.select`` on
    every registered socket, accepts new connections, and for each readable
    connection:

    * reads the request with ``self.getData`` and takes its ``'text'`` entry;
    * vocalizes every non-empty line that does not start with ``#`` using
      ``self.vocalizer.tashkeel``, joining the results with a leading space;
    * appends a NUL character, runs ``self.post_processing`` and sends the
      UTF-8 byte length of the answer as a decimal string;
    * closes the connection with ``self.closeSock`` (always, via ``finally``).

    The loop ends once ``self.isSocketsEmpty()`` reports no sockets left.
    """
    self.acceptSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.acceptSock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    self.acceptSock.bind((self.ip, self.port))
    self.acceptSock.listen(1)
    self.addSocket(self.acceptSock)

    self.vocalizer = ArabicVocalizer.TashkeelClass('/tmp/mishkal_cache/')
    self.vocalizer.set_log_level(50)  # 50 == critical

    while self.isRunning():
        ready, _, _ = select.select(self.getSockets(), [], [])

        # A readable listening socket means a pending connection: register
        # it and handle its data on a later select() round.
        if self.acceptSock in ready:
            new_conn, _ = self.acceptSock.accept()
            self.addSocket(new_conn)
            ready.remove(self.acceptSock)

        # Nothing but the accept event: go back to waiting.
        if not ready:
            continue

        # Handle outstanding requests.
        for conn in ready:
            options = self.getData(conn)
            text = options['text']
            if not text:
                # NOTE(review): the connection is neither answered nor
                # closed here, exactly as in the original; confirm that
                # this is intended for empty requests.
                print('Debugggg')
                print(text)
                print(options)
                print('\n')
                continue

            pieces = []
            for line in text.split('\n'):
                line = line.strip()
                # skip blank lines and '#' comment lines
                if line == '' or line.startswith('#'):
                    continue
                lineResult = self.vocalizer.tashkeel(line)
                pieces.append(' ' + lineResult)
                if self.isDebug():
                    print(lineResult.strip('\n').encode('utf8'))
            result = u''.join(pieces)

            try:
                answer = result + '\00'  # NUL-terminated payload
                print("--- %s seconds ---" % (time.time() - self.start_time))
                answer = self.post_processing(answer)
                answer_size = str(len(answer.encode('utf-8')))
                print(answer_size)
                # Only the payload size is transmitted; sending the payload
                # itself was already disabled in the original code.
                conn.send(answer_size)
                if self.isDebug():
                    print(result.strip('\n').encode('utf8'))
            finally:
                self.closeSock(conn)

        if self.isSocketsEmpty():
            # Was ``false``, which is an undefined name in Python and raised
            # NameError instead of stopping the loop.
            self.setRunning(False)
def assistanttashkeel(text):
    """
    Get tashkeel with suggestions.

    Builds a new ``TashkeelClass`` with default arguments and returns
    whatever its ``assistanttashkeel`` method produces for ``text``.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    return engine.assistanttashkeel(text)
def tashkeelText(text, lastmark=True): import tashkeel.tashkeel as ArabicVocalizer vocalizer = ArabicVocalizer.TashkeelClass() print "lastMark", lastmark if lastmark == "0": vocalizer.disableLastMark() vocalized_text = vocalizer.tashkeel(text) return vocalized_text
def tashkeel2(text, lastmark):
    """
    Run ``tashkeelOuputHtmlSuggest`` on ``text`` and return its result.

    ``disableLastMark()`` is called first when ``lastmark`` equals the
    string "0" or is falsy.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    if lastmark == "0" or not lastmark:
        engine.disableLastMark()
    return engine.tashkeelOuputHtmlSuggest(text)
def assistanttashkeel(text):
    """
    Get tashkeel with suggestions.

    The vocalizer is created with ``mycache_path`` set to the ``../tmp/``
    path joined onto the directory of this file.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    here = os.path.dirname(__file__)
    cache_dir = os.path.join(here, '../tmp/')
    engine = ArabicVocalizer.TashkeelClass(mycache_path=cache_dir)
    return engine.assistanttashkeel(text)
def tashkeel_text(text, lastmark=True): """ Tashkeel text without suggestions """ import tashkeel.tashkeel as ArabicVocalizer vocalizer = ArabicVocalizer.TashkeelClass() print "lastMark", lastmark if lastmark == "0": vocalizer.disable_last_mark() vocalized_text = vocalizer.tashkeel(text) return vocalized_text
def tashkeel_text(text, lastmark=True):
    """
    Tashkeel text without suggestions.

    Uses the module-level ``ArabicVocalizer`` with ``mycache_path`` set to
    the ``../tmp/`` path joined onto the directory of this file.  When
    ``lastmark`` equals the string "0", ``disable_last_mark()`` is called
    before vocalizing.
    """
    here = os.path.dirname(__file__)
    cache_dir = os.path.join(here, '../tmp/')
    engine = ArabicVocalizer.TashkeelClass(mycache_path=cache_dir)
    last_mark_off = (lastmark == "0")
    if last_mark_off:
        engine.disable_last_mark()
    return engine.tashkeel(text)
def tashkeel2(text, lastmark):
    """
    Tashkeel text with suggestions.

    ``disable_last_mark()`` is called first when ``lastmark`` equals the
    string "0" or is falsy; the value returned is the result of
    ``tashkeel_ouput_html_suggest(text)``.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    if lastmark == "0" or not lastmark:
        engine.disable_last_mark()
    return engine.tashkeel_ouput_html_suggest(text)
def tashkeel2(text, lastmark):
    """
    Tashkeel text with suggestions.

    The vocalizer is created with ``mycache_path`` set to the ``../tmp/``
    path joined onto the directory of this file.  ``disable_last_mark()``
    is called first when ``lastmark`` equals the string "0" or is falsy.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    here = os.path.dirname(__file__)
    cache_dir = os.path.join(here, '../tmp/')
    engine = ArabicVocalizer.TashkeelClass(mycache_path=cache_dir)
    if lastmark == "0" or not lastmark:
        engine.disable_last_mark()
    return engine.tashkeel_ouput_html_suggest(text)
def showCollocations(text):
    """
    Show the collocations found in the text.

    The collocations are looked up from a database extracted from a corpus.
    The work is delegated to ``TashkeelClass.statTashkeel``.

    @param text: a given vocalized text.
    @type text: unicode.
    @return: the text with its collocations quoted.
    @rtype: unicode
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    return engine.statTashkeel(text)
def Comparetashkeel(text):
    """
    Compare automatic vocalization against the diacritics already in ``text``.

    ``text`` is taken as the correctly vocalized reference.  Its tashkeel is
    stripped, the stripped text is vocalized again, the ``'`` and ``~``
    marks are removed from the output, and both texts are tokenized with
    ``vocalizer.analyzer.tokenize``.  Both token lists are printed
    (":"-joined, UTF-8 encoded).  When the lists have the same length each
    token pair is scored with ``araby.vocalizedlike``; otherwise a message
    is printed and both counters stay at zero.

    @param text: the vocalized reference text.
    @return: a list ``[vocalized_text, "correct:NN.NN%",
        "incorrect:NN.NN%", total]`` where ``total`` is the number of
        reference tokens.  Both percentages are reported as 0.00% when
        there are no tokens.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is taken as correctly vocalized
    reference_text = text
    stripped_text = araby.stripTashkeel(text)
    vocalizer = ArabicVocalizer.TashkeelClass()
    vocalized_text = vocalizer.tashkeel(stripped_text)

    # remove collocation symbols before tokenizing the automatic output
    candidate_text = vocalized_text.replace("'", "").replace("~", "")

    list1 = vocalizer.analyzer.tokenize(reference_text)
    list2 = vocalizer.analyzer.tokenize(candidate_text)
    print(u":".join(list1).encode('utf8'))
    print(u":".join(list2).encode('utf8'))

    correct = 0
    incorrect = 0
    total = len(list1)
    if total != len(list2):
        print("lists haven't the same length")
    else:
        for expected, produced in zip(list1, list2):
            if araby.vocalizedlike(expected, produced):
                correct += 1
            else:
                incorrect += 1

    def percent(count):
        # No reference tokens used to raise ZeroDivisionError here.
        if not total:
            return 0.0
        return round(count * 100.00 / total, 2)

    return [
        vocalized_text,
        "correct:%0.2f%%" % percent(correct),
        "incorrect:%0.2f%%" % percent(incorrect),
        total,
    ]
def test():
    """
    Console driver: vocalize (or strip) text line by line and print results.

    Options come from ``grabargs()`` as a dict.  Input is either the
    ``'text'`` option (split on newlines) or the file named by ``'fname'``;
    in file mode every result line is also written to the output file
    (``'ofname'``, defaulting to ``<fname> (Tashkeel).txt``).  With the
    ``'compare'`` option each input line is treated as a vocalized
    reference and per-line / cumulative accuracy figures are printed.

    Calls ``usage()`` and ``sys.exit(0)`` when neither text nor a file name
    is given, and ``sys.exit()`` when opening the files fails.

    NOTE(review): ``total`` stays 0 unless ``compare`` is set, yet the
    progress report divides by it -- looks like a ZeroDivisionError when
    ``progress`` is used without ``compare``; confirm.
    NOTE(review): ``myfile`` / ``outfile`` are never closed in this function.
    NOTE(review): the body mixes ``print(...)`` calls with Python 2 print
    statements, so it only parses under Python 2, where the two-argument
    ``print(a, b)`` calls print a tuple -- confirm the intended interpreter.
    """
    options = grabargs()
    filename = options['fname']
    outfilename = options['ofname']
    text = options['text']
    strip_tashkeel = options['strip_tashkeel']
    nocache = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat = options['disableStatistic']
    ignore = options['ignore']
    limit = options['limit']
    compare = options['compare']
    progress = options['progress']
    enable_syn_train = options['train']
    # filename = "samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)
    if not text:
        # File mode: open the input and the output file together.
        # NOTE(review): the bare except also hides output-file errors while
        # the message only names the input file.
        try:
            myfile = open(filename)
            print("input file:", filename)
            if not outfilename:
                outfilename = filename + " (Tashkeel).txt"
            print("output file:", outfilename)
            outfile = open(outfilename, "w")
        except:
            print(" Can't Open the given File ", filename)
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby
    counter = 1
    if not limit:
        limit = 100000000
    if not strip_tashkeel:
        # The vocalizer is only built when vocalization is requested.
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache:
            vocalizer.disable_cache()
            # print "nocache"
        if ignore:
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()
        if enable_syn_train:
            vocalizer.enable_syn_train()
            # print "mishkal-console, vocalizer.anasynt.syntax_train_enabled", vocalizer.anasynt.syntax_train_enabled
    # vocalizer.disableShowCollocationMark()
    # print "show delimiter", vocalizer.collo.showDelimiter
    # nolimit = True
    nolimit = False
    # Fetch the first line from the file or from the in-memory text.
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    # Cumulative counters for compare mode.
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0  # words still wrong when the last mark is ignored
    percent = 0
    if compare:
        # display the header of the per-line statistics table
        print(
            "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct\tLine"
        )
    while line and (nolimit or counter <= limit):
        # Lines starting with '# ' are treated as comments and skipped.
        # NOTE(review): ``counter`` is only advanced for processed lines; in
        # text mode a comment line would be fetched again -- confirm.
        if not line.startswith('# '):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:
                # vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)
                if compare:
                    # The input line is the reference; vocalize its
                    # stripped form and compare token by token.
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                        inputUnvocalizedLine)
                    # stemmer = tashaphyne.stemming.ArabicLightStemmer()
                    # ~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine)
                    # ~inputlist = []
                    # ~for txt in texts:
                    # ~inputlist += vocalizer.analyzer.text_tokenize(txt)
                    outputlist = [x.get("chosen", '') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [
                        x.get("semi", '') for x in vocalized_dict
                    ]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        print("lists haven't the same length")
                        print(len(inputlist), len(outputlist))
                        print(u"# ".join(inputlist).encode('utf8'))
                        print(u"# ".join(outputlist).encode('utf8'))
                    else:
                        for inword, outword, outsemiword in zip(
                                inputlist, outputlist, outputlistsemi):
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi < 0:
                                # a negative similarity is counted as -simi
                                # letter errors and one wrong word
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(
                                    inword, outsemiword)
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1
            # compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8')
            counter += 1
            # display stat for every line
            # NOTE(review): divides by ``total``; a first line with no tokens
            # would raise ZeroDivisionError -- confirm.
            if compare:
                print("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  # id
                    round(correct * 100.00 / total, 2),  # fully Correct
                    round((total - WLMIncorrect) * 100.00 / total, 2),  # Strip Correct
                    incorrect,  # fully WER
                    WLMIncorrect,  # Strip WER
                    LettersError,  # LER
                    total  # Total
                ))
                if lineTotal:
                    print("%0.2f%%\t" %
                          round(lineCorrect * 100.00 / lineTotal, 2)
                          )  # line Fully correct
                    print("%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal, 2)
                          )  # line Strip correct
            # ~ print result.strip('\n').encode('utf8'),
            if text:
                print result.strip('\n').encode('utf8'),
            else:
                result_line = result.encode('utf8')
                print result_line
                # add line and new line to output file
                outfile.write(result_line)
                outfile.write("\n")
            if progress and not nolimit:
                # ~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent
                sys.stderr.write(
                    "\r[%d%%]%d/%d lines Full %0.2f Strip %0.2f " % (
                        counter * 100 / limit,
                        counter,
                        limit,
                        round(correct * 100.00 / total, 2),  # fully Correct
                        round((total - WLMIncorrect) * 100.00 / total, 2)  # Strip Correct
                    ))
                # ~sys.stderr.write("treatment of "+line.encode('utf8'))
                sys.stderr.flush()
        # get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None
    else:
        # while/else: runs when the loop condition becomes false
        print("Done")
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text.

    ``text`` (stripped of surrounding whitespace) is the reference.  Its
    diacritics are removed with ``araby.strip_tashkeel`` and the result is
    vocalized through ``tashkeel_ouput_html_suggest``, which is consumed
    here as a sequence of dicts with the keys 'chosen', 'semi', 'inflect',
    'link' and 'rule'.  Each reference token is paired with the dict at the
    same position and rendered as a ``<span>`` whose class is one of
    'diff', 'diff-mark', 'diff-word' or 'diff-all'.

    @param text: the vocalized reference text.
    @return: ``[displayed_html, "correct:NN.NN%, incorrect:NN.NN%"]``.

    NOTE(review): calls ``sys.exit()`` when the token counts differ.
    NOTE(review): divides by ``total``; an input with no tokens looks like
    a ZeroDivisionError -- confirm.
    NOTE(review): words are interpolated into HTML attributes without
    escaping -- confirm that callers only pass trusted text.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is taken as correctly vocalized
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    vocalizer = ArabicVocalizer.TashkeelClass()
    #~vocalized_text = vocalizer.tashkeel(text)
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)
    # compare the vocalized text with the correct text
    text1 = correct_text
    #~text2 = vocalized_text
    displayed_html = u""
    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    #~texts = vocalizer.analyzer.split_into_phrases(text1)
    texts = [
        text1,
    ]
    list1 = []
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1) != len(list2):
        # Token counts differ: dump the aligned prefix and abort.
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print(u"'%s'\t'%s'" %
                  (list1[i], list2[i].get('chosen', ''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]
            wo1_strip = wo1
            wo2 = list2[i]['chosen']
            wo2_strip = list2[i]['semi']  # words without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            #~if araby.is_vocalized(wo2) and araby.vocalizedlike(wo1, wo2):
            if araby.vocalizedlike(wo1, wo2):
                # Matching word: emitted with the default 'diff' class.
                if wo2 == "\n":
                    wo2 = "<br/>"
                #~displayed_html += u" " + wo2
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
                correct += 1
            else:
                incorrect += 1
                # green for last mark difference
                wo1_strip = wo1
                #~wo2_strip = araby.strip_lastharaka(wo2)
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    # the 'semi' form matches the reference
                    style = 'diff-mark'
                else:
                    # if the last marks are equal
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    # 'diff-word' when both words end in the same haraka, or
                    # when exactly one of them ends in a haraka
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2) and wm1 == wm2) \
                            or (bool(araby.is_haraka(wm1)) ^ bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff' class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
    per_correct = round(correct * 100.00 / total, 2)
    per_incorrect = round(incorrect * 100.00 / total, 2)
    result = [
        displayed_html,
        "correct:%0.2f%%, incorrect:%0.2f%%" % (per_correct, per_incorrect)
    ]
    return result  #correct*100/total
import tashkeel.tashkeel as tashkeel


def reshape(text):
    """Reshape Arabic text so that it displays from right to left.

    On macOS (``platform.system() == "Darwin"``) the text is returned
    unchanged; elsewhere it is passed through ``arabic_reshaper.reshape``
    and then ``get_display``.
    """
    if platform.system() == "Darwin":
        return text
    reshaped_text = arabic_reshaper.reshape(text)
    return get_display(reshaped_text)


if __name__ == '__main__':
    # ``with`` closes the sample file even if a line fails to process.
    with codecs.open("example_arabic_script.dat", "r", "utf-8") as f:
        transliterator = ALA_LC_Transliterator()
        # Build the vocalizer once instead of once per input line.
        vocalizer = tashkeel.TashkeelClass()
        for line in f:
            print("--------------Original Text--------------")
            text = line.strip()
            print(reshape(text))
            print("--------------Vocalized Text--------------")
            voc = vocalizer.tashkeel(text)
            print(reshape(voc))
            print("--------------Transliterated Text--------------")
            tr = transliterator.do(voc.strip())
            print(reshape(tr))
            print("#########################################")
def assistanttashkeel(text):
    """
    Return the result of ``TashkeelClass.assistanttashkeel`` for ``text``.

    A new ``TashkeelClass`` with default arguments is built on every call.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    return engine.assistanttashkeel(text)
def test():
    """
    Console driver: vocalize (or strip) text line by line and print results.

    ``grabargs()`` supplies a 10-tuple of options.  Input is either the
    given text (split on newlines) or the file ``filename``.  Every
    processed line is printed UTF-8 encoded; with ``compare`` set, the
    input line is treated as a vocalized reference and per-line and
    cumulative statistics are printed in front of the result.

    Calls ``usage()`` and ``sys.exit(0)`` when neither text nor a file name
    is given, and ``sys.exit()`` when the file cannot be opened.

    NOTE(review): the statistics divide by ``total``; a first compared
    line with no tokens looks like a ZeroDivisionError -- confirm.
    NOTE(review): ``myfile`` is never closed in this function.
    """
    filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs(
    )
    #filename="samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)
    if not text:
        try:
            myfile = open(filename)
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby
    counter = 1
    if not limit:
        limit = 100000000
    if not stripTashkeel:
        # The vocalizer is only built when vocalization is requested.
        vocalizer = ArabicVocalizer.TashkeelClass()
        if ignore:
            vocalizer.disableLastMark()
        if disableSemantic:
            vocalizer.disableSemanticAnalysis()
        if disableSyntax:
            vocalizer.disableSyntaxicAnalysis()
        if disableStat:
            vocalizer.disableStatTashkeel()
    #vocalizer.disableShowCollocationMark();
    #print "show delimiter", vocalizer.collo.showDelimiter;
    #nolimit = True;
    nolimit = False
    # Fetch the first line from the file or from the in-memory text.
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    # Cumulative counters for compare mode.
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0  # words still wrong after stripping the last haraka
    if compare:
        # display the header of the per-line statistics table
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
        # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"
    while line and (nolimit or counter <= limit):
        # Lines starting with '#' are treated as comments and skipped.
        # NOTE(review): ``counter`` is only advanced for processed lines; in
        # text mode a comment line would be fetched again -- confirm.
        if not line.startswith('#'):
            # lineIncorrect = 0;
            lineCorrect = 0
            lineWLMIncorrect = 0
            if stripTashkeel:
                result = araby.stripTashkeel(line)
            else:
                #vocalize line by line
                if compare:
                    # keep the reference and vocalize its stripped form
                    vocalizedLine = line
                    line = araby.stripTashkeel(line)
                result = vocalizer.tashkeel(line)
                #compare resultLine and vocalizedLine
                if compare:
                    list1 = vocalizer.analyzer.tokenize(vocalizedLine)
                    list2 = vocalizer.analyzer.tokenize(result)
                    #print u":".join(list1).encode('utf8');
                    #print u":".join(list2).encode('utf8');
                    total += len(list1)
                    lineTotal = len(list1)
                    if len(list1) != len(list2):
                        print "lists haven't the same length"
                    else:
                        for i in range(len(list1)):
                            simi = araby.vocalizedSimilarity(
                                list1[i], list2[i])
                            if simi < 0:
                                # a negative similarity is counted as -simi
                                # letter errors and one wrong word
                                LettersError += -simi
                                incorrect += 1
                                # lineIncorrect += 1;
                                # evaluation without last haraka
                                simi2 = araby.vocalizedSimilarity(
                                    araby.stripLastHaraka(list1[i]),
                                    araby.stripLastHaraka(list2[i]))
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1
            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8');
            counter += 1
            #display stat for every line
            if compare:
                # trailing commas keep statistics and result on one line
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  #id
                    round(correct * 100.00 / total, 2),  #fully Correct
                    round((total - WLMIncorrect) * 100.00 / total, 2),  #Strip Correct
                    incorrect,  #fully WER
                    WLMIncorrect,  #Strip WER
                    LettersError,  #LER
                    total,  #Total
                ),
                if lineTotal:
                    print "%0.2f%%\t" % round(lineCorrect * 100.00 / lineTotal, 2),  #line Fully correct
                    print "%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal, 2),  #line Strip correct
            print result.encode('utf8')
        #get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None