def test_vocalized_similarity(self):
    """Check vocalizedlike/vocalized_similarity on two spellings of one word."""
    # Same word written with and without the explicit sukun on the raa.
    partially_marked = u"ضَربٌ"
    fully_marked = u"ضَرْبٌ"
    self.assertTrue(ar.vocalizedlike(partially_marked, fully_marked))
    similarity = ar.vocalized_similarity(partially_marked, fully_marked)
    self.assertNotEqual(similarity, -2)
    self.assertTrue(similarity)
def detect_number_words(text): """ Detect number words in a text. @param text: input text @type text: unicode @return : number words extracted from text @rtype: integer >>> text2number(u"وجدت خمسمئة وثلاثة وعشرين دينارا") خمسمئة وثلاثة وعشرين """ #~ words = araby.tokenize(text) #print words phrases_context = extract_number_context(text) for ph_con in phrases_context: if len(ph_con) >= 3: previous = ph_con[0] phrase = ph_con[1] nextword = ph_con[2] numberedwords = phrase numeric = text2number(numberedwords) tags = get_previous_tag(previous) vocalized = vocalize_number(araby.strip_tashkeel(\ numberedwords).split(' '), tags) #calcul vocalization similarity : sim = araby.vocalized_similarity(numberedwords, vocalized) voc_unit = vocalize_unit(numeric, nextword) sim_unit = araby.vocalized_similarity(voc_unit, \ nextword) if sim < 0: print u'\t'.join([str(sim), numberedwords, vocalized, \ str(numeric), u' '.join([previous, phrase, nextword]), \ nextword, voc_unit, str(sim_unit)]).encode('utf8')
def test_vocalized_similarity(self):
    """Test vocalized_similarity(word1, word2) on like and unlike words."""
    word1 = u"ضَربٌ"
    word2 = u"ضَرْبٌ"  # same word, fully vocalized
    word3 = u"ضَرْبٍ"  # differs in the final haraka
    self.assertTrue(Araby.vocalized_similarity(word1, word2))
    # fix: bare `assert` is stripped under `python -O` and is inconsistent
    # with the unittest assertions used above — use assertEqual instead.
    self.assertEqual(Araby.vocalized_similarity(word1, word3), -1)
def compare(self, baseline, vocalized_output):
    """Compare a vocalized baseline line with an automatic vocalization result.

    @param baseline: the reference (correctly vocalized) line
    @param vocalized_output: either a list of suggestion dicts (with
        "chosen"/"semi" keys) or a plain vocalized string
    Updates the accumulated correct/incorrect counters held on self.
    """
    myconsole.lineCorrect = 0
    myconsole.lineWLMIncorrect = 0
    inputVocalizedLine = baseline
    inputlist = araby.tokenize(inputVocalizedLine)
    # fix: use isinstance instead of type() == comparisons
    if isinstance(vocalized_output, list):
        outputlist = [x.get("chosen", '') for x in vocalized_output]
        # note: kept for parity with the original; not used below
        result = vocalized_output
        outputlistsemi = [x.get("semi", '') for x in vocalized_output]
    elif isinstance(vocalized_output, str):
        outputlist = araby.tokenize(vocalized_output)
        # the "semi" evaluation ignores the last haraka of every word
        outputlistsemi = [araby.strip_lastharaka(x) for x in outputlist]
    else:
        # fix: message said "vocaluzed ... dict or string" while the
        # accepted types are actually list or string
        print("Incompatible vocalized output, must be list or string",
              type(vocalized_output), vocalized_output)
        sys.exit()
    self.total += len(inputlist)
    self.lineTotal = len(inputlist)
    if len(inputlist) != len(outputlist):
        # token counts diverge: comparison would be meaningless, dump both
        print("lists haven't the same length")
        print(len(inputlist), len(outputlist))
        print(u"# ".join(inputlist).encode('utf8'))
        print(u"# ".join(outputlist).encode('utf8'))
    else:
        for inword, outword, outsemiword in zip(inputlist, outputlist,
                                                outputlistsemi):
            simi = araby.vocalized_similarity(inword, outword)
            if simi < 0:
                # negative similarity encodes the letter-error count
                self.LettersError += -simi
                self.incorrect += 1
                # evaluation without last haraka
                simi2 = araby.vocalized_similarity(inword, outsemiword)
                if simi2 < 0:
                    self.WLMIncorrect += 1
                    self.lineWLMIncorrect += 1
            else:
                self.correct += 1
                self.lineCorrect += 1
    self.counter += 1
def detect_number_words(text):
    """
    Detect number words in a text and report vocalization mismatches.

    Example:
        >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
        خمسمئة وثلاثة وعشرين

    @param text: input text
    @type text: unicode
    @return: number words extracted from text
    @rtype: integer
    """
    phrases_context = extract_number_context(text)
    for ph_con in phrases_context:
        # each context entry is (previous word, number phrase, next word)
        if len(ph_con) >= 3:
            previous = ph_con[0]
            phrase = ph_con[1]
            nextword = ph_con[2]
            numberedwords = phrase
            numeric = text2number(numberedwords)
            tags = get_previous_tag(previous)
            wordlist = araby.strip_tashkeel(numberedwords).split(' ')
            vocalized = vocalize_number(wordlist, tags)
            # vocalization similarity: negative value means a mismatch
            sim = araby.vocalized_similarity(numberedwords, vocalized)
            voc_unit = vocalize_unit(numeric, nextword)
            sim_unit = araby.vocalized_similarity(voc_unit, nextword)
            if sim < 0:
                #~ print u'\t'.join([str(sim), u' '.join(numberedwords), vocalized,
                #~ str(numeric), u' '.join([previous, phrase, nextword]),
                #~ nextword, voc_unit, str(sim_unit)]).encode('utf8')
                # fix: print(x.encode('utf8')) emits a bytes repr (b'...')
                # on Python 3 — print the text directly instead.
                print('\t'.join(
                    [str(sim), ' '.join(numberedwords), ' '.join(vocalized)]))
                print(str(numeric), ' '.join([previous, phrase, nextword]))
                print('\t'.join([nextword, voc_unit, str(sim_unit)]))
def detect_number_words(text):
    """
    Detect number words in a text.

    Example:
        >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
        خمسمئة وثلاثة وعشرين

    @param text: input text
    @type text: unicode
    @return: number words extracted from text
    @rtype: integer
    """
    for context in extract_number_context(text):
        # skip malformed context entries; a full one is
        # (previous word, number phrase, next word)
        if len(context) < 3:
            continue
        previous = context[0]
        phrase = context[1]
        nextword = context[2]
        numberedwords = phrase
        numeric = text2number(numberedwords)
        tags = get_previous_tag(previous)
        stripped_words = araby.strip_tashkeel(numberedwords).split(' ')
        vocalized = vocalize_number(stripped_words, tags)
        # similarity between the original phrase and its re-vocalization;
        # a negative value flags a mismatch
        sim = araby.vocalized_similarity(numberedwords, vocalized)
        voc_unit = vocalize_unit(numeric, nextword)
        sim_unit = araby.vocalized_similarity(voc_unit, nextword)
        if sim < 0:
            print(u'\t'.join([str(sim), u' '.join(numberedwords),
                              u' '.join(vocalized)]))
            print(str(numeric), u' '.join([previous, phrase, nextword]))
            print(u'\t'.join([nextword, voc_unit, str(sim_unit)]))
def test():
    """Console driver: vocalize (or strip) an input file or text line by line.

    Reads options from grabargs(); either strips tashkeel, vocalizes each
    line, or — in compare mode — scores the automatic vocalization against
    the already-vocalized input, printing per-line and cumulative accuracy.
    NOTE(review): this block mixes print() calls with Python-2
    `print x,` statements near the end — confirm which interpreter it
    actually targets before running.
    """
    # unpack command-line options
    options = grabargs()
    filename = options['fname']
    outfilename = options['ofname']
    text = options['text']
    strip_tashkeel = options['strip_tashkeel']
    nocache = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat = options['disableStatistic']
    ignore = options['ignore']
    limit = options['limit']
    compare = options['compare']
    progress = options['progress']
    enable_syn_train = options['train']
    # filename = "samples/randomtext.txt"
    if not text and not filename:
        # no input at all: show usage and stop
        usage()
        sys.exit(0)
    if not text:
        try:
            myfile = open(filename)
            print("input file:", filename)
            if not outfilename:
                # default output name derived from the input file
                outfilename = filename + " (Tashkeel).txt"
            print("output file:", outfilename)
            outfile = open(outfilename, "w")
        except:
            print(" Can't Open the given File ", filename)
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby
    counter = 1
    if not limit:
        # effectively no limit
        limit = 100000000
    if not strip_tashkeel:
        # build and configure the vocalizer from the option flags
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache:
            vocalizer.disable_cache()
            # print "nocache"
        if ignore:
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()
        if enable_syn_train:
            vocalizer.enable_syn_train()
        # print "mishkal-console, vocalizer.anasynt.syntax_train_enabled", vocalizer.anasynt.syntax_train_enabled
        # vocalizer.disableShowCollocationMark()
        # print "show delimiter", vocalizer.collo.showDelimiter
        # nolimit = True
    nolimit = False
    # read the first line (file input is decoded from utf8)
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    # cumulative evaluation counters (compare mode)
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    percent = 0
    if compare:
        # dispaly stats for the current line
        print(
            "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct\tLine"
        )
    while line and (nolimit or counter <= limit):
        # lines starting with "# " are treated as comments and skipped
        if not line.startswith('# '):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:
                # vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)
                if compare:
                    # compare mode: the input is assumed already vocalized;
                    # strip it, re-vocalize, then score against the original
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                        inputUnvocalizedLine)
                    # stemmer = tashaphyne.stemming.ArabicLightStemmer()
                    # ~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine)
                    # ~inputlist = []
                    # ~for txt in texts:
                    # ~inputlist += vocalizer.analyzer.text_tokenize(txt)
                    outputlist = [x.get("chosen", '') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [
                        x.get("semi", '') for x in vocalized_dict
                    ]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        # token counts diverge; dump both lists for debugging
                        print("lists haven't the same length")
                        print(len(inputlist), len(outputlist))
                        print(u"# ".join(inputlist).encode('utf8'))
                        print(u"# ".join(outputlist).encode('utf8'))
                    else:
                        for inword, outword, outsemiword in zip(
                                inputlist, outputlist, outputlistsemi):
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi < 0:
                                # negative similarity encodes letter errors
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(
                                    inword, outsemiword)
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1
            # compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8')
            counter += 1
            # display stat for every line
            if compare:
                print("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  # id
                    round(correct * 100.00 / total, 2),  # fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  # Strip Correct
                    incorrect,  # fully WER
                    WLMIncorrect,  # Strip WER
                    LettersError,  # LER
                    total  # Total
                ))
                # NOTE(review): lineTotal is only assigned inside the
                # compare branch above — verify it is always bound here
                if lineTotal:
                    print("%0.2f%%\t" %
                          round(lineCorrect * 100.00 / lineTotal, 2)
                          )  # line Fully correct
                    print("%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal, 2)
                          )  # line Strip correct
            # ~ print result.strip('\n').encode('utf8'),
            if text:
                print result.strip('\n').encode('utf8'),
            else:
                result_line = result.encode('utf8')
                print result_line
                # add line and new line to output file
                outfile.write(result_line)
                outfile.write("\n")
            if progress and not nolimit:
                # progress indicator on stderr with running accuracy
                # ~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent
                sys.stderr.write(
                    "\r[%d%%]%d/%d lines Full %0.2f Strip %0.2f " % (
                        counter * 100 / limit, counter, limit,
                        round(correct * 100.00 / total, 2),  # fully Correct
                        round((total - WLMIncorrect) * 100.00 / total,
                              2)  # Strip Correct
                    ))
                # ~sys.stderr.write("treatment of "+line.encode('utf8'))
                sys.stderr.flush()
        # get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None
    else:
        # while/else: reached when the loop ends normally
        print("Done")
def test():
    """Console driver (Python 2 variant): vocalize or strip an input line by line.

    Reads options from grabargs(); either strips tashkeel, vocalizes each
    line, or — in compare mode — scores the automatic vocalization against
    the already-vocalized input, printing per-line and cumulative accuracy.
    """
    # unpack command-line options
    options = grabargs()
    filename = options['fname']
    text = options['text']
    strip_tashkeel = options['strip_tashkeel']
    nocache = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat = options['disableStatistic']
    ignore = options['ignore']
    limit = options['limit']
    compare = options['compare']
    progress = options['progress']
    #filename = "samples/randomtext.txt"
    if not text and not filename:
        # no input at all: show usage and stop
        usage()
        sys.exit(0)
    if not text:
        try:
            myfile = open(filename)
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby
    counter = 1
    if not limit:
        # effectively no limit
        limit = 100000000
    if not strip_tashkeel:
        # build and configure the vocalizer from the option flags
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache:
            vocalizer.disable_cache()
            print "nocache"
        if ignore:
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()
        #vocalizer.disableShowCollocationMark()
        #print "show delimiter", vocalizer.collo.showDelimiter
    #nolimit = True
    nolimit = False
    # read the first line (file input is decoded from utf8)
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    # cumulative evaluation counters (compare mode)
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    percent = 0
    if compare:
        #dispaly stats for the current line
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
    while line and (nolimit or counter <= limit):
        if progress and not nolimit:
            # progress indicator on stderr
            #~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent
            sys.stderr.write("\r[%d%%]%d/%d lines" % (counter * 100 / limit,
                                                      counter, limit))
            #~sys.stderr.write("treatment of "+line.encode('utf8'))
            sys.stderr.flush()
        # lines starting with "#" are treated as comments and skipped
        if not line.startswith('#'):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:
                #vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)
                if compare:
                    # compare mode: the input is assumed already vocalized;
                    # strip it, re-vocalize, then score against the original
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(inputUnvocalizedLine)
                    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
                    #~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine)
                    #~inputlist =[]
                    #~for txt in texts:
                    #~inputlist += vocalizer.analyzer.text_tokenize(txt)
                    outputlist = [x.get("chosen", '') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [x.get("semi", '') for x in vocalized_dict]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        # token counts diverge; dump both lists for debugging
                        print "lists haven't the same length"
                        print len(inputlist), len(outputlist)
                        print u"#".join(inputlist).encode('utf8')
                        print u"#".join(outputlist).encode('utf8')
                    else:
                        for inword, outword, outsemiword in zip(inputlist,
                                                                outputlist,
                                                                outputlistsemi):
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi < 0:
                                # negative similarity encodes letter errors
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(inword,
                                                                   outsemiword)
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1
            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8')
            counter += 1
            #display stat for every line
            if compare:
                # trailing commas keep the stats on one output line
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  #id
                    round(correct * 100.00 / total, 2),  #fully Correct
                    round((total - WLMIncorrect) * 100.00 / total, 2),  #Strip Correct
                    incorrect,  #fully WER
                    WLMIncorrect,  #Strip WER
                    LettersError,  #LER
                    total,  #Total
                ),
                # NOTE(review): lineTotal is only assigned inside the
                # compare branch above — verify it is always bound here
                if lineTotal:
                    print "%0.2f%%\t" % round(lineCorrect * 100.00 / lineTotal, 2),  #line Fully correct
                    print "%0.2f%%\t" % round((lineTotal - lineWLMIncorrect) * 100.00 / lineTotal, 2),  #line Strip correct
            print result.encode('utf8')
        #get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None