Example #1
0
def reducedTashkeelText(text):
    """
    Return ``text`` with its tashkeel (harakat / vocalization marks) reduced.

    The work is delegated entirely to ``araby.reduceTashkeel``.

    @param text: a vocalized text.
    @type text: unicode.
    @return: the text with reduced vocalization.
    @rtype: unicode
    """
    reduced = araby.reduceTashkeel(text)
    return reduced
Example #2
0
def reducedTashkeelText(text):
    """
    Return ``text`` with its tashkeel (harakat / vocalization marks) reduced.

    Thin wrapper: the result is whatever ``araby.reduceTashkeel`` returns
    for the given text.

    @param text: a vocalized text.
    @type text: unicode.
    @return: the text with reduced vocalization.
    @rtype: unicode
    """
    reduced_text = araby.reduceTashkeel(text)
    return reduced_text
Example #3
0
def test():
    """
    Console driver: vocalize, strip or evaluate Arabic text line by line.

    Options are read from ``grabargs()``. The input is either the ``text``
    option (split on newlines) or the file named by ``fname``. In file mode
    every result line is printed and also written to ``ofname`` (default:
    ``<fname> (Tashkeel).txt``). Lines starting with ``'# '`` are not
    processed and do not count towards ``limit``.

    With ``compare`` each input line is treated as a vocalized reference:
    it is stripped, re-vocalized, and cumulative plus per-line accuracy
    statistics are printed before the result.

    NOTE(review): the byte-string handling (``readline().decode('utf8')``,
    printing ``.encode('utf8')`` results) is Python 2 semantics. The print
    calls are written in single-argument form so they do not print tuples
    under Python 2.
    """
    options = grabargs()

    filename = options['fname']
    outfilename = options['ofname']
    text = options['text']
    strip_tashkeel = options['strip_tashkeel']
    nocache = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat = options['disableStatistic']
    ignore = options['ignore']
    limit = options['limit']
    compare = options['compare']
    progress = options['progress']
    enable_syn_train = options['train']

    if not text and not filename:
        usage()
        sys.exit(0)

    def _pct(part, whole):
        """Return part/whole as a percentage (2 decimals); 0.0 when whole is 0."""
        return round(part * 100.00 / whole, 2) if whole else 0.0

    myfile = None
    outfile = None
    lines = []
    if not text:
        opening = filename  # path currently being opened, for the error message
        try:
            myfile = open(filename)
            print("input file: %s" % filename)
            if not outfilename:
                outfilename = filename + " (Tashkeel).txt"
            print("output file: %s" % outfilename)
            opening = outfilename
            outfile = open(outfilename, "w")
        except (IOError, OSError):
            if myfile is not None:
                myfile.close()
            print(" Can't Open the given File  %s" % opening)
            sys.exit()
    else:
        lines = text.split('\n')

    try:
        # all things are well, import library
        import core.adaat
        import pyarabic.araby as araby

        counter = 1  # 1-based count of processed (non-comment) lines
        if not limit:
            limit = 100000000
        if not strip_tashkeel:
            vocalizer = ArabicVocalizer.TashkeelClass()
            if nocache:
                vocalizer.disable_cache()
            if ignore:
                vocalizer.disable_last_mark()
            if disableSemantic:
                vocalizer.disable_semantic_analysis()
            if disableSyntax:
                vocalizer.disable_syntaxic_analysis()
            if disableStat:
                vocalizer.disable_stat_tashkeel()
            if enable_syn_train:
                vocalizer.enable_syn_train()

        # debugging switch: set to True to ignore `limit`
        nolimit = False
        # Position inside `lines` (text mode). Kept separate from `counter`,
        # which is not advanced for comment lines and starts at 1.
        line_index = 0
        if myfile is not None:
            line = (myfile.readline()).decode('utf8')
        else:
            line = lines[0] if lines else None
        correct = 0
        incorrect = 0
        total = 0
        LettersError = 0
        WLMIncorrect = 0
        if compare:
            # column header for the statistics printed for every line
            print(
                "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct\tLine"
            )

        # NOTE: an empty line in text mode is falsy and therefore ends the loop.
        while line and (nolimit or counter <= limit):
            if not line.startswith('# '):
                line = line.strip()
                lineCorrect = 0
                lineWLMIncorrect = 0
                lineTotal = 0
                if strip_tashkeel:
                    result = araby.strip_tashkeel(line)
                elif not compare:
                    # vocalize line by line
                    result = vocalizer.tashkeel(line)
                else:
                    # the input line is the vocalized reference
                    inputlist = vocalizer.analyzer.tokenize(line)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                        inputUnvocalizedLine)

                    outputlist = [x.get("chosen", '') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [
                        x.get("semi", '') for x in vocalized_dict
                    ]
                    lineTotal = len(inputlist)
                    total += lineTotal
                    if len(inputlist) != len(outputlist):
                        print("lists haven't the same length")
                        print("%d %d" % (len(inputlist), len(outputlist)))
                        print(u"# ".join(inputlist).encode('utf8'))
                        print(u"# ".join(outputlist).encode('utf8'))
                    else:
                        for inword, outword, outsemiword in zip(
                                inputlist, outputlist, outputlistsemi):
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi < 0:
                                # a negative similarity marks the word as wrong;
                                # its magnitude is added to the letter errors
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(
                                    inword, outsemiword)
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1

                if reducedTashkeel:
                    result = araby.reduceTashkeel(result)
                counter += 1

                # display stat for every line
                if compare:
                    print("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                        counter - 1,  # id
                        _pct(correct, total),  # fully Correct
                        _pct(total - WLMIncorrect, total),  # Strip Correct
                        incorrect,  # fully WER
                        WLMIncorrect,  # Strip WER
                        LettersError,  # LER
                        total  # Total
                    ))
                    if lineTotal:
                        # line Fully correct
                        print("%0.2f%%\t" % _pct(lineCorrect, lineTotal))
                        # line Strip correct
                        print("%0.2f%%\t" %
                              _pct(lineTotal - lineWLMIncorrect, lineTotal))

                if text:
                    # No newline after the result; the trailing space keeps
                    # successive results (and the final "Done") separated.
                    sys.stdout.write(result.strip('\n').encode('utf8') + " ")
                else:
                    result_line = result.encode('utf8')
                    print(result_line)
                    # add line and new line to output file
                    outfile.write(result_line)
                    outfile.write("\n")

            if progress and not nolimit:
                # `total` stays 0 unless compare mode is on, hence the guarded
                # percentage helper.
                sys.stderr.write(
                    "\r[%d%%]%d/%d lines    Full %0.2f Strip %0.2f     " % (
                        counter * 100 / limit,
                        counter,
                        limit,
                        _pct(correct, total),  # fully Correct
                        _pct(total - WLMIncorrect, total)  # Strip Correct
                    ))
                sys.stderr.flush()

            # get the next line
            if myfile is not None:
                line = (myfile.readline()).decode('utf8')
            else:
                line_index += 1
                line = lines[line_index] if line_index < len(lines) else None
        else:
            print("Done")
    finally:
        for handle in (myfile, outfile):
            if handle is not None:
                handle.close()
Example #4
0
def test():
    """
    Console driver: vocalize, strip, reduce, compare or evaluate Arabic text.

    Arguments come from ``grabargs()``. The input is either ``args.text``
    (stripped, then split on newlines) or the UTF-8 file ``args.filename``.
    In file mode results are written to ``args.outfile`` (default:
    ``<filename>.Tashkeel.txt``) and echoed only when ``args.verbose`` is
    set; in text mode they are printed without a trailing newline.

    Per-line behaviour, in priority order:
      * ``command == "strip"``: output the line with tashkeel stripped;
      * ``args.compareto`` given: compare the line with the matching line
        of that file through ``myconsole.compare`` and output it unchanged;
      * ``args.evaluation``: strip the line, re-vocalize it and score the
        suggestion against the original line;
      * otherwise: vocalize the line.
    ``command == "reduce"`` additionally passes the result through
    ``araby.reduceTashkeel``.
    """
    args = grabargs()

    filename = args.filename
    filename2 = args.compareto  # reference file used for comparison
    compare = bool(filename2)
    outfilename = args.outfile
    text = args.text
    if not text and not filename:
        print('Try: mishkal-console.py -h')
        sys.exit(0)
    # tashkeel command: anything but "strip"/"reduce" means plain vocalization
    command = args.command
    strip_tashkeel = command == "strip"
    reducedTashkeel = command == "reduce"
    # general options
    limit = args.limit
    progress = args.progress
    verbose = args.verbose

    # options
    ignore = args.ignore
    cache = args.cache
    disableSyntax = args.syntax
    disableSemantic = args.semantic
    disableStat = args.stat
    enable_syn_train = args.train
    evaluation = args.evaluation

    myfile = None
    outfile = None
    myfile2 = None
    lines = []
    try:
        # Open files
        if not text:
            opening = filename  # path currently being opened, for the message
            try:
                myfile = open(filename, encoding='utf8')
                print("input file:", filename)
                if not outfilename:
                    outfilename = filename + ".Tashkeel.txt"
                print("output file:", outfilename)
                opening = outfilename
                # the input is read as UTF-8, so write the output the same way
                outfile = open(outfilename, "w", encoding='utf8')
            except OSError:
                print(" Can't Open the given File ", opening)
                sys.exit()
        else:
            lines = text.strip().split('\n')
        if compare:
            try:
                myfile2 = open(filename2, encoding='utf8')
                print("input file2:", filename2)
            except OSError:
                print(" Can't Open the given File ", filename2)
                sys.exit()

        myconsole = tashkeel_console.Tashkeel_console()
        # NOTE(review): the console receives the raw option value, before the
        # default below is computed - confirm this is what progress() expects.
        myconsole.limit = limit
        if not limit:
            # no explicit limit: process every line of the actual input source
            if myfile is not None:
                with open(filename, encoding='utf8') as f:
                    limit = sum(1 for _ in f)
            else:
                limit = len(lines)
        if not strip_tashkeel:
            vocalizer = ArabicVocalizer.TashkeelClass()
            if cache:
                vocalizer.enable_cache()
                sys.stderr.write(" Mishkal use a cache")
            if ignore:
                vocalizer.disable_last_mark()
            if disableSemantic:
                vocalizer.disable_semantic_analysis()
            if disableSyntax:
                vocalizer.disable_syntaxic_analysis()
            if disableStat:
                vocalizer.disable_stat_tashkeel()
            if enable_syn_train:
                vocalizer.enable_syn_train()
            # if verbose option, then activate logger in ArabicVocalizer
            if verbose:
                vocalizer.enable_verbose()

        # Position inside `lines` (text mode); independent of
        # myconsole.counter, whose starting value is set by the console class.
        line_index = 0
        if myfile is not None:
            line = myfile.readline()
        else:
            line = lines[0] if lines else None
        # first reference line to compare against
        if compare:
            line_base = myfile2.readline().strip()
        if evaluation:
            myconsole.header()

        # NOTE: an empty line in text mode is falsy and therefore ends the loop.
        while line and myconsole.counter <= limit:
            line = line.strip()
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            elif compare:
                myconsole.compare(line_base, line)
                myconsole.display_line_stat()
                result = line
                print("base :", line_base)
                print("input:", line)
            elif not evaluation:
                # vocalize line by line
                result = vocalizer.tashkeel(line)
                myconsole.total += len(araby.tokenize(line))
            else:
                inputUnvocalizedLine = araby.strip_tashkeel(line)
                vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                    inputUnvocalizedLine)
                outputlist = [x.get("chosen", '') for x in vocalized_dict]
                result = u" ".join(outputlist)
                myconsole.compare(line, vocalized_dict)
                # display stat for every line
                myconsole.display_line_stat()

            if reducedTashkeel:
                result = araby.reduceTashkeel(result)

            if text:
                print(result.strip('\n'), end='')
            else:
                if verbose:
                    print(result)
                # add line and new line to output file
                outfile.write(result)
                outfile.write("\n")

            if progress:
                # show progress bar
                myconsole.progress(compare)

            myconsole.counter += 1
            # get the next line
            if myfile is not None:
                line = myfile.readline()
            else:
                line_index += 1
                line = lines[line_index] if line_index < len(lines) else None
            # get the next line to compare
            if compare:
                line_base = myfile2.readline().strip()

        if progress:
            myconsole.footer()
    finally:
        for handle in (myfile, outfile, myfile2):
            if handle is not None:
                handle.close()
Example #5
0
def test():
	filename, text,  stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs()
	#filename="samples/randomtext.txt"	
	if not text and not filename:
		usage()
		sys.exit(0)
		
	if not text:
		try:
			myfile=open(filename)
		except:
			print " Can't Open the given File ", filename;
			sys.exit();
	else:
		lines = text.split('\n');
	# all things are well, import library
	import core.adaat 
	import pyarabic.araby as araby

	counter=1;
	if not limit : 
		limit=	100000000
	if not stripTashkeel: 
		vocalizer=ArabicVocalizer.TashkeelClass();
		if ignore : 
			vocalizer.disableLastMark();
		if disableSemantic:
			vocalizer.disableSemanticAnalysis();
		if disableSyntax:
			vocalizer.disableSyntaxicAnalysis();
		if disableStat:
			vocalizer.disableStatTashkeel();

	#vocalizer.disableShowCollocationMark();
	#print "show delimiter", vocalizer.collo.showDelimiter;
	#nolimit = True;
	nolimit = False;
	if not text:
		line=(myfile.readline()).decode('utf8');
	else:
		if len(lines)>0:
			line= lines[0];
	correct=0;
	incorrect=0;
	total=0;
	totLetters =0;
	LettersError =0
	WLMIncorrect =0;
	if compare:
		#dispaly stats for the current line
		print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
		
		# print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"
	
	while line and (nolimit or counter<=limit):
		if not line.startswith('#'):
			# lineIncorrect = 0;
			lineCorrect   = 0;
			lineWLMIncorrect =0;
			if stripTashkeel:
				result = araby.stripTashkeel(line);
			else:	#vocalize line by line
				if compare:
					vocalizedLine = line;
					line = araby.stripTashkeel(line)
				result=vocalizer.tashkeel(line);
				#compare resultLine and vocalizedLine
				if compare:
					list1=vocalizer.analyzer.tokenize(vocalizedLine);
					list2=vocalizer.analyzer.tokenize(result);
					#print u":".join(list1).encode('utf8');
					#print u":".join(list2).encode('utf8');
					total+=len(list1);
					lineTotal = len(list1);
					if len(list1)!=len(list2):
						print "lists haven't the same length";
					else:
						for i in range(len(list1)):
							simi = araby.vocalizedSimilarity(list1[i],list2[i]);
							if simi<0:
								LettersError+= -simi;
								incorrect   +=1;
								# lineIncorrect += 1;
								# evaluation without last haraka
								simi2 = araby.vocalizedSimilarity(araby.stripLastHaraka(list1[i]),araby.stripLastHaraka(list2[i]));
								if simi2<0: 
									WLMIncorrect    +=1;
									lineWLMIncorrect+=1;								

							else:
								correct+=1;
								lineCorrect += 1;
					
			#compare resultLine and vocalizedLine
			if reducedTashkeel:
				result= araby.reduceTashkeel(result)
			# print result.encode('utf8');
			counter+=1;

			#display stat for every line
			if compare:
				print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%(
						counter-1,#id
						round(correct*100.00/total,2),#fully Correct
						round((total-WLMIncorrect)*100.00/total,2),#Strip Correct
						incorrect,#fully WER
						WLMIncorrect,#Strip WER
						LettersError,#LER
						total,#Total
						),
				if lineTotal:
					print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal,2),#line Fully correct
					print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal,2),#line Strip correct
						
			print result.encode('utf8');
		#get the next line
		if not text:
			line=(myfile.readline()).decode('utf8');
		else:
			if counter<len(lines):
				line= lines[counter];
			else:
				line =None;
Example #6
0
def test():
    options = grabargs()

    filename = options['fname']
    text     = options['text']
    strip_tashkeel  = options['strip_tashkeel']
    nocache         = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax   = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat     = options['disableStatistic']
    ignore = options['ignore']
    limit  = options['limit']
    compare = options['compare']
    progress = options['progress']
        
    #filename = "samples/randomtext.txt"    
    if not text and not filename:
        usage()
        sys.exit(0)
        
    if not text:
        try:
            myfile = open(filename)
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat 
    import pyarabic.araby as araby

    counter = 1
    if not limit : 
        limit = 100000000
    if not strip_tashkeel: 
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache : 
            vocalizer.disable_cache()
            print "nocache"
        if ignore : 
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()

    #vocalizer.disableShowCollocationMark()
    #print "show delimiter", vocalizer.collo.showDelimiter
    #nolimit = True
    nolimit = False
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines)>0:
            line = lines[0]
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    percent = 0
    if compare:
        #dispaly stats for the current line
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
        
    while line and (nolimit or counter <= limit):
        if progress and not nolimit:
            #~percent = (counter * 100/ limit ) if (counter / limit * 100 >percent) else percent
            sys.stderr.write("\r[%d%%]%d/%d lines" %(counter * 100/ limit, counter, limit))
            #~sys.stderr.write("treatment of "+line.encode('utf8'))
            sys.stderr.flush()
        if not line.startswith('#'):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:    #vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)                    
                if compare:
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(inputUnvocalizedLine)


                    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
                    #~texts = vocalizer.analyzer.split_into_phrases(inputVocalizedLine)
                    #~inputlist =[]
                    #~for txt in texts:
                        #~inputlist += vocalizer.analyzer.text_tokenize(txt)
                    outputlist = [x.get("chosen",'') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [x.get("semi",'') for x in vocalized_dict]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        print "lists haven't the same length"
                        print len(inputlist), len(outputlist)
                        print u"#".join(inputlist).encode('utf8')
                        print u"#".join(outputlist).encode('utf8')
                    else:
                        for inword, outword, outsemiword in zip(inputlist, outputlist, outputlistsemi):
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi<0:
                                LettersError += -simi
                                incorrect    += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(inword, outsemiword)
                                if simi2<0: 
                                    WLMIncorrect     += 1
                                    lineWLMIncorrect += 1                                
                            else:
                                correct += 1
                                lineCorrect  += 1
                    
            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8')
            counter += 1

            #display stat for every line
            if compare:
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%(
                        counter-1, #id
                        round(correct*100.00/total, 2), #fully Correct
                        round((total-WLMIncorrect)*100.00/total, 2), #Strip Correct
                        incorrect, #fully WER
                        WLMIncorrect, #Strip WER
                        LettersError, #LER
                        total, #Total
                        ), 
                if lineTotal:
                    print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal, 2), #line Fully correct
                    print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal, 2), #line Strip correct
                        
            print result.encode('utf8')
        #get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter<len(lines):
                line = lines[counter]
            else:
                line = None
Example #7
0
def test():
    filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs(
    )
    #filename="samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)

    if not text:
        try:
            myfile = open(filename)
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby

    counter = 1
    if not limit:
        limit = 100000000
    if not stripTashkeel:
        vocalizer = ArabicVocalizer.TashkeelClass()
        if ignore:
            vocalizer.disableLastMark()
        if disableSemantic:
            vocalizer.disableSemanticAnalysis()
        if disableSyntax:
            vocalizer.disableSyntaxicAnalysis()
        if disableStat:
            vocalizer.disableStatTashkeel()

    #vocalizer.disableShowCollocationMark();
    #print "show delimiter", vocalizer.collo.showDelimiter;
    #nolimit = True;
    nolimit = False
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    if compare:
        #dispaly stats for the current line
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"

        # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"

    while line and (nolimit or counter <= limit):
        if not line.startswith('#'):
            # lineIncorrect = 0;
            lineCorrect = 0
            lineWLMIncorrect = 0
            if stripTashkeel:
                result = araby.stripTashkeel(line)
            else:  #vocalize line by line
                if compare:
                    vocalizedLine = line
                    line = araby.stripTashkeel(line)
                result = vocalizer.tashkeel(line)
                #compare resultLine and vocalizedLine
                if compare:
                    list1 = vocalizer.analyzer.tokenize(vocalizedLine)
                    list2 = vocalizer.analyzer.tokenize(result)
                    #print u":".join(list1).encode('utf8');
                    #print u":".join(list2).encode('utf8');
                    total += len(list1)
                    lineTotal = len(list1)
                    if len(list1) != len(list2):
                        print "lists haven't the same length"
                    else:
                        for i in range(len(list1)):
                            simi = araby.vocalizedSimilarity(
                                list1[i], list2[i])
                            if simi < 0:
                                LettersError += -simi
                                incorrect += 1
                                # lineIncorrect += 1;
                                # evaluation without last haraka
                                simi2 = araby.vocalizedSimilarity(
                                    araby.stripLastHaraka(list1[i]),
                                    araby.stripLastHaraka(list2[i]))
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1

                            else:
                                correct += 1
                                lineCorrect += 1

            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8');
            counter += 1

            #display stat for every line
            if compare:
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  #id
                    round(correct * 100.00 / total, 2),  #fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  #Strip Correct
                    incorrect,  #fully WER
                    WLMIncorrect,  #Strip WER
                    LettersError,  #LER
                    total,  #Total
                ),
                if lineTotal:
                    print "%0.2f%%\t" % round(lineCorrect * 100.00 / lineTotal,
                                              2),  #line Fully correct
                    print "%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal,
                        2),  #line Strip correct

            print result.encode('utf8')
        #get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None