Example #1
0
def test():
	filename, text,  stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs()
	#filename="samples/randomtext.txt"	
	if not text and not filename:
		usage()
		sys.exit(0)
		
	if not text:
		try:
			myfile=open(filename)
		except:
			print " Can't Open the given File ", filename;
			sys.exit();
	else:
		lines = text.split('\n');
	# all things are well, import library
	import core.adaat 
	import pyarabic.araby as araby

	counter=1;
	if not limit : 
		limit=	100000000
	if not stripTashkeel: 
		vocalizer=ArabicVocalizer.TashkeelClass();
		if ignore : 
			vocalizer.disableLastMark();
		if disableSemantic:
			vocalizer.disableSemanticAnalysis();
		if disableSyntax:
			vocalizer.disableSyntaxicAnalysis();
		if disableStat:
			vocalizer.disableStatTashkeel();

	#vocalizer.disableShowCollocationMark();
	#print "show delimiter", vocalizer.collo.showDelimiter;
	#nolimit = True;
	nolimit = False;
	if not text:
		line=(myfile.readline()).decode('utf8');
	else:
		if len(lines)>0:
			line= lines[0];
	correct=0;
	incorrect=0;
	total=0;
	totLetters =0;
	LettersError =0
	WLMIncorrect =0;
	if compare:
		#dispaly stats for the current line
		print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
		
		# print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"
	
	while line and (nolimit or counter<=limit):
		if not line.startswith('#'):
			# lineIncorrect = 0;
			lineCorrect   = 0;
			lineWLMIncorrect =0;
			if stripTashkeel:
				result = araby.stripTashkeel(line);
			else:	#vocalize line by line
				if compare:
					vocalizedLine = line;
					line = araby.stripTashkeel(line)
				result=vocalizer.tashkeel(line);
				#compare resultLine and vocalizedLine
				if compare:
					list1=vocalizer.analyzer.tokenize(vocalizedLine);
					list2=vocalizer.analyzer.tokenize(result);
					#print u":".join(list1).encode('utf8');
					#print u":".join(list2).encode('utf8');
					total+=len(list1);
					lineTotal = len(list1);
					if len(list1)!=len(list2):
						print "lists haven't the same length";
					else:
						for i in range(len(list1)):
							simi = araby.vocalizedSimilarity(list1[i],list2[i]);
							if simi<0:
								LettersError+= -simi;
								incorrect   +=1;
								# lineIncorrect += 1;
								# evaluation without last haraka
								simi2 = araby.vocalizedSimilarity(araby.stripLastHaraka(list1[i]),araby.stripLastHaraka(list2[i]));
								if simi2<0: 
									WLMIncorrect    +=1;
									lineWLMIncorrect+=1;								

							else:
								correct+=1;
								lineCorrect += 1;
					
			#compare resultLine and vocalizedLine
			if reducedTashkeel:
				result= araby.reduceTashkeel(result)
			# print result.encode('utf8');
			counter+=1;

			#display stat for every line
			if compare:
				print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%(
						counter-1,#id
						round(correct*100.00/total,2),#fully Correct
						round((total-WLMIncorrect)*100.00/total,2),#Strip Correct
						incorrect,#fully WER
						WLMIncorrect,#Strip WER
						LettersError,#LER
						total,#Total
						),
				if lineTotal:
					print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal,2),#line Fully correct
					print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal,2),#line Strip correct
						
			print result.encode('utf8');
		#get the next line
		if not text:
			line=(myfile.readline()).decode('utf8');
		else:
			if counter<len(lines):
				line= lines[counter];
			else:
				line =None;
Example #2
0
import tashkeel
if __name__ == '__main__':
	filename, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs()
	#filename="samples/randomtext.txt"	
	try:
		myfile=open(filename)
	except:
		print " Can't Open the given File ", filename;

	counter=1;
	if not limit : 
		limit=	100000000
	nolimit = False;
	correct=0;
	total=0;
	line=(myfile.readline()).decode('utf8');
	while line and (nolimit or counter<=limit):
		unvocline= araby.stripTashkeel(line);
		vocalized=pyarabic.number.preTashkeelNumber(araby.tokenize(unvocline));
		vocalized=u' '.join(vocalized);
		if vocalized!=unvocline:
			total+=1;
			sim = araby.vocalizedSimilarity(vocalized, araby.stripShadda( line));
			if sim>=0: correct+=1;
			#		for res in result:
			if sim<0:
				print u"\t".join([str(sim),str(counter),str(len(vocalized)),str(len(line)),vocalized, line]).encode('utf8');
		#get the next line
		line=(myfile.readline()).decode('utf8');
		counter+=1;
	print correct, total, round(correct*100.00/total,2)
Example #3
0
def test():
    filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs(
    )
    #filename="samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)

    if not text:
        try:
            myfile = open(filename)
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby

    counter = 1
    if not limit:
        limit = 100000000
    if not stripTashkeel:
        vocalizer = ArabicVocalizer.TashkeelClass()
        if ignore:
            vocalizer.disableLastMark()
        if disableSemantic:
            vocalizer.disableSemanticAnalysis()
        if disableSyntax:
            vocalizer.disableSyntaxicAnalysis()
        if disableStat:
            vocalizer.disableStatTashkeel()

    #vocalizer.disableShowCollocationMark();
    #print "show delimiter", vocalizer.collo.showDelimiter;
    #nolimit = True;
    nolimit = False
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0
    LettersError = 0
    WLMIncorrect = 0
    if compare:
        #dispaly stats for the current line
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"

        # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"

    while line and (nolimit or counter <= limit):
        if not line.startswith('#'):
            # lineIncorrect = 0;
            lineCorrect = 0
            lineWLMIncorrect = 0
            if stripTashkeel:
                result = araby.stripTashkeel(line)
            else:  #vocalize line by line
                if compare:
                    vocalizedLine = line
                    line = araby.stripTashkeel(line)
                result = vocalizer.tashkeel(line)
                #compare resultLine and vocalizedLine
                if compare:
                    list1 = vocalizer.analyzer.tokenize(vocalizedLine)
                    list2 = vocalizer.analyzer.tokenize(result)
                    #print u":".join(list1).encode('utf8');
                    #print u":".join(list2).encode('utf8');
                    total += len(list1)
                    lineTotal = len(list1)
                    if len(list1) != len(list2):
                        print "lists haven't the same length"
                    else:
                        for i in range(len(list1)):
                            simi = araby.vocalizedSimilarity(
                                list1[i], list2[i])
                            if simi < 0:
                                LettersError += -simi
                                incorrect += 1
                                # lineIncorrect += 1;
                                # evaluation without last haraka
                                simi2 = araby.vocalizedSimilarity(
                                    araby.stripLastHaraka(list1[i]),
                                    araby.stripLastHaraka(list2[i]))
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1

                            else:
                                correct += 1
                                lineCorrect += 1

            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8');
            counter += 1

            #display stat for every line
            if compare:
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  #id
                    round(correct * 100.00 / total, 2),  #fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  #Strip Correct
                    incorrect,  #fully WER
                    WLMIncorrect,  #Strip WER
                    LettersError,  #LER
                    total,  #Total
                ),
                if lineTotal:
                    print "%0.2f%%\t" % round(lineCorrect * 100.00 / lineTotal,
                                              2),  #line Fully correct
                    print "%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal,
                        2),  #line Strip correct

            print result.encode('utf8')
        #get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None