-
Notifications
You must be signed in to change notification settings - Fork 3
/
AccEval.py
71 lines (55 loc) · 2.57 KB
/
AccEval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
'''
This module evaluates the accuracy of an OCR text. It accepts, at minimum,
the text as a list of lines and the dictionary used for evaluation. It
also accepts special rule sets (ie, F/S substitution) but will still
score a text if none is passed in. If specified, will process tokens
generated using end of line hyphen fusing.
Requires: List of lines, processed dictionary.
Optional: Substitution rules, verbose flag, hyphen flag
Module returns a tuple of length 6: total number of capitalized tokens,
total number of capitalized dictionary matches, total number of capitalized
matches through substitution, total number of lower-case tokens, total
number of lower-case dictionary matches, and total number of lower-case
matches through substitution.
'''
import TokenGen
def GetScore(Text,Lexicon,Rules=None,verbose=False,hyphen=False):
    '''
    Count how many tokens of an OCR text match a lexicon or a rule set.

    Text, Lexicon, Rules and verbose are forwarded unchanged to
    TokenGen.break_hyphens, which produces the tokens that are scored.

    Parameters:
        Text    -- the OCR text, passed straight to TokenGen.break_hyphens.
        Lexicon -- container of lower-case dictionary words; only
                   membership tests ("in") are performed on it here.
        Rules   -- optional container of lower-case words accepted through
                   substitution. Defaults to an empty set when omitted or
                   None.
        verbose -- when true, print progress messages and the six totals.
        hyphen  -- accepted for interface compatibility; this function
                   does not read it.

    Returns a 6-tuple:
        (CapsCount, CapsMatch, CapsSub, LowCount, LowMatch, LowSub)
    where the *Count values are token totals, *Match are tokens found in
    Lexicon, and *Sub are tokens not in Lexicon but found in Rules.
    '''
    ## Build a fresh set per call instead of sharing one mutable default
    ## object between every call (and with TokenGen.break_hyphens).
    if Rules is None:
        Rules = set()

    if verbose:
        print("Attempting token matching")
        ## Truthiness test so any empty container counts as "no rules",
        ## consistent with the emptiness check used while scoring.
        if not Rules:
            print("No substitution rules loaded")

    ## NOTE(review): the original comment here described falling back to a
    ## "basic checker" when hyphen handling is not requested, but the
    ## hyphen-aware tokenizer is called unconditionally -- confirm intent.
    Tokens = TokenGen.break_hyphens(Text,Lexicon,Rules,verbose)

    CapsCount = CapsMatch = CapsSub = 0
    LowCount = LowMatch = LowSub = 0

    ## Maintains separate scores for substitution if rules were passed in,
    ## as well as separate scores for capitals and lower-case. Only the first
    ## character decides the bucket, so tokens like "wiU" (will) are not
    ## counted as capitalized, while tokens starting with anything that is
    ## not a lower-case letter land in the capitalized bucket.
    for word in Tokens:
        ## An empty token has no first character to classify; skip it
        ## rather than fail on word[0].
        if not word:
            continue
        LowerWord = word.lower()
        InLexicon = LowerWord in Lexicon
        ## A substitution match only counts when the dictionary lookup fails.
        InRules = (not InLexicon) and bool(Rules) and LowerWord in Rules
        if word[0].islower():
            LowCount += 1
            if InLexicon:
                LowMatch += 1
            elif InRules:
                LowSub += 1
        else:
            CapsCount += 1
            if InLexicon:
                CapsMatch += 1
            elif InRules:
                CapsSub += 1

    if verbose:
        print("\t" + str(CapsCount) + " total capitalized tokens")
        print("\t" + str(CapsMatch) + " total capitalized dictionary matches")
        print("\t" + str(CapsSub) + " total capitalized valid substitutions")
        print("\t" + str(LowCount) + " total lower-case tokens")
        print("\t" + str(LowMatch) + " total lower-case dictionary matches")
        print("\t" + str(LowSub) + " total lower-case valid substitutions\n")

    ## Return the six scores as a tuple.
    return (CapsCount,CapsMatch,CapsSub,LowCount,LowMatch,LowSub)