-
Notifications
You must be signed in to change notification settings - Fork 0
/
tests.py
60 lines (57 loc) · 2.44 KB
/
tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# import modules to test
import ocr_cleanup
import initial_ocr
from Document import Document
# to write and read output
import csv
# to split by os separator
import os
# to debug
import sys
# a few uses
from collections import Counter
# to get standard directory names
import settings
# test cases
def four_frames_test():
    """Run the end-to-end "four frames" regression test.

    Steps, in order:

    1. Create the working directories under ``tests/four-frames`` whose
       names come from ``settings`` (``images_ready_for_ocr``, ``hocr_dir``
       and ``xml_dir``).
    2. For every file in ``tests/four-frames/original-pictures``, call
       ``initial_ocr.resize_image`` and then
       ``initial_ocr.run_tesseract_on_image`` (both with ``redo=True``).
    3. Call ``ocr_cleanup.cleanup_hocr_files`` to turn the hOCR directory
       into the XML directory.
    4. Load each resulting file with ``Document`` and compare its non-empty,
       stripped lines against the matching ``.txt`` file in
       ``tests/four-frames/limited-correct-output-text``.

    Prints ``Four frames test passed`` when every file matches.

    Raises:
        Exception: if the cleanup step produced no files to compare, if a
            document has a different number of lines than its reference
            file, or if any individual line differs.
    """
    test_root = 'tests/four-frames'
    original_pic_dir = 'tests/four-frames/original-pictures'
    dir_for_bigger_images = os.path.join(test_root, settings.images_ready_for_ocr)
    dir_for_hocr = os.path.join(test_root, settings.hocr_dir)
    dir_for_xml = os.path.join(test_root, settings.xml_dir)

    # makedirs(exist_ok=True) avoids the isdir/mkdir race and also copes
    # with a settings value that names a nested directory.
    for working_dir in (dir_for_bigger_images, dir_for_hocr, dir_for_xml):
        os.makedirs(working_dir, exist_ok=True)

    # Initial pass over the images: resize, then OCR. Sorted so that runs
    # are reproducible regardless of the order the OS lists the files in.
    for filename in sorted(os.listdir(original_pic_dir)):
        full_path = os.path.join(original_pic_dir, filename)
        full_path_for_new_image = os.path.join(dir_for_bigger_images, filename)
        initial_ocr.resize_image(
            full_path, full_path_for_new_image, redo=True, part='digital reading')
        # The image filename is reused as the hOCR output path.
        # NOTE(review): presumably tesseract appends the '.hocr' suffix
        # itself -- confirm against initial_ocr.run_tesseract_on_image.
        full_path_for_hocr = os.path.join(dir_for_hocr, filename)
        initial_ocr.run_tesseract_on_image(
            full_path_for_new_image, full_path_for_hocr, redo=True)

    # Apply the corrections to every hOCR file.
    correct_bags = ocr_cleanup.get_correct_bags()
    word_to_doc = ocr_cleanup.make_matching_dictionary(correct_bags)
    ocr_cleanup.cleanup_hocr_files(dir_for_hocr, dir_for_xml, correct_bags, word_to_doc)

    # Compare every cleaned-up document with its reference text.
    xml_filenames = sorted(os.listdir(dir_for_xml))
    if not xml_filenames:
        # Without this guard an empty output directory would "pass" the
        # test without a single comparison having been made.
        raise Exception('no files found in {0}; nothing was compared'.format(dir_for_xml))

    for filename in xml_filenames:
        doc = Document(os.path.join(dir_for_xml, filename))
        stripped = (str(l).strip() for l in doc.lines)
        lines = [text for text in stripped if text]

        # Swap the trailing 'png.hocr' for 'txt' to get the reference file
        # name (assumes every file in the XML directory ends in 'png.hocr').
        filename_with_txt_ending = filename[:-len('png.hocr')] + 'txt'
        path_to_correct_lines_file = os.path.join(
            test_root, 'limited-correct-output-text', filename_with_txt_ending)
        with open(path_to_correct_lines_file, 'r') as infile:
            correct_lines = [line.strip() for line in infile]

        if len(lines) != len(correct_lines):
            raise Exception('lines has length {0} but correct_lines has length {1} for {2}'.format(len(lines), len(correct_lines), filename))
        for i, (actual, expected) in enumerate(zip(lines, correct_lines)):
            if actual != expected:
                raise Exception('lines[{0}] has value\n{1}\n but correct_lines[{0}] has value\n{2}\n for {3}'.format(i, actual, expected, filename))

    print('Four frames test passed')
# Script entry point: run the four-frames test only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    four_frames_test()