-
Notifications
You must be signed in to change notification settings - Fork 0
/
ctocr.py
330 lines (293 loc) · 10.9 KB
/
ctocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
'''
Created on Aug 5, 2013
source code from:
http://craiget.com/extracting-table-data-from-pdfs-with-ocr/
my page with info:
https://docs.google.com/document/d/10UH7dH13te816lPPZARTfG3oK5eXq__W78q5viXgjrU/edit?usp=sharing
use PILLOW instead of PIL:
sudo pip uninstall PIL
sudo pip install pillow -U
tesseract install procedure:
sudo apt-get install tesseract-ocr
Russian language:
sudo apt-get install tesseract-ocr-rus
ToDo:
1. find two tables at one page and split to two pages
2. find index at field - done
Release History:
R0.1 2013-08-07 Initial Release. Works with Russian, INDEX caught well
R0.2 2013-08-09 Fix bug for PIL library (PIL import was wrong)
R0.3
R0.4
'''
from PIL import Image, ImageOps
###############################################################################
#import Image, ImageOps
import subprocess, sys, os, glob
import time
from time import sleep
# Release number of this script (the module docstring's Release History lists R0.1-R0.4).
RELEASE = 0.4
# minimum run of adjacent pixels to call something a line
# get_hlines() compares its longest horizontal run against H_THRESH and
# get_vlines() compares its longest vertical run against V_THRESH; both use a
# strict ">" comparison, so a run must be longer than the value given here.
H_THRESH = 300
V_THRESH = 300
def get_hlines(pix, w, h, thresh=None, black_pixel=(0, 255)):
    """Find image rows that contain a long horizontal run of black pixels.

    Args:
        pix: pixel accessor indexable as ``pix[x, y]``.
        w: number of columns to scan.
        h: number of rows to scan.
        thresh: a row qualifies when its longest run of consecutive black
            pixels is strictly greater than this.  Defaults to the
            module-level ``H_THRESH``.
        black_pixel: value a pixel must equal to count as black.  The default
            ``(0, 255)`` is presumably a two-band (luminance, alpha) pixel --
            confirm against the image mode actually produced upstream.

    Returns:
        A list of ``(x1, y, x2, y)`` tuples, one per qualifying row, where
        ``x1`` / ``x2`` are the first and last black pixel found anywhere in
        that row (not only inside the longest run).
    """
    if thresh is None:
        thresh = H_THRESH
    hlines = []
    for y in range(h):
        first = last = None
        current = longest = 0
        for x in range(w):
            if pix[x, y] == black_pixel:
                current += 1
                # Compare against None: column 0 is a valid start position
                # and must not be overwritten by a later black pixel.
                if first is None:
                    first = x
                last = x
            else:
                if current > longest:
                    longest = current
                current = 0
        # A run that touches the right edge never reaches the else branch
        # above, so fold it in here or the line would be missed.
        if current > longest:
            longest = current
        if longest > thresh:
            hlines.append((first, y, last, y))
    return hlines
def get_vlines(pix, w, h, thresh=None, black_pixel=(0, 255)):
    """Find image columns that contain a long vertical run of black pixels.

    Args:
        pix: pixel accessor indexable as ``pix[x, y]``.
        w: number of columns to scan.
        h: number of rows to scan.
        thresh: a column qualifies when its longest run of consecutive black
            pixels is strictly greater than this.  Defaults to the
            module-level ``V_THRESH``.
        black_pixel: value a pixel must equal to count as black.  The default
            ``(0, 255)`` is presumably a two-band (luminance, alpha) pixel --
            confirm against the image mode actually produced upstream.

    Returns:
        A list of ``(x, y1, x, y2)`` tuples, one per qualifying column, where
        ``y1`` / ``y2`` are the first and last black pixel found anywhere in
        that column (not only inside the longest run).
    """
    if thresh is None:
        thresh = V_THRESH
    vlines = []
    for x in range(w):
        first = last = None
        current = longest = 0
        for y in range(h):
            if pix[x, y] == black_pixel:
                current += 1
                # Compare against None: row 0 is a valid start position and
                # must not be overwritten by a later black pixel.
                if first is None:
                    first = y
                last = y
            else:
                if current > longest:
                    longest = current
                current = 0
        # A run that touches the bottom edge never reaches the else branch
        # above, so fold it in here or the line would be missed.
        if current > longest:
            longest = current
        if longest > thresh:
            vlines.append((x, first, x, last))
    return vlines
def get_cols(vlines):
    """Turn a list of vertical lines into column bounding boxes.

    Each consecutive pair of lines whose x positions differ by more than one
    pixel yields one column: the top-left corner comes from the left line's
    first two values and the bottom-right corner from the right line's last
    two values.  Pairs that are only one pixel apart (or closer) yield
    nothing.

    Returns a list of ``(x1, y1, x2, y2)`` tuples.
    """
    return [
        (left[0], left[1], right[2], right[3])
        for left, right in zip(vlines, vlines[1:])
        if right[0] - left[0] > 1
    ]
def get_rows(hlines):
    """Turn a list of horizontal lines into row bounding boxes.

    Each consecutive pair of lines where the lower line's y (index 1) exceeds
    the upper line's y (index 3) by more than one pixel yields one row: the
    top-left corner comes from the upper line's first two values and the
    bottom-right corner from the lower line's last two values.

    Returns a list of ``(x1, y1, x2, y2)`` tuples.
    """
    return [
        (upper[0], upper[1], lower[2], lower[3])
        for upper, lower in zip(hlines, hlines[1:])
        if lower[1] - upper[3] > 1
    ]
def get_cells(rows, cols):
    """Build the grid of cell boxes from row and column boxes.

    The result is a dict of dicts, ``cells[row_index][col_index]``, whose
    values are ``(x1, y1, x2, y2)`` tuples: the x extent is taken from the
    column box and the y extent from the row box.  Every row index is present
    even when ``cols`` is empty.
    """
    return {
        r: {
            c: (col_box[0], row_box[1], col_box[2], row_box[3])
            for c, col_box in enumerate(cols)
        }
        for r, row_box in enumerate(rows)
    }
# Offsets added to (x1, y1, x2, y2) of cell [0][0] to reach a text field that
# sits outside the table grid; keyed by the `field` argument of get_index().
_FIELD_OFFSETS = {
    "NAME": (-110, -450, 650, -420),
    "INDEX": (110, -220, 50, -220),
}


def _trim_to_background(region, bgcolor):
    """Crop `region` inwards along its diagonals until background is reached.

    Starting at the top-left corner and stepping one pixel right and down at a
    time, find the first pixel equal to `bgcolor`; do the same from the
    bottom-right corner stepping left and up.  Those two pixels become the
    crop box.  If no usable background pixel is found before the walks leave
    the image or meet each other, the region is returned untrimmed (the
    unbounded walk would otherwise index outside the image).
    """
    pix = region.load()
    width, height = region.size
    limit = min(width, height)
    lead = 0
    while lead < limit and pix[lead, lead] != bgcolor:
        lead += 1
    trail = 0
    while trail < limit and pix[width - 1 - trail, height - 1 - trail] != bgcolor:
        trail += 1
    if lead + trail >= limit - 1:
        return region
    return region.crop((lead, lead, width - 1 - trail, height - 1 - trail))


def _ocr_region(im, box, fbase, empty):
    """Crop `box` out of `im`, run Tesseract on it and return the first line.

    The crop is converted to grayscale, thresholded at 200 into pure 0/255,
    border-trimmed, saved as ``<fbase>.tif`` and passed to ``tesseract``,
    which writes ``<fbase>.txt``.  Returns the first line of that file with
    surrounding whitespace stripped, or `empty` when the file has no lines.
    """
    ftif = "%s.tif" % fbase
    ftxt = "%s.txt" % fbase

    # extract cell from whole image, grayscale (1-color channel), monochrome
    region = ImageOps.grayscale(im.crop(box))
    region = region.point(lambda p: 255 if p > 200 else 0)

    # determine background color (most used color)
    histo = region.histogram()
    bgcolor = 0 if histo[0] > histo[255] else 255

    # save as TIFF and extract text with Tesseract OCR
    _trim_to_background(region, bgcolor).save(ftif, "TIFF")

    # Argument list instead of a shell string so paths need no quoting, and
    # stderr goes to the null device: a PIPE that nobody reads can block the
    # child once the pipe buffer fills.
    # NOTE(review): "-psm 7" (treat the image as a single text line) is the
    # spelling the original command used; newer Tesseract releases may expect
    # "--psm" -- confirm against the installed version.
    with open(os.devnull, "w") as devnull:
        subprocess.call(
            ["tesseract", "-l", "rus", "-psm", "7", ftif, fbase],
            stderr=devnull,
        )

    with open(ftxt) as result:
        lines = [line.strip() for line in result]
    return lines[0] if lines else empty


def ocr_cell(im, cells, x, y):
    """Return OCRed text from cell ``cells[x][y]`` of image `im`.

    Temporary files are named ``PATH_TEMP/<x>-<y>.tif`` / ``.txt``.
    Returns ``"0"`` when Tesseract produced no text lines.
    """
    fbase = PATH_TEMP + "/%d-%d" % (x, y)
    return _ocr_region(im, cells[x][y], fbase, "0")


###############################################################################
# Same OCR pipeline as ocr_cell(), but for a single text field located
# relative to cell [0][0] rather than for a table cell.
def get_index(im, cells, field="NAME"):
    """Return OCRed text of the `field` area positioned relative to cell [0][0].

    Args:
        im: image to crop from.
        cells: mapping as returned by get_cells().
        field: ``"NAME"`` or ``"INDEX"``; selects the fixed pixel offsets
            applied to cell [0][0] to find the field.

    Returns:
        ``"EMPTY"`` when there is no cell [0][0] (e.g. an empty half page) or
        Tesseract produced no text lines; otherwise the first recognised line.

    Any other `field` value prints a message and exits the process with
    status 1.
    """
    try:  # if the bottom half page is empty there is no cell to anchor on
        square_0 = cells[0][0]
    except KeyError:
        return "EMPTY"

    offsets = _FIELD_OFFSETS.get(field)
    if offsets is None:
        print("Wrong option for get_index() , STOP !!")
        sys.exit(1)

    phone_field = tuple(c + o for c, o in zip(square_0, offsets))  # (x1,y1,x2,y2)
    return _ocr_region(im, phone_field, PATH_TEMP + "/index_field", "EMPTY")
###############################################################################
###############################################################################
def cut_half_page(im, page):
    """Return the top or bottom half of `im` as a new image.

    Args:
        im: source image (anything exposing ``size``, ``crop`` as used below).
        page: ``0`` selects the top half; any other value selects the bottom
            half.

    The half is written to ``halph_page_file.png`` in the current directory
    and read back, so that file is overwritten on every call.
    """
    width, height = im.size
    # Floor division keeps the split row an integer on Python 3 as well.
    middle = height // 2
    if page == 0:
        box = (0, 0, width, middle)  # (left, upper, right, lower)
    else:
        box = (0, middle, width, height)  # (left, upper, right, lower)
    im.crop(box).save("halph_page_file.png", "PNG")

    half = Image.open("halph_page_file.png")
    # Read the pixel data now: the next call overwrites the same file, and a
    # lazily opened image would otherwise still depend on it.
    half.load()
    return half
###############################################################################
def get_image_data(filename, page):
    """OCR the table found in one half of a page image.

    Opens `filename`, takes the half selected by `page` via cut_half_page(),
    detects horizontal/vertical lines, derives rows, columns and cells from
    them (writing each count to stderr), OCRs the "NAME" field via
    get_index() and then OCRs every cell.

    Returns a tuple ``(data, label)`` where ``data[row][col]`` is the OCRed
    cell text and ``label`` is ``"NAME "`` followed by the get_index() result.
    """
    half = cut_half_page(Image.open(filename), page)
    pixels = half.load()
    half_w, half_h = half.size

    hlines = get_hlines(pixels, half_w, half_h)
    sys.stderr.write("%s: hlines: %d\n" % (filename, len(hlines)))
    vlines = get_vlines(pixels, half_w, half_h)
    sys.stderr.write("%s: vlines: %d\n" % (filename, len(vlines)))

    rows = get_rows(hlines)
    sys.stderr.write("%s: rows: %d\n" % (filename, len(rows)))
    cols = get_cols(vlines)
    sys.stderr.write("%s: cols: %d\n" % (filename, len(cols)))

    cells = get_cells(rows, cols)

    # The field above the table is read before the table cells themselves.
    label = "NAME " + get_index(half, cells, "NAME")

    n_rows, n_cols = len(rows), len(cols)
    table = [
        [ocr_cell(half, cells, r, c) for c in range(n_cols)]
        for r in range(n_rows)
    ]
    return table, label
def split_pdf(filename):
    """Split a PDF into one PNG per page inside PATH_TEMP.

    Runs ImageMagick ``convert -density 600`` on `filename`, writing
    ``PATH_TEMP/<name>-<n>.png`` where ``<name>`` is the PDF's file name
    without directory or extension.

    Returns the list of generated PNG paths ordered by page number ``<n>``.
    """
    # Only the bare file name goes into the output name; keeping a directory
    # part would point the output (and the lookup below) outside PATH_TEMP.
    base = os.path.splitext(os.path.basename(filename))[0]
    out_pattern = os.path.join(PATH_TEMP, base + "-%d.png")

    # Argument list, no shell: file names with spaces or shell
    # metacharacters are passed through untouched.
    subprocess.call(["convert", "-density", "600", filename, out_pattern])

    head, tail = base + "-", ".png"
    pages = []
    for name in os.listdir(PATH_TEMP):
        if not (name.startswith(head) and name.endswith(tail)):
            continue
        number = name[len(head):-len(tail)]
        if number.isdigit():
            pages.append((int(number), os.path.join(PATH_TEMP, name)))
    # Sort numerically so page 10 follows page 9 rather than page 1.
    pages.sort()
    return [path for _, path in pages]
def _remove_files(pattern):
    """Delete every regular file matching the glob `pattern`."""
    for path in glob.glob(pattern):
        if os.path.isfile(path):
            os.remove(path)


def extract_pdf(filename, f):
    """Extract table data from every page of a PDF.

    Each page image produced by split_pdf() is processed twice, once for the
    top half (page=0) and once for the bottom half (page=1).  For each half
    the label returned by get_image_data() is written to `f` on its own line
    and printed, followed by the cells of every row except the first; cells
    are written tab-terminated with no newline between rows.

    Args:
        filename: path of the PDF to process.
        f: writable file-like object receiving the report text.

    Returns:
        A list of all rows (including each half's first row) in processing
        order.

    Side effects: removes the ``*.tif`` / ``*.txt`` files in PATH_TEMP after
    each half, and every file in PATH_TEMP once all pages are done.
    """
    pngfiles = split_pdf(filename)
    sys.stderr.write("Pages: %d\n" % len(pngfiles))

    # extract table data from each page
    data = []
    for pngfile in pngfiles:
        for page in (0, 1):
            pngdata, index = get_image_data(pngfile, page=page)
            f.write("\n" + index + "\n")
            print(index + "\n")
            for row_number, row in enumerate(pngdata):
                data.append(row)
                if row_number == 0:
                    # first line should be removed from the written report
                    continue
                f.write("".join(cell + "\t" for cell in row))
            # remove temp files for this half page
            _remove_files(PATH_TEMP + '/*.tif')
            _remove_files(PATH_TEMP + '/*.txt')

    # remove split pages (only after all of them have been processed)
    _remove_files(PATH_TEMP + '/*')
    return data
###############################################################################
###############################################################################
# Working directories.  Defined at module level (not only when run as a
# script) because ocr_cell(), get_index(), split_pdf() and extract_pdf()
# read PATH_TEMP as a global.
PATH_TEMP = '/home/sskriblo/s21-ocr_work/temp'
PATH_WORK = '/home/sskriblo/s21-ocr_work'

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: ctocr.py FILENAME")
        sys.exit(1)

    start_time = time.time()
    print(start_time)

    # `with` closes the report file even if extraction raises.
    with open(PATH_WORK + '/s21-ocr.txt', 'w') as f:
        # split target pdf into pages and OCR them
        filename = sys.argv[1]
        data = extract_pdf(filename, f)

        # s21_report(data)
        for row in data:
            print("\t".join(row))

        stop_time = time.time()
        print(stop_time)
        print("Total Time (seconds) =  %s" % (stop_time - start_time))