-
Notifications
You must be signed in to change notification settings - Fork 0
/
PDFScrape.py
executable file
·777 lines (650 loc) · 23.7 KB
/
PDFScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('Agg')
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from operator import itemgetter
import pdfminer
import re
import matplotlib.pyplot as plt
from matplotlib import patches
import argparse
import os
import shutil
import sys
import time
def extract_layout_by_page(pdf_path):
    """
    Extracts LTPage objects from a pdf file.

    pdf_path: path of the pdf file to read.

    Returns a list holding one layout object per page, in the order
    PDFPage.create_pages() yields the pages.

    Raises PDFTextExtractionNotAllowed when the document reports that it
    is not extractable.
    """
    laparams = LAParams()
    layouts = []
    # The parser is built on top of fp, so every page is processed while the
    # file is still open; the `with` block closes it even when parsing fails.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())
    return layouts
# Layout container classes that extract_characters() iterates into while
# looking for individual LTChar objects.
TEXT_ELEMENTS = [
    pdfminer.layout.LTTextBox, pdfminer.layout.LTTextBoxHorizontal,
    pdfminer.layout.LTTextLine, pdfminer.layout.LTTextLineHorizontal,
]
def flatten(lst):
    """Joins the sub-sequences of lst into one flat list (one level deep)."""
    flat = []
    for elem in lst:
        flat.extend(elem)
    return flat
def extract_characters(element):
    """
    Collects every LTChar found inside element, depth first.

    element may be an LTChar (returned as a one-item list), one of the
    TEXT_ELEMENTS containers, or a plain list of such objects. Anything
    else contributes no characters.
    """
    if isinstance(element, pdfminer.layout.LTChar):
        return [element]
    is_text_container = isinstance(element, tuple(TEXT_ELEMENTS))
    if is_text_container or isinstance(element, list):
        chars = []
        for child in element:
            chars.extend(extract_characters(child))
        return chars
    return []
def draw_rect_bbox(coordinates, ax, color):
    """
    Adds an unfilled rectangle to ax.

    coordinates: (x0, y0, x1, y1) corners of the rectangle.
    ax: object whose add_patch() receives the rectangle patch.
    color: colour passed on to the patch.
    """
    left, bottom, right, top = coordinates
    outline = patches.Rectangle(
        (left, bottom),
        right - left,
        top - bottom,
        fill=False,
        color=color
    )
    ax.add_patch(outline)
def print_text(x0, y0, plt, num):
    """Writes num at position (x0, y0) through plt.text() with font size 6."""
    text_options = {"fontsize": 6}
    plt.text(x0, y0, num, **text_options)
def draw_rect(rect, ax, color="black"):
    """Outlines rect.bbox on ax via draw_rect_bbox(); the colour defaults to black."""
    bbox = rect.bbox
    draw_rect_bbox(bbox, ax, color)
# Module-level state.
# Running bounding box read and updated by minmax_cal() / is_inside().
# The minimum starts high and the maximum at zero so the first box wins.
tminx, tminy = 10000, 10000
tmaxx, tmaxy = 0, 0
# Output locations: page images, csv tables and json tables.
results_dir, csv_dir, json_dir = "results", "csv_files", "./json"
# Log file; it is opened as soon as the module is loaded.
f1 = open('./output.txt', 'w')
def minmax_cal(rt):
    """
    Widens the module-level bounding box (tminx, tminy, tmaxx, tmaxy) so
    that it also covers rt.

    rt: any object exposing x0, y0, x1 and y1.
    """
    global tminx, tminy, tmaxx, tmaxy
    tminx = min(tminx, rt.x0)
    tminy = min(tminy, rt.y0)
    tmaxx = max(tmaxx, rt.x1)
    tmaxy = max(tmaxy, rt.y1)
def is_inside(rt):
    """
    Tells whether rt touches the module-level bounding box.

    True when rt lies completely inside (tminx, tminy, tmaxx, tmaxy), or when
    its (x0, y0) corner or its (x1, y1) corner falls inside that box.
    """
    global tminx, tminy, tmaxx, tmaxy

    def corner_in_bounds(x, y):
        return x >= tminx and y >= tminy and x <= tmaxx and y <= tmaxy

    fully_contained = (rt.x0 >= tminx and rt.y0 >= tminy
                       and rt.x1 <= tmaxx and rt.y1 <= tmaxy)
    if fully_contained:
        return True
    if corner_in_bounds(rt.x0, rt.y0):
        return True
    return corner_in_bounds(rt.x1, rt.y1)
def is_inside_box(boxx, rt):
    """
    Tells whether rt touches the box boxx = (x0, y0, x1, y1).

    True when rt lies completely inside boxx, or when its (x0, y0) corner or
    its (x1, y1) corner falls inside boxx. A rectangle that only crosses the
    box without one of those two corners inside yields False.
    """
    left, bottom, right, top = boxx[0], boxx[1], boxx[2], boxx[3]

    def corner_in_box(x, y):
        return x >= left and y >= bottom and x <= right and y <= top

    fully_contained = (rt.x0 >= left and rt.y0 >= bottom
                       and rt.x1 <= right and rt.y1 <= top)
    if fully_contained:
        return True
    if corner_in_box(rt.x0, rt.y0):
        return True
    return corner_in_box(rt.x1, rt.y1)
def merge_bbox(coordinates1, coordinates2):
    """Returns the smallest (x0, y0, x1, y1) box that encloses both input boxes."""
    (ax0, ay0, ax1, ay1) = coordinates1
    (bx0, by0, bx1, by1) = coordinates2
    left = min(ax0, bx0)
    bottom = min(ay0, by0)
    right = max(ax1, bx1)
    top = max(ay1, by1)
    return left, bottom, right, top
# A "numeric" line: at least one digit, and nothing but digits, whitespace,
# number punctuation, the letters m/b/n and the em dash.
# The backslashes are doubled so the literal holds no invalid escape
# sequences; the resulting pattern text is the same as before.
_NUMERIC_WORD_RE = re.compile(u"^(?=.*\\d)[0-9\\s,.(){}\\[\\]\\-%$mbn\\/\\—]+$")
# A line made only of whitespace and/or em dashes (counts as numeric too).
_BLANK_OR_DASH_RE = re.compile(u"^[\\s—]+$")


def is_number_box(bx):
    """
    Decides whether a text box holds mostly numeric lines.

    bx: object exposing get_text(), x0 and x1.

    Returns False for empty text and for boxes wider than 200 units.
    Otherwise each non-empty line of the text is classified as numeric or
    not, and the result is True when the numeric lines are at least as many
    as the others (a tie counts as numeric).
    """
    strall = bx.get_text()
    if len(strall) < 1:
        return False
    if bx.x1 - bx.x0 > 200:  # some long length boxes for bullet no etc.
        return False
    truecount, falsecount = 0, 0
    for word in strall.split('\n'):
        if not word:
            continue
        if _NUMERIC_WORD_RE.match(word) or _BLANK_OR_DASH_RE.match(word):
            truecount += 1
        else:
            falsecount += 1
    return truecount >= falsecount
def jsonWrite(inputAr, colCount, filename):
    """
    Writes the data rows of a '|'-separated table to filename, one
    json-like line per row.

    inputAr: list of row strings whose cells are separated by '|'.
    colCount: number of columns; sizes the list of header texts.
    filename: path of the file to (over)write.

    Row handling:
    - A row whose first cell is empty is a header row: its other cells are
      appended (after a space) to the header text of their column and the
      row itself is not written.
    - A row whose LAST cell is empty is treated as an empty row: it is not
      written and its first cell is prefixed to the first cell of the next
      row.
    - Every other row is written as
      "<first cell>" :{ "<header>" : "<cell>" , ... }

    NOTE(review): only the last cell decides whether a row is "empty", and
    the lines are not strict JSON (trailing comma, no escaping). Both are
    kept as they were because consumers of the files are not visible here.
    """
    tbheaders = [""] * colCount
    isEmptyRow = False
    col1Text = ""
    addPreviousLeftCol = False
    prevCol1Text = ""
    # `with` closes the file even if a row raises (e.g. more cells than colCount).
    with open(filename, 'w') as out:
        for lineStr in inputAr:
            isHeader = True
            jsonLine = ""
            for j, word in enumerate(lineStr.split("|")):
                if j == 0:
                    # An empty first cell marks a header row.
                    isHeader = len(word) == 0
                    if not isHeader:
                        col1Text = word
                    if addPreviousLeftCol:
                        # Carry over the label of the preceding empty row.
                        word = prevCol1Text + " " + word
                        addPreviousLeftCol = False
                    jsonLine = "\"" + word + "\" :{ "
                    continue
                if len(word) > 0:
                    isEmptyRow = False
                    col1Text = ""
                else:
                    isEmptyRow = True
                if isHeader:
                    tbheaders[j] += " " + word
                else:
                    jsonLine += "\"" + tbheaders[j] + "\" : \"" + word + "\" , "
            jsonLine += "}"
            if isEmptyRow:
                addPreviousLeftCol = True
                prevCol1Text = col1Text
            if (not isHeader) and (not isEmptyRow):
                out.write(jsonLine + "\n")
def sort2rows(tbltexts, pageno, tbleno):
    """
    Arranges the text boxes of one table into columns and rows and writes
    the table as a '|'-separated csv file plus a json file.

    tbltexts: list of text box layout objects; their x0, x1, y0 and
        get_text() are read here and their characters are collected with
        extract_characters().
    pageno, tbleno: only used to build the output names
        <csv_dir>/page<pageno>-<tbleno>.csv and
        <json_dir>/page<pageno>-<tbleno>.json.

    Returns an html snippet linking to both files, or "" when tbltexts is
    empty (the caller concatenates the result onto a string).
    """
    if not tbltexts:
        return ""

    # ---- 1. split the boxes into columns, left to right -----------------
    # The first column ends at the right edge of the left-most box.
    col1x1 = min(tbltexts, key=lambda tb: tb.x0).x1
    newlist = tbltexts[:]
    tabledata = []      # per column: its non-empty text lines, top to bottom
    tablerowpos = []    # per column: rounded y of each detected row start
    while newlist:
        leftcol = []
        rightcol = []
        rightmin = 999
        for tx in newlist:
            if tx.x0 > col1x1:
                # smallest right edge among the remaining boxes becomes the
                # boundary of the next column
                if tx.x1 < rightmin:
                    rightmin = tx.x1
                rightcol.append(tx)
            else:
                leftcol.append(tx)
        col1x1 = rightmin
        newlist = rightcol

        # top of the page has the largest y, so sort descending
        slist = sorted(leftcol, key=lambda box: box.y0, reverse=True)
        rowlst = []
        for line in slist:
            for word in line.get_text().split('\n'):
                if len(word) > 0:
                    rowlst.append(word)
        tabledata.append(rowlst)

        # a new row starts whenever a character sits more than 4 units
        # below the start of the current row
        rowpos = []
        lineY = 1000
        for ch in extract_characters(slist):
            y = int(round(ch.y0))
            if lineY - y > 4:
                rowpos.append(y)
                lineY = y
        tablerowpos.append(rowpos)
    noofcols = len(tabledata)

    # ---- 2. emit the rows top to bottom ---------------------------------
    filename = csv_dir + '/page' + str(pageno) + '-' + str(tbleno) + '.csv'
    tablejsoninput = []
    with open(filename, 'w') as f2:
        # Loop while any column still has a pending row position; checking
        # first avoids max() on an empty sequence when no characters exist.
        while any(tablerowpos):
            toprow = max(rowy[0] for rowy in tablerowpos if rowy)
            rowstr = ""
            for x in range(noofcols):
                if tablerowpos[x] and (toprow - tablerowpos[x][0] < 4):
                    tablerowpos[x].pop(0)
                    try:
                        valstr = tabledata[x].pop(0)
                    except IndexError:
                        # more row positions than text lines in this column;
                        # emit an empty cell so the columns stay aligned
                        print("Fix this!")
                        valstr = ""
                    if not isinstance(valstr, str):
                        # Python 2 unicode text: encode before joining with str
                        valstr = valstr.encode('utf-8')
                    rowstr += valstr + "|"
                else:
                    rowstr += "|"
            f2.write(rowstr + "\n")
            tablejsoninput.append(rowstr[:-1])  # remove last |

    # ---- 3. json export and links ---------------------------------------
    jsonfilename = json_dir + '/page' + str(pageno) + '-' + str(tbleno) + '.json'
    jsonWrite(tablejsoninput, noofcols, jsonfilename)
    return ' <a href="' + filename + '">csv</a> and ' + '<a href="' + jsonfilename + '">json</a> '
def find_table_in_page(page, pno):
    """
    Looks for tables on one page layout, exports each one through
    sort2rows() and saves a preview image of the page.

    Steps:
    1. split the horizontal text boxes into numeric boxes (is_number_box)
       and other text boxes;
    2. merge vertically aligned numeric boxes into column boxes;
    3. merge narrow text boxes sitting on those columns (headers) into them;
    4. merge columns that share a vertical range into table boxes;
    5. extend each table to the left over the text boxes beside it;
    6. export every table larger than 20x20 and draw the preview.

    page: page layout object; it is iterated and its bbox is read.
    pno: page number, used in log lines and output file names.

    Side effects: resets the module-level tminx/tminy/tmaxx/tmaxy, writes to
    the module-level log file f1 and index file fi, and creates files under
    results_dir (plus whatever sort2rows() writes).

    Returns the html line written to the index file, or "" when the page
    has no table or the preview image could not be saved.
    """
    global tminx, tminy, tmaxx, tmaxy
    tminx, tminy, tmaxx, tmaxy = 1000, 1000, 0, 0
    xmin, ymin, xmax, ymax = page.bbox
    size = 12
    print(xmin, ymin, xmax, ymax)
    f1.write("page no " + str(pno) + " *************************\r\n")
    f1.write("page size " + str(page.bbox) + "\r\n")
    isLandscape = xmax > ymax
    if isLandscape:
        print("landscape page.")

    # ---- 1. separate numeric text boxes from the other text boxes -------
    # Rectangles and lines on the page are not used for the detection.
    numtexts = []
    textstr = []
    for e in page:
        if not isinstance(e, pdfminer.layout.LTTextBoxHorizontal):
            continue
        if is_number_box(e):
            numtexts.append(e)
            boxtext = e.get_text()
            if not isinstance(boxtext, str):
                # Python 2 unicode text: encode before joining with str
                boxtext = boxtext.encode('utf-8')
            f1.write("num box: " + str(e.bbox) + ": " + boxtext + "\r\n")
        else:
            textstr.append(e)
    alltexts = numtexts + textstr

    # ---- 2. merge number boxes into columns ------------------------------
    # Repeated passes: each pass merges a box with the first later box that
    # is horizontally aligned with it and less than 30 units away
    # vertically; the passes stop once one of them merges nothing.
    numtextboxes = [nbs.bbox for nbs in numtexts]
    colboxlist = []
    colm = []
    i = 0
    while len(numtextboxes) > 0:
        if i == 0:
            initLen = len(numtextboxes)
        nb = numtextboxes[i]
        j = i + 1
        doNext = True
        while doNext and j < len(numtextboxes):
            nb2 = numtextboxes[j]
            # check same vertical line
            midx = (nb2[0] + nb2[2]) / 2
            midx2 = (nb[0] + nb[2]) / 2
            if (midx > nb[0] and midx < nb[2]) or (midx2 > nb2[0] and midx2 < nb2[2]):
                # check gap is close to merge
                if abs(nb[1] - nb2[3]) < 30 or abs(nb2[1] - nb[3]) < 30:
                    colm.append(merge_bbox(nb, nb2))
                    # remove the merged box from the list
                    del numtextboxes[j]
                    doNext = False
            j += 1
            if j >= len(numtextboxes):
                break
        if doNext:
            # no merge happened for this box
            colm.append(nb)
        i += 1
        if i >= len(numtextboxes):
            # a pass without any merge means we are done
            if initLen == len(colm):
                colboxlist = colm[:]
                break
            numtextboxes = colm[:]
            del colm[:]
            i = 0
    if not colboxlist:
        # no numeric boxes on this page, hence no table
        return ""

    # ---- 3. merge header text boxes into the columns ---------------------
    colboxlistCpy = colboxlist[:]
    textboxrects = [ts.bbox for ts in textstr]
    colboxlistfinal = []
    del colm[:]
    i = 0
    while len(colboxlistCpy) > 0:
        if i == 0:
            initLen = len(textboxrects)
        nb = colboxlistCpy[i]
        # NOTE(review): the scan of the text boxes starts at index i + 1,
        # so text boxes at lower indexes are never tried for this column.
        j = i + 1
        doNext = True
        while doNext and j < len(textboxrects):
            nb2 = textboxrects[j]
            # check same vertical line
            midx = (nb2[0] + nb2[2]) / 2
            # get rid of large boxes: less than twice the width of the number box
            if nb2[2] - nb2[0] < (nb[2] - nb[0]) * 2:
                if midx > nb[0] and midx < nb[2]:
                    # check gap is close to merge
                    if abs(nb[1] - nb2[3]) < 30 or abs(nb2[1] - nb[3]) < 30:
                        colm.append(merge_bbox(nb, nb2))
                        # remove the merged box from the list
                        del textboxrects[j]
                        doNext = False
            j += 1
            if j >= len(textboxrects):
                break
        if doNext:
            # no merge happened for this column
            colm.append(nb)
        i += 1
        if i >= len(colboxlistCpy):
            # a pass that consumed no text box means we are done
            if initLen == len(textboxrects):
                colboxlistfinal = colm[:]
                break
            colboxlistCpy = colm[:]
            del colm[:]
            i = 0

    # ---- 4. group column boxes into tables -------------------------------
    # Two boxes are merged when the vertical middle of the shorter one lies
    # inside the vertical range of the other and their horizontal gap is
    # below 100. At most one merge per pass; passes repeat until none occurs.
    colboxlistfinalCpy = colboxlistfinal[:]
    tmpTable = []
    while True:
        isMerge = False
        initLen = len(colboxlistfinalCpy)
        i = 0
        while i < len(colboxlistfinalCpy):
            bx = colboxlistfinalCpy[i]
            if isMerge:
                # after a merge simply copy the remaining boxes
                tmpTable.append(bx)
            else:
                j = i + 1
                while j < len(colboxlistfinalCpy):
                    bx2 = colboxlistfinalCpy[j]
                    leny = bx[3] - bx[1]
                    leny2 = bx2[3] - bx2[1]
                    diffx = max(bx[0], bx2[0]) - min(bx[2], bx2[2])
                    if leny < leny2:
                        midy = (bx[3] + bx[1]) / 2
                        if midy > bx2[1] and midy < bx2[3] and diffx < 100:
                            tmpTable.append(merge_bbox(bx, bx2))
                            del colboxlistfinalCpy[j]
                            isMerge = True
                    else:
                        midy = (bx2[3] + bx2[1]) / 2
                        if midy > bx[1] and midy < bx[3] and diffx < 100:
                            tmpTable.append(merge_bbox(bx, bx2))
                            del colboxlistfinalCpy[j]
                            isMerge = True
                    j += 1
                    if isMerge:
                        break
                    if j >= len(colboxlistfinalCpy):
                        break
                if not isMerge:
                    tmpTable.append(bx)
            i += 1
            if i >= len(colboxlistfinalCpy):
                isMerge = False
                break
        if initLen == len(colboxlistfinalCpy):
            break
        colboxlistfinalCpy = tmpTable[:]
        del tmpTable[:]

    # ---- 5. find the left-most column of each table ----------------------
    # So far only the numeric (right) side of each table is known; extend
    # the box to the left over text boxes whose vertical middle is inside it.
    tableCompleted = []
    for fx in colboxlistfinalCpy:
        tmpLeftBoxes = []
        for s in textstr:  # check if textbox is on the left side
            midxx = (s.y0 + s.y1) / 2
            if midxx >= fx[1] and midxx <= fx[3] and s.x1 < fx[0]:
                tmpLeftBoxes.append(s)
        # remove multiple left side boxes: of two boxes that do not overlap
        # horizontally, the left-most one is dropped
        tmpLeftBoxes2 = tmpLeftBoxes[:]
        for tb in tmpLeftBoxes:
            for tb2 in tmpLeftBoxes:
                if max(tb.x0, tb2.x0) > min(tb.x1, tb2.x1):
                    if tb.x0 < tb2.x0:
                        try:
                            tmpLeftBoxes2.remove(tb)
                        except ValueError:
                            print("Item rem err")
                    else:
                        try:
                            tmpLeftBoxes2.remove(tb2)
                        except ValueError:
                            print("Item rem err")
        if len(tmpLeftBoxes2) > 0:
            leftx = tmpLeftBoxes2[0].x0
            tableCompleted.append((leftx, fx[1], fx[2], fx[3]))
        else:
            tableCompleted.append(fx)

    # ---- 6. export the tables --------------------------------------------
    # NOTE(review): tableboxes is shared by all tables of the page, so the
    # boxes of earlier tables are passed to sort2rows() again for later
    # ones; and the table number continues from the counter `i` left by
    # step 4 instead of starting at a fixed value. Both are kept unchanged
    # because they determine the generated files.
    tableboxes = []
    tableCompletedRemovedSmall = []
    htmllink = ''
    for tb in tableCompleted:
        if tb[2] - tb[0] > 20 and tb[3] - tb[1] > 20:  # get rid of small boxes
            for et in alltexts:
                if is_inside_box(tb, et):
                    tableboxes.append(et)
            # sort to rows
            htmllink += sort2rows(tableboxes, pno, i)
            tableCompletedRemovedSmall.append(tb)
            i += 1
    # dont save image if no table
    if len(tableCompletedRemovedSmall) == 0:
        return ""

    # ---- draw the page preview -------------------------------------------
    # One figure only, sized for the page orientation.
    if isLandscape:
        figsize = (size * (xmax / ymax), size)
    else:
        figsize = (size, size * (ymax / xmax))
    fig, ax = plt.subplots(figsize=figsize)
    for fx in tableCompleted:
        draw_rect_bbox(fx, ax, 'red')
    alltable = numtexts + tableboxes
    for c in extract_characters(alltable):
        print_text(c.x0, c.y0, plt, c.get_text())
    plt.xlim(xmin, xmax)
    plt.ylim(ymin, ymax)
    htmlfulllink = ""  # stays empty when the image cannot be saved
    try:
        path = results_dir + '/page' + str(pno) + '.png'
        plt.savefig(path)
        htmlfulllink = 'Page no - ' + str(pno) + ' <a href="' + path + '">page</a> ' + htmllink
        fi.write(htmlfulllink + '<br>\n')
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
    fig.clf()
    plt.close(fig)
    return htmlfulllink
# Delete and re-create the images, csv and json folders next to this script
# so every run starts with empty output directories.
script_dir = os.path.dirname(__file__)
for folder_name in ('results', 'csv_files', 'json'):
    filename = os.path.join(script_dir, folder_name)
    if os.path.exists(filename):
        shutil.rmtree(filename)
        # Pause before re-creating the folder; presumably gives the
        # filesystem time to finish the removal -- TODO confirm it is needed.
        time.sleep(5)
    os.makedirs(filename)
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-f", "--filename", required=True,
    help="path to pdf file")
ap.add_argument("-p", "--pageno", required=False,
    help="page no")
args = vars(ap.parse_args())
example_file = args["filename"]#"simple2.pdf"
# NOTE(review): --pageno is parsed but nothing below uses it; every page of
# the document is processed.
page_no = args["pageno"]
try:
    page_layouts = extract_layout_by_page(example_file)
    print(len(page_layouts))
    filename = 'index.html'
    # `fi` has to be a module-level name: find_table_in_page() writes the
    # links of each page to it. The with-block closes it even on errors.
    with open(filename, 'w') as fi:
        # pages are numbered from 1
        for pno, page_layout in enumerate(page_layouts, 1):
            objects_on_page = set(type(o) for o in page_layout)
            print(objects_on_page)
            linktext = find_table_in_page(page_layout, pno)
finally:
    # the log file is opened at module load; close it however the run ends
    f1.close()
# current_page = page_layouts[32]
# objects_on_page = set(type(o) for o in current_page)
# print objects_on_page
# tablerects = find_table_in_page(current_page, 32)
# sort2rows(tablerects)