/
get_paper_content.py
103 lines (97 loc) · 3.24 KB
/
get_paper_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
# -*- coding:utf8 -*-
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
import os
def get_paper_content(fname, pages=2, outdir="data"):
    """Render the leading pages of a PDF file to an HTML file via pdfminer.

    The output is written to ``<outdir>/<name>.html`` where ``<name>`` is the
    base name of ``fname`` with every ``".pdf"`` substring removed.

    Args:
        fname: Path of the PDF file to read.
        pages: Highest zero-based page index to convert. Pages with index
            ``0 .. pages`` (inclusive) are processed, i.e. up to
            ``pages + 1`` pages.
            NOTE(review): this looks like an off-by-one if ``pages`` is meant
            to be a page *count*; behaviour is kept as-is so existing output
            does not change -- confirm the intent.
        outdir: Directory receiving the HTML file; created if missing.

    Returns:
        None. The result is the HTML file written to disk.

    Errors raised while pages are being read or converted are not
    propagated: the file name is printed and the function returns, so a batch
    run can continue with the next file. Errors opening the input or output
    file do propagate.
    """
    debug = 0
    # Input options passed to PDFPage.get_pages().
    password = ''
    pagenos = set()
    maxpages = 0
    caching = True
    # Output options passed to HTMLConverter.
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()

    basename = os.path.basename(fname).replace(".pdf", "")
    outfile = os.path.join(outdir, basename + ".html")
    if outdir and not os.path.isdir(outdir):
        os.makedirs(outdir)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # open() replaces the Python-2-only file() builtin. The output is opened
    # in binary mode because the converter is given a codec and is expected
    # to emit already-encoded bytes. The input is opened first so a missing
    # PDF does not leave an empty output file behind.
    with open(fname, 'rb') as fp, open(outfile, 'wb') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=None, debug=debug)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # NOTE(review): kept from the original; this turns the
            # interpreter's debug flag on although ``debug`` above is 0.
            interpreter.debug = True
            try:
                page_iter = PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True)
                for index, page in enumerate(page_iter):
                    if index > pages:
                        break
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
            except Exception:
                # Best effort: report the offending file and give up on it.
                # Unlike a bare ``except:``, Ctrl-C and SystemExit still
                # propagate.
                print(fname)
        finally:
            # Close the converter on every path, including the error path,
            # before the ``with`` block closes the underlying files.
            device.close()
    return
if __name__ == '__main__':
    # Batch driver: convert every "*.pdf" file found directly inside a
    # directory.
    #
    # Usage: get_paper_content.py [pdf_dir] [pages]
    #   pdf_dir  directory to scan (default: D:\paper)
    #   pages    page limit forwarded to get_paper_content (default: 5)
    # Output goes to "data_p<pages>" in the current working directory.
    import fnmatch

    fdir = sys.argv[1] if len(sys.argv) > 1 else r"D:\paper"
    pages = int(sys.argv[2]) if len(sys.argv) > 2 else 5
    outdir = "data_p" + str(pages)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    for fname in os.listdir(fdir):
        if fnmatch.fnmatch(fname, "*.pdf"):
            get_paper_content(os.path.join(fdir, fname), pages=pages,
                              outdir=outdir)