/
pdf_text_extractor.py
134 lines (121 loc) · 6.36 KB
/
pdf_text_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import subprocess
import os
from PIL import Image, ImageEnhance
from tika import parser
import numpy as np
from skimage.filters import threshold_adaptive
from threading import Thread
class DetectImagePdf(object):
    """Turn PDFs into searchable plain text.

    For every PDF the pipeline:

    1. extracts the embedded text with Tika,
    2. strips that text from the PDF with ``PdfTextDeleter.jar``,
    3. rasterises what is left with Ghostscript,
    4. binarises the page images,
    5. runs Tesseract on the binarised images, and
    6. appends the OCR output to the text extracted in step 1.

    Instantiating the class reads ``config.txt`` (located next to this
    module) and starts one worker thread per discovered file.

    The code deliberately sticks to constructs that behave the same on
    Python 2 and Python 3.
    """

    # Environment handed to the Ghostscript and Tesseract subprocesses.
    # It contains nothing but this PATH entry.
    TOOL_ENV = {'PATH': '/opt/local/bin/'}

    # Files whose name contains this marker are ignored everywhere.
    SKIP_MARKER = ".DS_Store"

    def __init__(self):
        """Read the configuration and start one worker thread per file.

        ``config.txt`` must hold two ``key=value`` lines: the first names the
        directory that is scanned for input files, the second the output
        directory handed to :meth:`makeSearchable`.

        Raises:
            IOError/OSError: if ``config.txt`` cannot be opened.
            IndexError: if one of the two lines contains no ``=``.
        """
        module_dir = os.path.dirname(os.path.realpath(__file__))
        # The context manager guarantees the handle is released even when a
        # line turns out to be malformed.
        with open(os.path.join(module_dir, 'config.txt'), 'r') as config:
            fileDirectory = self._config_value(config.readline())
            outputDirectory = self._config_value(config.readline())

        for _dirpath, _dirs, files in os.walk(fileDirectory):
            for src in files:
                if self.SKIP_MARKER in src:
                    continue
                # Only the bare file name is passed on; makeSearchable looks
                # the file up below <outputDirectory>/examplePDFs/rawPdfs.
                worker = Thread(target=self.makeSearchable,
                                args=(src, outputDirectory))
                worker.start()

    @staticmethod
    def _config_value(line):
        """Return the value part of a ``key=value`` configuration line.

        Only the first ``=`` separates key and value, so values may contain
        ``=`` themselves. Surrounding whitespace, including a trailing
        ``\\r\\n``, is removed.
        """
        return line.split('=', 1)[1].strip()

    @staticmethod
    def _stem(name):
        """Return *name* without its last extension (no error if it has none)."""
        return os.path.splitext(name)[0]

    @staticmethod
    def _ensure_dir(path):
        """Create *path* (and parents) unless it already is a directory.

        Several worker threads create the same directories concurrently, so
        a plain "check, then create" sequence can fail when another thread
        wins the race. Creating first and tolerating an existing directory
        avoids that.
        """
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    @staticmethod
    def _run(command, env=None, capture_stdout=False):
        """Run *command* to completion and return its exit status.

        With ``capture_stdout`` the child's standard output is read and
        discarded. ``communicate()`` is used instead of ``wait()`` because
        waiting on a child whose stdout pipe is never drained can deadlock.
        A non-zero exit status is reported but does not abort the pipeline.
        """
        stdout = subprocess.PIPE if capture_stdout else None
        process = subprocess.Popen(command, env=env, stdout=stdout)
        process.communicate()
        if process.returncode != 0:
            print("Warning: '%s' exited with status %s"
                  % (command[0], process.returncode))
        return process.returncode

    def makeSearchable(self, src, subdir):
        """Run the complete extraction pipeline for one PDF.

        Args:
            src: file name of the PDF, expected in
                ``<subdir>/examplePDFs/rawPdfs``.
            subdir: output base directory; all working directories are
                created below ``<subdir>/examplePDFs``.
        """
        rootDir = subdir + "/examplePDFs"
        pdfPath = os.path.join(rootDir, "rawPdfs")
        finishedTextPath = os.path.join(rootDir, "finishedText")
        removed_text_path = os.path.join(rootDir, "removedText")
        gsPath = os.path.join(rootDir, "gsPdfs")
        imagesProcessedPath = os.path.join(rootDir, "imagesProcessed")
        imageText = os.path.join(rootDir, "imageText")

        filename = self._stem(src)
        per_file_dir = filename + "-imgs"

        for directory in (pdfPath, finishedTextPath, removed_text_path,
                          gsPath, imagesProcessedPath, imageText,
                          os.path.join(gsPath, per_file_dir),
                          os.path.join(imagesProcessedPath, per_file_dir),
                          os.path.join(imageText, per_file_dir)):
            self._ensure_dir(directory)

        print("\n**********************")
        print("Processing file: " + filename)
        print("**********************\n")

        # Extract easy text
        print("Getting text that can be easily extracted...")
        source_pdf = os.path.join(pdfPath, src)
        rawText = parser.from_file(source_pdf)
        if rawText["content"] is None:
            print("Found no text to extract, continuing process")
        else:
            # Binary mode: the content is encoded explicitly, and bytes can
            # only be written to a binary handle on Python 3.
            text_path = os.path.join(finishedTextPath, filename + ".txt")
            with open(text_path, 'wb') as fileOutput:
                fileOutput.write(rawText["content"].encode("utf-8"))

        # Remove text from pdf
        print("Removing text from pdf")
        # The input is the same file Tika just read. The jar itself is still
        # resolved relative to the current working directory.
        self._run(['java', '-jar', 'PdfTextDeleter.jar', source_pdf,
                   os.path.join(removed_text_path, src)])

        # Apply ghostscript to removed text pdfs
        print("Converting left over pdf to images")
        page_pattern = os.path.join(gsPath, per_file_dir,
                                    filename + "-%03d" ".png")
        self._run(["gs", "-dNOPAUSE",
                   "-sFONTPATH=/opt/local/share/ghostscript/9.16/Resource/Font/",
                   "-sDEVICE=pngalpha", "-r300", "-dBATCH",
                   "-sOutputFile=" + page_pattern,
                   os.path.join(removed_text_path, src)],
                  env=dict(self.TOOL_ENV))

        self.preprocessImages(rootDir, subdir, src)
        self.applyOCRToImages(rootDir, subdir, src)
        self.mergeTextFiles(rootDir, subdir, src)

    def preprocessImages(self, rootDir, subdir, srcFile):
        """Binarise the rendered page images of one PDF for OCR.

        Reads every image below ``<rootDir>/gsPdfs/<stem>-imgs`` and writes
        the thresholded result as ``binary-<image name>`` into
        ``<rootDir>/imagesProcessed/<stem>-imgs``, which is the directory
        :meth:`applyOCRToImages` scans.

        Args:
            rootDir: pipeline working directory.
            subdir: unused; kept for interface compatibility.
            srcFile: file name of the PDF the images belong to.
        """
        rootfilename = self._stem(srcFile)
        rendered_dir = os.path.join(rootDir, "gsPdfs", rootfilename + "-imgs")
        processed_dir = os.path.join(rootDir, "imagesProcessed",
                                     rootfilename + "-imgs")
        self._ensure_dir(processed_dir)

        block_size = 7
        for dirpath, _dirs, files in os.walk(rendered_dir):
            for image_name in sorted(files):
                if self.SKIP_MARKER in image_name:
                    continue
                print("Processing image")
                grey = Image.open(os.path.join(dirpath, image_name)).convert('L')
                pixels = np.asarray(grey)
                # NOTE(review): threshold_adaptive only exists in older
                # scikit-image releases - confirm the pinned version.
                binary_adaptive = threshold_adaptive(pixels, block_size,
                                                     method='gaussian',
                                                     offset=-35, param=37)
                # The thresholded result must be written out, otherwise the
                # OCR stage finds no input. Map the mask to 0/255 so it can
                # be stored as an 8-bit greyscale image.
                # NOTE(review): assumes threshold_adaptive returns a boolean
                # mask - confirm against the installed version.
                mask = np.where(binary_adaptive, 255, 0).astype(np.uint8)
                Image.fromarray(mask).save(
                    os.path.join(processed_dir, "binary-" + image_name))

    def applyOCRToImages(self, rootDir, subdir, src):
        """Run Tesseract on every preprocessed image of one PDF.

        Images are read from ``<rootDir>/imagesProcessed/<stem>-imgs``.
        Each call passes ``<rootDir>/imageText/<stem>-imgs/<image stem>`` as
        the Tesseract output base and requests English (``-l eng``).

        Args:
            rootDir: pipeline working directory.
            subdir: unused; kept for interface compatibility.
            src: file name of the PDF the images belong to.
        """
        rootfilename = self._stem(src)
        processed_dir = os.path.join(rootDir, "imagesProcessed",
                                     rootfilename + "-imgs")
        text_dir = os.path.join(rootDir, "imageText", rootfilename + "-imgs")
        self._ensure_dir(text_dir)

        for dirpath, _dirs, files in os.walk(processed_dir):
            for image_name in sorted(files):
                if self.SKIP_MARKER in image_name:
                    continue
                print("Extract text from image using tesseract")
                self._run(["tesseract",
                           os.path.join(dirpath, image_name),
                           os.path.join(text_dir, self._stem(image_name)),
                           "-l", "eng"],
                          env=dict(self.TOOL_ENV), capture_stdout=True)

    def mergeTextFiles(self, rootDir, subdir, src):
        """Append the OCR text of one PDF to its extracted text file.

        Every file below ``<rootDir>/imageText/<stem>-imgs`` is appended to
        ``<rootDir>/finishedText/<stem>.txt`` in sorted name order, so the
        result is deterministic. The data is copied as raw bytes, so no
        decoding or re-encoding happens during the merge.

        Args:
            rootDir: pipeline working directory.
            subdir: unused; kept for interface compatibility.
            src: file name of the PDF whose text files are merged.
        """
        rootfilename = self._stem(src)
        text_dir = os.path.join(rootDir, "imageText", rootfilename + "-imgs")
        merged_path = os.path.join(rootDir, "finishedText",
                                   rootfilename + ".txt")

        for dirpath, _dirs, files in os.walk(text_dir):
            for text_name in sorted(files):
                if self.SKIP_MARKER in text_name:
                    continue
                print("Adding extracted image text to searchable text")
                with open(os.path.join(dirpath, text_name), 'rb') as imageTextFile:
                    page_text = imageTextFile.read()
                # Opened per page so that nothing is created when there is
                # no OCR output at all.
                with open(merged_path, 'ab') as unfinishedTextFile:
                    unfinishedTextFile.write(page_text)

    def desaturateImage(self, rootDir, subdir, src):
        """Save a colour-free copy of an image to ``<rootDir>/grayscale``.

        Files whose name contains ``.ccitt``, ``.params`` or ``.jb2e`` are
        skipped.

        Args:
            rootDir: pipeline working directory; receives the result.
            subdir: directory that contains the source image.
            src: file name of the image.
        """
        if ".ccitt" in src or ".params" in src or ".jb2e" in src:
            return
        img = Image.open(subdir + "/" + src)
        # Color factor 0 removes all colour. The contrast and sharpness
        # passes use factor 1, which per the Pillow docs returns the image
        # unchanged; the calls are kept as the place to tune later.
        colourless = ImageEnhance.Color(img).enhance(0)
        contrasted = ImageEnhance.Contrast(colourless).enhance(1)
        sharpened = ImageEnhance.Sharpness(contrasted.convert('RGB')).enhance(1)
        target_dir = os.path.join(rootDir, "grayscale")
        # Nothing else creates this directory, so make sure it exists.
        self._ensure_dir(target_dir)
        sharpened.save(os.path.join(target_dir, src))
# Module-level entry point: constructing the class runs __init__, which reads
# config.txt and starts one worker thread per discovered file. This happens
# whenever the module is executed or imported.
c1 = DetectImagePdf()