/
text_extractor.py
143 lines (115 loc) · 4.32 KB
/
text_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python #pylint: disable-msg=C0103
"""
Text Extraction class
"""
import os
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdftypes import PDFException
from appscript import k
from utilities import Config, dtpo_log
from dtpoexceptions import DTPOFileError
# appscript type keywords listed as the valid file types.  In the code
# visible here only k.PDF_Document is actually acted upon.
VALID_FILE_TYPES = (k.PDF_Document, k.html, k.rtf, k.txt)
class TextExtractor(object):
    """
    Extract the text of a source document into a list of lines.

    Only PDF documents are handled at present.  Construction does all of
    the work: the file type is looked up, the PDF is converted to a text
    file in the working directory and the lines of that text file are
    loaded into ``file_array``.

    Attributes:
        source_file: source directory + "/" + the file name passed in.
        text_file: working directory + "/" + the file name + ".txt"; the
            converted text is written here.
        file_array: the extracted lines, each still carrying its line
            ending.
        status: initialised to False; nothing in this class updates it.
        file_type, mime_type: the pair returned by get_file_type().
    """

    def __init__(self,
                 source_file,
                 test_parse=False,
                 source_directory=None,
                 working_directory=None,
                 testing=False):
        """
        Work out the file locations and extract the text straight away.

        source_directory and working_directory are generally for test
        purposes; when either is None it is read from Config.config.

        Args:
            source_file: name of the file, relative to the source directory.
            test_parse: passed on to parse_pdf(); when true the intermediate
                text file is kept instead of being removed.
            source_directory: directory holding source_file.
            working_directory: directory in which the text file is written.
            testing: when true the Config.logger / Config.config sanity
                checks are skipped.

        Raises:
            ValueError: the file type is not a PDF document.
            DTPOFileError: the text file could not be created, or the PDF
                could not be opened or parsed.
        """
        if not testing:
            # Outside of tests the shared logger and configuration are
            # expected to have been set up already.
            assert Config.logger
            assert Config.config

        dtpo_log('debug', 'TextExtractor -> %s', source_file)

        if source_directory is None:
            source_directory = Config.config.get_source_directory()
        if working_directory is None:
            working_directory = Config.config.get_working_directory()

        self.source_file = source_directory + "/" + source_file
        self.text_file = working_directory + "/" + source_file + ".txt"
        self.file_array = []
        self.status = False
        self.file_type, self.mime_type = get_file_type(self.source_file)

        if str(self.file_type) == 'k.PDF_Document':
            self.parse_pdf(test_parse)
        else:
            error_message = 'TextExtractor - Invalid File Type for {0}' \
                .format(self.source_file)
            dtpo_log('error', error_message)
            raise ValueError(error_message)

    def parse_pdf(self, test_parse=False):
        """
        Parse the PDF and store its text contents in ``file_array``.

        The text is first written to ``text_file`` and then read back line
        by line.  The final line is dropped, and the text file is removed
        afterwards unless test_parse is true.

        Args:
            test_parse: keep the intermediate text file when true.

        Raises:
            DTPOFileError: the text file could not be created, or the PDF
                could not be opened or parsed.
        """
        dtpo_log('debug', "parsePDF sourceFile -> '%s'", self.source_file)

        self._convert_pdf_to_text()

        # Got the PDF converted - now get it into an array
        self.file_array = self._read_converted_text()

        # Remove the outfile
        if not test_parse:
            os.remove(self.text_file)

    def _convert_pdf_to_text(self):
        """Run pdfminer over ``source_file``, writing text to ``text_file``."""
        # input options
        pagenos = set()
        maxpages = 0
        # output option
        codec = 'utf-8'
        caching = True
        laparams = LAParams()
        laparams.char_margin = 8.0
        laparams.word_margin = 2.0
        rsrcmgr = PDFResourceManager(caching=caching)

        # open() rather than the file() builtin: the two are equivalent on
        # Python 2, but file() does not exist on Python 3.
        try:
            outfp = open(self.text_file, 'w')
        except IOError as io_error:
            raise DTPOFileError(self.text_file, 0, str(io_error))

        # From here on every handle is released in a finally clause so that
        # a failed open or a failed conversion does not leak descriptors.
        try:
            try:
                pdf_fp = open(self.source_file, 'rb')
            except IOError as io_error:
                raise DTPOFileError(self.source_file, 0, str(io_error))

            device = None
            try:
                device = TextConverter(
                    rsrcmgr, outfp, codec=codec, laparams=laparams)
                process_pdf(rsrcmgr, device, pdf_fp, pagenos,
                            maxpages=maxpages, caching=caching,
                            check_extractable=True)
            except PDFException as pdf_error:
                message = "Failed to parse file {0} -> {1}".\
                    format(self.source_file, str(pdf_error))
                raise DTPOFileError(self.source_file, 0, message)
            except Exception as exception:
                # Anything else is reported to callers as a file error too
                message = \
                    "Failed to parse PDF file Unknown exception {0} - > {1}" \
                    .format(type(exception), str(exception))
                raise DTPOFileError(self.source_file, 0, message)
            finally:
                # Same close order as on the success path: source, device
                pdf_fp.close()
                if device is not None:
                    device.close()
        finally:
            outfp.close()

    def _read_converted_text(self):
        """Return the lines of ``text_file`` without the final entry."""
        with open(self.text_file) as text_fp:
            lines = text_fp.readlines()

        # Remove the last entry - it's always '\x0c'
        if lines:
            del lines[-1]
        return lines

    def get_file_contents_as_array(self):
        """
        Return the extracted lines.

        Even if there was nothing to convert the (empty) list is returned.
        """
        return self.file_array
def get_file_type(source_file):
    """
    Report the type of source_file as a (file type, MIME type) pair.

    This is currently a stub: the file itself is not inspected and every
    input is reported as a PDF document.
    TODO Implement other types
    """
    dtpo_log(
        'info',
        'get_file_type for %s - needs fully implementing',
        source_file,
    )
    file_type = k.PDF_Document
    mime_type = 'application/pdf'
    return file_type, mime_type