forked from adsabs/ADSfulltext
/
utils.py
345 lines (275 loc) · 11.8 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
"""
Contains useful functions and utilities that are not necessarily only useful
for this module, but are also used in different modules inside the same
project, and so do not belong to anything specific.
"""
__author__ = 'J. Elliott'
__maintainer__ = 'J. Elliott'
__copyright__ = 'Copyright 2015'
__version__ = '1.0'
__email__ = 'ads@cfa.harvard.edu'
__status__ = 'Production'
__credit__ = ['V. Sudilovsky']
__license__ = 'GPLv3'
import sys
import os
import logging
import string
import unicodedata
import re
import json
from settings import config, PROJ_HOME, CONSTANTS
from cloghandler import ConcurrentRotatingFileHandler
def setup_logging(file_, name_, level=None):
    """
    Sets up generic logging to file with rotating files on disk

    :param file_: the __file__ doc of python module that called the logging
    :param name_: the name of the file that called the logging
    :param level: name of the logging level: DEBUG, INFO, WARN, ...
        Defaults to config['LOGGING_LEVEL'] when not supplied.
    :return: logging instance
    """
    # Resolve the default lazily so the config value is read at call time;
    # a module-level default (level=config[...]) is frozen at import time.
    if level is None:
        level = config['LOGGING_LEVEL']
    level = getattr(logging, level)

    logfmt = '%(levelname)s\t%(process)d [%(asctime)s]:\t%(message)s'
    datefmt = '%m/%d/%Y %H:%M:%S'
    formatter = logging.Formatter(fmt=logfmt, datefmt=datefmt)
    logging_instance = logging.getLogger(name_)

    fn_path = os.path.join(os.path.dirname(file_), PROJ_HOME, 'logs')
    # EAFP: create the directory and tolerate "already exists" instead of
    # exists()+makedirs(), which races when several workers start at once.
    try:
        os.makedirs(fn_path)
    except OSError:
        if not os.path.isdir(fn_path):
            raise

    fn = os.path.join(fn_path, '{0}.log'.format(name_))
    rfh = ConcurrentRotatingFileHandler(filename=fn,
                                        maxBytes=2097152,
                                        backupCount=5,
                                        mode='a',
                                        encoding='UTF-8')  # 2MB file
    rfh.setFormatter(formatter)
    # Drop any handlers installed by a previous call so repeated setup does
    # not duplicate every log record.
    logging_instance.handlers = []
    logging_instance.addHandler(rfh)
    logging_instance.setLevel(level)

    return logging_instance
def overrides(interface_class):
    """
    Decorator factory making it explicit that a method overrides one
    inherited from ``interface_class``.  If the decorated method's name is
    not found on the parent class, an AssertionError is raised at class
    construction time; otherwise the method is returned untouched.
    """
    def overrider(method):
        """
        Check the method name exists on the parent class and hand the
        method straight back.

        :param method: the method claimed to be an override
        :return: the same method, unchanged
        """
        assert method.__name__ in dir(interface_class)
        return method

    return overrider
class FileInputStream(object):
    """
    A custom data format that handles all the file input/output in the
    ADSfulltext project.  Reads a tab-separated input file of
    bibcode / full-text-path / provider rows and converts it into JSON
    payloads.
    """

    def __init__(self, input_stream):
        """
        Initialisation (constructor) method of the class

        :param input_stream: the path to the file that needs to be loaded
        :return: no return
        """
        self.input_stream = input_stream
        self.raw = ''               # becomes a list of payload dicts after extract()
        self.bibcode = ''           # becomes a list of bibcodes after extract()
        self.full_text_path = ''    # becomes a list of paths after extract()
        self.provider = ''          # becomes a list of providers after extract()
        self.payload = None         # set by make_payload()

    def print_info(self):
        """
        Prints relevant information about the input stream

        :return: no return
        """
        # Single-argument call form behaves identically as a Python 2 print
        # statement and a Python 3 print function.
        print('Bibcode: {0}'.format(self.bibcode))
        print('Full text path: {0}'.format(self.full_text_path))
        print('Provider: {0}'.format(self.provider))
        print('Raw content: {0}'.format(self.raw))

    def extract(self, force_extract=False):
        """
        Opens the file and parses the content depending on the type of input

        :param force_extract: boolean decides if the normal checks should
            be ignored and extracted regardless
        :return: the bibcode, full text path, provider, and raw content
        """
        in_file = self.input_stream
        try:
            with open(in_file, 'r') as f:
                input_lines = f.readlines()

            raw = []
            bibcode, full_text_path, provider = [], [], []
            for line_number, line in enumerate(input_lines, 1):
                fields = [i for i in line.strip().split('\t') if i != '']

                if len(fields) == 0:
                    continue
                # Guard against malformed rows: the previous code raised
                # IndexError on any line with fewer than three columns.
                if len(fields) < 3:
                    print('Skipping malformed line {0} of {1}: {2!r}'
                          .format(line_number, in_file, line))
                    continue

                bibcode.append(fields[0])
                full_text_path.append(fields[1])
                provider.append(fields[2])

                payload_dictionary = {
                    CONSTANTS['BIBCODE']: bibcode[-1],
                    CONSTANTS['FILE_SOURCE']: full_text_path[-1],
                    CONSTANTS['PROVIDER']: provider[-1]
                }
                if force_extract:
                    payload_dictionary[CONSTANTS['UPDATE']] = \
                        'FORCE_TO_EXTRACT'

                raw.append(payload_dictionary)

            self.bibcode = bibcode
            self.full_text_path = full_text_path
            self.provider = provider
            self.raw = raw

        except IOError:
            # Best-effort: report the failure and fall through so callers
            # still receive the (possibly empty) attributes.
            print('{0}: {1}'.format(in_file, sys.exc_info()))

        return self.bibcode, self.full_text_path, self.provider, self.raw

    def make_payload(self, **kwargs):
        """
        Convert the file stream input to a payload form defined below

        :param kwargs: extra arguments; 'packet_size' (positive int) splits
            the payload into chunks of that many records
        :return: list of json formatted payloads
        """
        packet_size = kwargs.get('packet_size')
        # Truthiness check also shields against packet_size=0, which would
        # make range() raise ValueError.
        if packet_size:
            self.payload = \
                [json.dumps(self.raw[i:i + packet_size])
                 for i in range(0, len(self.raw), packet_size)]
        else:
            self.payload = [json.dumps(self.raw)]

        return self.payload
class TextCleaner(object):
    """
    Class that contains methods to clean text.

    NOTE(review): this class relies on Python 2 semantics throughout
    (``string.maketrans``, ``str.decode``, the ``unicode`` builtin) --
    confirm the runtime before porting to Python 3.
    """
    def __init__(self, text):
        """
        Initialisation method (constructor) of the class
        For those interested:
        http://www.joelonsoftware.com/articles/Unicode.html
        Translation map (ASCII):
        This is used to replace the escape characters. There are 32 escape
        characters listed for example
        here: http://www.robelle.com/smugbook/ascii.html
        input_control_characters:
        This is a string that contains all the escape characters
        translated_control_characters:
        This is a string that is equal in length to input_control
        characters, where all the escape characters
        are replaced by an empty string ' '. The only escape characters
        kept are \n, \t, \r, (9, 10, 13)
        This map can then be given to the string.translate as the map for
        a string (ASCII encoded)
        e.g.,
        'jonny\x40myemail.com\n'.translate(dict.fromkeys(filter(lambda x:
        x not in [9,10,13], range(32))))
        'jonny@myemail.com\n'
        Translation map (Unicode):
        This has the same purpose as the previous map, except it works on
        text that is encoded in utf-8, or some other unicode encoding. The
        unicode_control_number array contains a list of tuples, that
        contain the range of numbers that want to be removed. i.e., 0x00,
        0x08 in unicode form is U+00 00 to U+00 08, which is just removing
        the e.g., Null characters, see
        http://www.fileformat.info/info/charset/UTF-8/list.htm
        for a list of unicode numberings.
        e.g.,
        This map can then be given to the string.translate as the map for
        a unicode type (e.g., UTF-8 encoded)
        u'jonny\x40myemail.com\n'.translate(dict.fromkeys(filter(lambda x:
        x not in [9,10,13], range(32))))
        u'jonny@myemail.com\n'
        unicodedata.normalize(unicode_string, 'NFKC'):
        https://docs.python.org/2/library/
        unicodedata.html#unicodedata.normalize
        http://stackoverflow.com/questions/14682397/can-somone-
        explain-how-unicodedata-normalizeform-unistr-work-with-examples
        NFKC = Normal Form K Composition
        'K' converts characters such as circle(1) to 1
        'C' composes characters such as C, to C+,
        :param text: input text to clean
        :return: no return
        """
        self.text = text

        # Keep tab (9), newline (10) and carriage return (13); every other
        # ASCII control character in 0-31 maps to a single space.
        translated_control_characters = ''.join(
            [chr(i) if i in [9, 10, 13] else ' ' for i in range(0, 32)])
        input_control_characters = "".join([chr(i) for i in range(0, 32)])
        # Python 2 string.maketrans builds the 256-byte table expected by
        # str.translate (byte strings only).
        self.ASCII_translation_map = string.maketrans(
            input_control_characters, translated_control_characters)

        # Code-point ranges to delete from unicode text: C0/C1 controls
        # (minus \t, \n, \r), DEL..0x84, 0x86..0x9F, the surrogate range,
        # and the non-character code points of every Unicode plane.
        unicode_control_numbers = [(0x00, 0x08), (0x0B, 0x1F), (0x7F, 0x84),
                                   (0x86, 0x9F), (0xD800, 0xDFFF), (0xFDD0,
                                                                    0xFDDF),
                                   (0xFFFE, 0xFFFF),
                                   (0x1FFFE, 0x1FFFF), (0x2FFFE, 0x2FFFF),
                                   (0x3FFFE, 0x3FFFF), (0x4FFFE, 0x4FFFF),
                                   (0x5FFFE, 0x5FFFF), (0x6FFFE, 0x6FFFF),
                                   (0x7FFFE, 0x7FFFF), (0x8FFFE, 0x8FFFF),
                                   (0x9FFFE, 0x9FFFF), (0xAFFFE, 0xAFFFF),
                                   (0xBFFFE, 0xBFFFF), (0xCFFFE, 0xCFFFF),
                                   (0xDFFFE, 0xDFFFF), (0xEFFFE, 0xEFFFF),
                                   (0xFFFFE, 0xFFFFF), (0x10FFFE, 0x10FFFF)]
        # Mapping each code point to None (dict.fromkeys default) makes
        # unicode.translate delete it.
        self.Unicode_translation_map = dict.fromkeys(
            unicode_number
            for starting_unicode_number, ending_unicode_number
            in unicode_control_numbers
            for unicode_number
            in range(starting_unicode_number, ending_unicode_number+1)
        )

    def translate(self):
        """
        Removes escape characters whether the text is unicode or ASCII
        :return: no return
        """
        # Dispatch on Python 2 type: byte strings take the 256-byte table,
        # anything else (expected unicode) takes the code-point dict.
        if type(self.text) == str:
            self.text = self.text.translate(self.ASCII_translation_map)
        else:
            self.text = self.text.translate(self.Unicode_translation_map)

    def decode(self):
        """
        Decodes the text into unicode expected UTF-8 encoding
        :return: no return
        """
        # Python 2 str.decode; undecodable bytes are silently dropped
        # ('ignore'), not replaced.
        if type(self.text) == str:
            self.text = self.text.decode('utf-8', 'ignore')

    def normalise(self):
        """
        Normalises different combination of characters into a single chracter
        :return: no return
        """
        self.text = unicodedata.normalize('NFKC', unicode(self.text))
        # Collapse every run of whitespace (including newlines) into a
        # single space.
        # NOTE(review): the pattern should be a raw string, r'\s+'.
        self.text = re.sub('\s+', ' ', self.text)

    def trimwords(self, maxlength=200):
        """
        Removes "words" longer than wordlength characters, which tend to be
        artifacts generated by the text extraction pipeline (typically tables).
        We do this because these huge words cause problems further down the line
        when they are indexed in SOLR
        :param maxlength: maximum length of words to keep
        :return: no return
        """
        # note: we want to keep the original text in the proper sequence of lines
        # to avoid messing up text analysis downstream
        # NOTE(review): a removed word leaves an empty placeholder, so the
        # join produces a double space at that position.
        buffer = []
        for line in self.text.splitlines():
            newline = ' '.join([word if len(word) <= maxlength else '' for word in line.split()])
            buffer.append(newline)
        self.text = '\n'.join(buffer)

    def run(self, translate=True, decode=True, normalise=True, trim=True):
        """
        Wrapper method that can run all of the methods wanted by the user
        in one executable.
        :param translate: should it translate, boolean
        :param decode: should it decode, boolean
        :param normalise: should it normalise, boolean
        :param trim: remove long sequences of non-blank characters (usually garbage)
        :return: cleaned text
        """
        if translate:
            self.translate()
        if decode:
            self.decode()
        if normalise:
            self.normalise()
        if trim:
            self.trimwords()

        return self.text