# preprocess.py
import re
from datetime import datetime

import constant
import myHTMLParser
from nltk.stem import PorterStemmer


def read_collection(file_path):
"""This function takes a single file and returns a list of strings. Each
string is a document (between opening and closing <DOC> tags."""
with open(file_path, 'r') as wholeFile:
text = wholeFile.read()
text = text.replace('</DOC>', '</DOC> BREAK_NEW_DOC')
documents = text.split('BREAK_NEW_DOC')
return documents
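# Illustrative note: a file containing "<DOC>...</DOC><DOC>...</DOC>" comes
# back as one string per <DOC> element; the final split can leave a trailing
# whitespace-only fragment, so callers should skip empty documents.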
def get_docID(text):
"""This function returns the docID."""
h = myHTMLParser.myHTMLParser()
h.feed(text)
return h.returnDocID()
def get_text(text):
"""This function returns the text in a document and
excludes tags and comments."""
h = myHTMLParser.myHTMLParser()
h.feed(text)
return h.returnFiltered()
def processing(document, indexType):
"""This function controls the processing. It sends each token
through the various functions that handle the different cases.
At the end, it does the same for any hyphenated words that had
been split up and need to be processed individually."""
    caughtTokens = []
    text = get_text(document)
    text = text.lower()
    text = dates(text, indexType)
    text = split_special_char(text, indexType)
    tokens = split_on_spaces(text)
    for idx, token in enumerate(tokens):
        tokens[idx], caught = hyphens(tokens[idx], indexType)
        caughtTokens.extend(caught)
        tokens[idx], caught = underscore(tokens[idx], indexType)
        caughtTokens.extend(caught)
        tokens[idx] = processing_steps(tokens[idx], indexType)
    # run any pieces captured from hyphen/underscore splitting through the
    # same per-token processing, then add them to the token list
    for needtoadd in caughtTokens:
        tokens.append(processing_steps(needtoadd, indexType))
tokens = delete_empty(tokens)
if indexType == 'positional':
return tokens
elif indexType == 'stem':
tokens = stem_terms(tokens)
return tokens
elif indexType == 'phrase':
tokens = build_phrases(tokens)
return tokens
else:
tokens = removeStop(tokens)
return tokens
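# Illustrative expectation (hedged; exact output depends on the regexes in
# constant.py): for the text "Re-elected 10/05/1992", processing() should
# yield roughly
#   'positional' -> ['reelected', 'October_05_1992']
#   'stem'       -> Porter-stemmed alphabetic tokens only
#   'phrase'     -> two/three-word phrases, with stop words acting as breaks
#   default      -> the positional tokens minus stop words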
def processing_steps(token, indexType):
"""This function handles passing an individual token
to every function to be processed."""
token = file_extensions(token, indexType)
token = extra_zeros(token, indexType)
token = start_of_heading(token, indexType)
token = remove_nums(token, indexType)
token = currency(token, indexType)
token = period_end(token, indexType)
token = periods(token, indexType)
return token
def remove_nums(token, indexType):
"""This function removes any tokens containing non-alpha
characters from the stem and the phrase index."""
if indexType == 'stem' and not token.isalpha():
return ''
elif indexType == 'phrase' and not token.isalpha():
return ' STOP '
else:
return token
def file_extensions(token, indexType):
"""This function deletes the period from file extensions, unless
it is processing for a stem of phrase index in which cases the token
is removed."""
if re.search(constant.file_ext, token):
if indexType=='phrase':
token=re.sub(constant.file_ext, ' STOP ', token)
elif indexType=='stem':
token=''
else:
token=re.sub(constant.file_ext, r'\1\2', token)
return token
def currency(token, indexType):
    """This function handles processing for tokens involving the $
    and also handles extra zeros in currency."""
    if re.search(constant.currency, token):
        if indexType == 'phrase':
            token = ' STOP '
        elif indexType == 'stem':
            token = ''
        elif len(token) == 1:
            # a bare '$' carries no value
            token = ''
        elif '.' in token:
            dollars, cents = token.split('.', 1)
            if not cents or cents in ('0', '00', '000'):
                # zero-valued or empty cents: keep only the dollar part
                token = dollars
            elif len(cents) == 2 and cents[1] == '0':
                # drop a trailing zero in the cents
                token = dollars + '.' + cents[0]
            # otherwise keep the token as-is
        if re.search(constant.currency_zerovalue, token):
            token = ''
    return token
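# Illustrative expectation (subject to what constant.currency matches):
# for a non-stem, non-phrase index, "$5.00" -> "$5", "$5.50" -> "$5.5",
# and a bare "$" is dropped entirely.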
def hyphens(token, indexType):
    """This function splits the token into pieces, deletes
    hyphens, etc. depending on the type of index."""
    needtoadd = []
    if '-' in token:
        if indexType == 'phrase':
            token = ' STOP '
        if indexType == 'single' or indexType == 'stem':
            # capture the pieces before the hyphens are stripped so they
            # can be indexed as separate terms
            for piece in token.split('-'):
                if len(piece) >= 3:
                    needtoadd.append(piece)
        if indexType == 'single' or indexType == 'positional':
            token = re.sub('-', '', token)
    return token, needtoadd
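# Illustrative example: for the 'single' index, "part-time" collapses to
# "parttime" while the pieces "part" and "time" are returned in needtoadd
# so they can be indexed as separate terms.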
def extra_zeros(token, indexType):
"""This function deals with the extra zeros in numbers
in order to normalize numbers for indexing. In the case of
the stem index, the token is deleted. In the case of the phrase
it is marked as a break (STOP). """
if re.search(constant.numerical_leading_zeros, token):
if indexType == 'stem':
token = ''
elif indexType == 'phrase':
token = ' STOP '
else:
token = re.sub(constant.numerical_leading_zeros, '', token)
if re.search(constant.numerical_trailing_zeros, token):
if indexType == 'stem':
token = ''
elif indexType == 'phrase':
token = ' STOP '
else:
token = re.sub(constant.numerical_trailing_zeros, r'\1', token)
if token == '0' or token == '00' or token == '000':
token = ''
return token
def start_of_heading(token, indexType):
"""This removes any SOH errors that can be generated during parsing."""
token = re.sub(constant.start_of_heading, '', token)
return token
def underscore(token, indexType):
    """This function handles underscores (except those in date form) in terms
    the same way it would handle hyphens."""
    needtoadd = []
    if '_' in token and not re.search(constant.date_format, token):
        if indexType == 'phrase':
            token = ' STOP '
        if indexType == 'single' or indexType == 'stem':
            # capture the pieces before the underscores are stripped
            for piece in token.split('_'):
                if len(piece) >= 3:
                    needtoadd.append(piece)
        if indexType == 'single' or indexType == 'positional':
            token = re.sub('_', '', token)
    return token, needtoadd
def dates(text, indexType):
    """This function identifies dates, checks them for validity,
    and then puts them in a consistent format."""
    date_patterns = [(constant.date1, '%m/%d/%Y'),
                     (constant.date2, '%m-%d-%Y'),
                     (constant.date3, '%B %d, %Y'),
                     (constant.date4, '%m-%d-%Y')]
    for pattern, fmt in date_patterns:
        for obj in re.findall(pattern, text):
            if indexType == 'phrase':
                text = re.sub(pattern, ' STOP ', text)
            else:
                try:
                    dt = datetime.strptime(obj, fmt)
                    text = re.sub(re.escape(obj), dt.strftime(' %B_%d_%Y'), text)
                except ValueError:
                    # not a valid calendar date; leave the text unchanged
                    pass
    return text
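# Illustrative example: for a non-phrase index, "10/05/1992" is rewritten
# as " October_05_1992" so the date survives tokenization as one
# underscore-joined term; an impossible date such as "13/40/1992" fails
# strptime and is left untouched.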
def periods(token, indexType):
    """End of sentence periods have already been handled. This handles
    emails, urls, ips, abbrevs, and alpha-period combos. It deletes/STOPs
    for stem and phrase because tokens with periods are either special
    cases that should be ignored for phrases or will not be helpful in the
    stemmer."""
    if '.' in token:
        if indexType == 'phrase':
            return ' STOP '
        if indexType == 'stem':
            return ''
        if re.search(constant.email, token):
            return token
        if re.search(constant.url, token):
            return token
        if re.search(constant.ip, token):
            return token
        if re.search(constant.abbrev, token):
            token = re.sub(constant.abbrev, r'\1', token)
        if '.' in token:
            token = re.sub(constant.period_alpha, r'\1', token)
    return token
def period_end(token, indexType):
    """This function strips a sentence-ending period from a token."""
    if token and token[-1] == '.':
        token = token[:-1]
    return token
def delete_empty(tokens):
    """This function removes empty strings from the token list."""
    return [token for token in tokens if token]
def split_special_char(text, indexType):
"""This function precedes the split on spaces and adds spaces where they are
special characters in order to draw out the valid tokens before processing."""
if indexType == 'phrase':
text = re.sub(constant.specialChar, ' STOP ', text)
else:
text = re.sub(constant.specialChar, ' ', text)
return text
def split_on_spaces(text):
"""This function splits the string on spaces to return a list of individual
tokens to be processed."""
tokens = text.split()
return tokens
def removeStop(tokens):
    """This function removes the stop words from the list of tokens."""
    stopWords = make_list_stops()
    tokens = [token for token in tokens if token not in stopWords]
    return tokens
def make_list_stops():
"""This compiles the list of stop words and returns."""
with open('stops.txt') as stopFile:
stops = stopFile.read()
stops = stops.replace('\'', '')
stopWords = stops.split()
return stopWords
def stem_terms(tokens):
    """This function uses the Porter Stemmer to stem each of the
    tokens and returns a list with the stemmed results."""
    stemmer = PorterStemmer()
    stemmed = []
    for token in tokens:
        try:
            stemmed.append(str(stemmer.stem(token)))
        except Exception:
            # skip tokens the stemmer cannot handle
            stemmed.append('')
    stemmed = delete_empty(stemmed)
    return stemmed
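# Illustrative example: the Porter stemmer maps 'running' -> 'run' and
# 'ponies' -> 'poni', so stemmed terms are index keys rather than words.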
def build_phrases(tokens):
    """This function builds two- and three-word phrases from the token
    stream; stop words and STOP markers end the current phrase."""
    stopWords = make_list_stops()
    special_case = 'STOP'
    phrase = ''
    num_terms = 0
    phrases = []
    for word in tokens:
        if word not in stopWords and special_case not in word:
            phrase += word + ' '
            num_terms += 1
            if num_terms >= 3:
                # cap phrases at three terms
                phrases.append(phrase.strip())
                phrase = ''
                num_terms = 0
        else:
            # a stop word or STOP marker breaks the phrase; keep it only
            # if it already has at least two terms
            if num_terms == 2:
                phrases.append(phrase.strip())
            phrase = ''
            num_terms = 0
    if num_terms == 2:
        # flush a two-term phrase left over at the end of the stream
        phrases.append(phrase.strip())
    return phrases
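

# Minimal usage sketch (not part of the original pipeline): it assumes a
# hypothetical TREC-style collection file 'sample/FT911-1' and a stops.txt
# in the working directory, which the functions above expect.
if __name__ == '__main__':
    documents = read_collection('sample/FT911-1')  # hypothetical path
    for document in documents:
        if document.strip():
            docID = get_docID(document)
            tokens = processing(document, 'single')
            print(docID, tokens[:10])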