-
Notifications
You must be signed in to change notification settings - Fork 0
/
processor.py
232 lines (191 loc) · 10.4 KB
/
processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
"""
This class provides the preprocessing functions for the PubMed corpus:
processFolder(root, target, stemming, min_word_length, remove_numbers, remove_duplicates, ner)
processString(string, stemming, min_word_length, remove_numbers, remove_duplicates, ner)
"""
import os
from nltk import stem
from adeptner import Adeptner
from graph import Graph
__author__ = "Marcello Benedetti"
__status__ = "Prototype"
DEBUG = False
EXCLUDE = ['!', '?', '.', ',', ':', ';', '_', '-', '+', '*', '/', '\\', '^',
"'", '"', '’' , '(', ')', '[', ']', '=', '°', '|', '{', '}', '\n', '\t',
'%', '¡', '¨', '“', '”', '`', '<', '>', '$', '&', '@', '#', '°']
STOP_WORDS = ['a', 'able', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost',
'also', 'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
'being', 'below', 'between', 'both', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do',
'does', 'doing', 'don', 'down', 'during', 'each', 'either', 'else', 'ever', 'every', 'few', 'for',
'from', 'further', 'get', 'got', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers',
'herself', 'him', 'himself', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its',
'itself', 'just', 'least', 'let', 'like', 'likely', 'may', 'me', 'might', 'more', 'most', 'must',
'my', 'myself', 'neither', 'no', 'nor', 'not', 'now', 'of', 'off', 'often', 'on', 'once', 'only',
'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'rather', 's', 'said', 'same',
'say', 'says', 'she', 'should', 'since', 'so', 'some', 'such', 't', 'than', 'that', 'the', 'their',
'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'thus',
'to', 'too', 'under', 'until', 'up', 'us', 'very', 'wants', 'was', 'we', 'were', 'what', 'when',
'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',
'yours', 'yourself', 'yourselves']
PREFIXES = ['von', 'de', 'vant', 'van', 'der', 'vom', 'vander', 'zur',
'ten', 'la', 'du', 'ter', 'dos', 'al', 'del', 'st', 'le', 'dos', 'da',
'do', 'mc', 'des', 'den', 'di', 'abu', 'vander', 'den', 'della', 'vande',
'dit', 'bin', 'ibn', 'el', 'los', 'dello', 'vanden', 'ap', 'las', 'delli',
'mac', 'mrs', 'mr', 'miss', 'jr', 'sr', 'II', 'III', 'IV', 'in']
class Processor(object):
    """Preprocessing pipeline for the PubMed corpus.

    Cleans raw '.txt' documents (stopword removal, punctuation/digit
    stripping, optional lemmatization, stemming or named entity
    recognition) and optionally feeds the extracted entities into a
    Graph object.
    """

    def __init__(self):
        self.exclude = EXCLUDE            # characters stripped by cleanText()
        self.stop_words = STOP_WORDS      # words dropped by removeStopwords()
        self.prefixes = PREFIXES          # currently prefixes are not used
        # NOTE(review): Adeptner wraps Stanford's ADEPTA NER service (see
        # getEntities); instantiation may need network access -- confirm.
        self.ner = Adeptner()

    def folderToGraph(self, root="corpus", target_folder=None, graph=None, ner=False, lemmatize=True, stemming='medium', min_word_length=1):
        """
        Create a target folder mirroring the structure of the root folder, then
        process every '.txt' file found in the year subfolders of :root and
        store the results in the target structure and/or a graph.
        :root The root folder contains year subfolders that in turn contain the '.txt' documents.
        :target_folder The folder where to store the processed files (optional).
        :graph The graph object (optional).
        :ner Use named entity recognition.
        :lemmatize Apply WordNet lemmatization.
        :stemming Stemming intensity. May be None, 'light', 'medium', 'heavy'.
        :min_word_length Minimum number of characters for a word to be kept.
        :return None
        """
        if not os.path.isdir(root):
            raise Exception("'{0}' folder doesn't exist".format(root))
        if graph:
            if not isinstance(graph, Graph):
                raise Exception("The graph object is non valid")
            graph.connect()
        if target_folder and not os.path.isdir(target_folder):  # create 'target' folder if it doesn't exist
            os.mkdir(target_folder)
        print("...preprocessing documents and building the graph")
        count_nodes = 0
        count_edges = 0
        # 'subdir'/'doc_name' instead of 'dir'/'file' to avoid shadowing builtins
        for subdir in os.listdir(root):  # visit year subdirectories of 'root' folder
            dir_origin = os.path.join(root, subdir)
            if os.path.isdir(dir_origin) and subdir.isdigit():  # skip directories whose name is not a year
                year = int(subdir)
                dir_dest = None
                if target_folder:
                    dir_dest = os.path.join(target_folder, subdir)
                    if not os.path.isdir(dir_dest):  # create year subfolders if they don't exist in 'target'
                        os.mkdir(dir_dest)
                for doc_name in os.listdir(dir_origin):  # process textual files under the year folder
                    if doc_name.endswith(".txt"):
                        string = self.fileToString(os.path.join(dir_origin, doc_name))
                        entities = self.processString(string, ner, lemmatize, stemming, min_word_length)
                        if entities:
                            if DEBUG:
                                print(entities)
                            if graph:
                                cn, ce = graph.addToGraph(entities, year)
                                count_nodes += cn
                                count_edges += ce
                            if target_folder:
                                # BUG FIX: processString returns a list; join it into a
                                # string before writing (f.write rejects lists).
                                self.stringToFile(os.path.join(dir_dest, doc_name), ' '.join(entities))
        # BUG FIX: commit/close and the summary prints dereferenced 'graph'
        # unconditionally, crashing when graph=None (the default).
        if graph:
            graph.commit()
            graph.close()
            print("added {0} nodes to {1} ".format(count_nodes, graph.graph_nodes))
            print("added {0} edges to {1} ".format(count_edges, graph.graph_edges))

    def processString(self, string, ner=False, lemmatize=True, stemming=None, min_word_length=1):
        """
        Apply preprocessing to a string.
        :ner Use named entity recognition (if True) or text preprocessing (if False) to extract entities.
        :lemmatize Apply WordNet lemmatization.
        :stemming Stemming intensity. May be None, 'light', 'medium', 'heavy'.
        :min_word_length Minimum number of characters for a word to be kept.
        :return List of unique cleaned entities (order not guaranteed).
        """
        entities = []
        result = []
        if ner:
            entities = self.getEntities(string)  # perform named entity recognition (may return None)
        else:
            string = self.removeStopwords(string)  # remove the stopwords in STOP_WORDS
            entities = string.split(" ")  # this creates a bag of words
        if entities:
            for ent in entities:
                ent = self.cleanText(ent)  # remove punctuation and numbers
                if lemmatize:
                    ent = self.lemmatizeText(ent)  # perform lemmatization
                if stemming:
                    ent = self.stemText(ent, intensity=stemming)  # perform stemming
                if min_word_length > 0:
                    ent = self.removeShortWords(ent, min_word_length)  # remove short words
                ent = ent.strip()  # remove trailing spaces
                if ent:
                    result.append(ent)
            result = list(set(result))  # remove duplicates
        return result

    def fileToString(self, file_path):
        """Return the whole content of a file as a string ('' for a falsy path)."""
        text = ""
        if file_path:
            # 'with' closes the file automatically; the redundant f.close() was removed
            with open(file_path, "r") as f:
                text = f.read()  # omit size to read it all
        return text

    def stringToFile(self, file_path, text):
        """Write a string to a file; no-op when path or text is empty."""
        if file_path and text:
            with open(file_path, "w") as f:
                f.write(text)

    def getEntities(self, text):
        """Apply Named Entity Recognition from Stanford's web service ADEPTA.

        Returns the 'MEDTERM' entry of the terms dict, or None if absent.
        """
        return self.ner.getTerms(text).get('MEDTERM')

    def lemmatizeText(self, text):
        """Apply WordNet lemmatization to every word of a string."""
        lemmatizer = stem.WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(word) for word in text.split(" "))

    def stemText(self, text, intensity):
        """Apply stemming to a string according to :intensity ('light'|'medium'|'heavy')."""
        # BUG FIX: the original compared strings with 'is', which tests object
        # identity and only works by accident of CPython string interning.
        if intensity == 'light':
            stemmer = stem.PorterStemmer()
        elif intensity == 'medium':
            stemmer = stem.snowball.EnglishStemmer()
        elif intensity == 'heavy':
            stemmer = stem.LancasterStemmer()
        else:
            raise Exception("'{0}' is not a correct intensity parameter. Must be light, medium or heavy.".format(intensity))
        return ' '.join(stemmer.stem(word) for word in text.split(" "))

    def cleanText(self, text):
        """
        1. replace the punctuation specified in EXCLUDE with spaces
        2. replace digits with spaces
        3. transform to lowercase
        4. collapse multiple spaces
        """
        # build via list+join instead of quadratic string concatenation
        chars = [" " if c in self.exclude or c.isdigit() else c for c in text]
        return ' '.join(''.join(chars).lower().split())

    def removeShortWords(self, text, min_word_length):
        """Remove words that are shorter than :min_word_length."""
        return ' '.join(w for w in text.split(" ") if len(w) >= min_word_length)

    def removeStopwords(self, text):
        """Remove the stopwords specified in STOP_WORDS."""
        return ' '.join(w for w in text.split(" ") if w not in self.stop_words)
#if __name__=="__main__":
# debug()