/
DbDocSource.py
219 lines (166 loc) · 6.73 KB
/
DbDocSource.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
__author__ = 'Andrew J McArdle'
"""
Gensim corpora sourced from database records.
See: https://radimrehurek.com/gensim/index.html
"""
from gensim import corpora, parsing, summarization
from bs4 import BeautifulSoup
class DbSQL(object):
    """
    Holder for a SQL SELECT statement together with the column positions of the unique key,
    title and document inside each row of its result set (the positions default to 0, 1 and 2
    respectively).

    :param sql_stmnt The SELECT statement text
    :param pk_loc Position of the unique key column in a result row (default = 0)
    :param title_loc Position of the title column in a result row (default = 1)
    :param doc_loc Position of the document column in a result row (default = 2)
    """

    def __init__(self, sql_stmnt, pk_loc=0, title_loc=1, doc_loc=2):
        self._sql = sql_stmnt
        self.pk_location, self.title_loc, self.doc_loc = pk_loc, title_loc, doc_loc

    @property
    def sql(self):
        """The SQL statement text; this is the only property that can be reassigned."""
        return self._sql

    @sql.setter
    def sql(self, value):
        self._sql = value

    @property
    def unique_key_location(self):
        """Position of the unique key column in a result row."""
        return self.pk_location

    @property
    def title_location(self):
        """Position of the title column in a result row."""
        return self.title_loc

    @property
    def doc_location(self):
        """Position of the document column in a result row."""
        return self.doc_loc
class DbSummary(object):
    """
    Iterable that yields a gensim summary for every document returned by a database query.

    While iterating, the (unique key, title) pair of each record is collected in
    record_identifiers, in the same order as the summaries are yielded. The collection is
    emptied at the start of every pass so that it always describes the latest pass only.

    :param dbSQL The object to source the SQL and the location of the unique key, title and 'document' fields
                 in the SQL recordset
    :type DbSQL
    :param db_object The database object to provide the connection to the database; it is used as a
                     context manager whose value must offer a cursor() method
    :param source_is_html If the document is in html will trigger removal of html tags
    :type Boolean (default = False)
    """

    def __init__(self, dbSQL, db_object, source_is_html=False):
        self.__dbSQL = dbSQL
        self.__db_object = db_object
        self.__record_identifiers = []
        if source_is_html:
            self.preprocessor = HtmlTagsRemover()
        else:
            self.preprocessor = None

    def __iter__(self):
        """
        Run the query and yield one summary per record.

        :return: Generator of whatever summarization.summarize returns for each document
        """
        db_sql = self.__dbSQL
        # Clear in place (rather than rebinding) so that callers already holding the list see
        # the reset, and so that a second pass does not append duplicate identifiers.
        del self.__record_identifiers[:]
        with self.__db_object as conn:
            cursor = conn.cursor()
            try:
                cursor.execute(db_sql.sql)
                record = cursor.fetchone()
                while record:
                    document_text = record[db_sql.doc_location]
                    pk, title = record[db_sql.unique_key_location], record[db_sql.title_location]
                    if self.preprocessor:
                        title = self.preprocessor.clean_text(title)
                        document_text = self.preprocessor.clean_text(document_text)
                    # Recorded before the yield so the identifier is available to the consumer
                    # at the moment it receives the matching summary.
                    self.record_identifiers.append((pk, title))
                    yield summarization.summarize(document_text)
                    record = cursor.fetchone()
            finally:
                # Release the cursor even when the consumer stops early or a record fails.
                cursor.close()

    @property
    def record_identifiers(self):
        """
        Get the unique keys and titles of the records seen during the latest pass
        :return: List of (pk, title) tuples
        :rtype List
        """
        return self.__record_identifiers
# end class DbSummary
class DbDictSource(object):
    """
    Class for feeding the gensim corpora.Dictionary object from a database connection.

    Iterating runs the SQL and yields one token list per record. The document field is
    optionally stripped of HTML and then tokenised with gensim's parsing.preprocess_string
    using its default filters. If you want to tokenise differently (for example with a
    different stemmer or extra text clean-up), create a child class and override __iter__.

    :param dbSQL The object to source the SQL and the location of the unique key, title and 'document' fields
                 in the SQL recordset
    :type DbSQL
    :param db_object The database object to provide the connection to the database; it is used as a
                     context manager whose value must offer a cursor() method
    :type DbmsCore.JdbcConnection TODO: Change this to an interface
    :param source_is_html If the document is in html will trigger removal of html tags
    :type Boolean (default = False)
    """

    def __init__(self, dbSQL, db_object, source_is_html=False):
        self.__dbSQL = dbSQL
        self.__db_object = db_object
        # Not read anywhere in this class; kept so existing users of the attribute still work.
        self.record = None
        if source_is_html:
            self.preprocessor = HtmlTagsRemover()
        else:
            self.preprocessor = None

    def __iter__(self):
        """
        Run the query and yield the token list of each record's document.

        :return: Generator of token lists as produced by parsing.preprocess_string
        """
        db_sql = self.__dbSQL
        with self.db_object as conn:
            cursor = conn.cursor()
            try:
                cursor.execute(db_sql.sql)
                record = cursor.fetchone()
                while record:
                    document_text = record[db_sql.doc_location]
                    if self.preprocessor:
                        document_text = self.preprocessor.clean_text(document_text)
                    yield parsing.preprocess_string(document_text)
                    record = cursor.fetchone()
            finally:
                # Release the cursor even when the consumer stops early or a record fails.
                cursor.close()

    @property
    def db_object(self):
        """The database object supplied at construction."""
        return self.__db_object

    @property
    def db_sql(self):
        """The SQL wrapper object supplied at construction."""
        return self.__dbSQL
class DbCorpus(object):
    """
    Gensim corpora sourced from a database connection.

    Uses the DbDictSource internally to create the corpora.dictionary; tokens whose document
    frequency is exactly 1 are filtered out of that dictionary. The unique key and title fields
    can be retrieved from the record_identifiers property to allow clients to retrieve the
    database record at a point in the future.

    Builds the gensim corpora object using the dictionary_source object which sources documents from
    a database connection. Each pass over the corpus re-runs the query; len() and
    record_identifiers describe the most recent pass only (len() is 0 before the first pass).

    :param dictionary_source The database driven dictionary object from where we source words
    :type DbDictSource
    :param source_is_html If the document is in html will trigger removal of html tags
    :type Boolean (default = False)
    """

    def __init__(self, dictionary_source, source_is_html=False):
        self._length = 0
        self.__record_identifiers = []
        self.dict_source = dictionary_source
        if source_is_html:
            self.preprocessor = HtmlTagsRemover()
        else:
            self.preprocessor = None
        self._dictionary = corpora.Dictionary(self.dict_source)
        # dict.items() is available on both Python 2 and 3, whereas iteritems() is Python 2 only.
        once_ids = [token_id for token_id, doc_freq in self.dictionary.dfs.items() if doc_freq == 1]
        self._dictionary.filter_tokens(once_ids)
        self._dictionary.compactify()

    def __iter__(self):
        """
        Run the query and yield the bag-of-words vector of each record's document.

        :return: Generator of the values returned by the dictionary's doc2bow
        """
        db = self.dict_source.db_object
        dbsql = self.dict_source.db_sql
        # A corpus is normally iterated several times; start every pass from zero so the length
        # and the identifiers do not accumulate across passes. The list is cleared in place so
        # callers already holding it see the reset.
        self._length = 0
        del self.__record_identifiers[:]
        with db as conn:
            cursor = conn.cursor()
            try:
                cursor.execute(dbsql.sql)
                record = cursor.fetchone()
                while record:
                    self._length += 1
                    document_text = record[dbsql.doc_location]
                    pk, title = record[dbsql.unique_key_location], record[dbsql.title_location]
                    if self.preprocessor:
                        title = self.preprocessor.clean_text(title)
                        document_text = self.preprocessor.clean_text(document_text)
                    self.record_identifiers.append((pk, title))
                    tokens = parsing.preprocess_string(document_text)
                    yield self._dictionary.doc2bow(tokens)
                    record = cursor.fetchone()
            finally:
                # Release the cursor even when the consumer stops early or a record fails.
                cursor.close()

    def __len__(self):
        """Number of records read so far by the most recent pass (0 before any pass)."""
        return self._length

    @property
    def record_identifiers(self):
        """
        Get the unique keys and titles of the records seen during the latest pass
        :return: List of (pk, title) tuples
        :rtype List
        """
        return self.__record_identifiers

    @property
    def dictionary(self):
        """The dictionary built from dictionary_source at construction."""
        return self._dictionary
class HtmlTagsRemover(object):
    """
    Uses Beautiful Soup to remove HTML - for use if data contains rich text (common on modern systems)
    """

    def clean_text(self, text):
        """
        Return text with its HTML markup removed.

        :param text The markup to clean; None (for example a NULL database column) is accepted
        :return: The text content as extracted by BeautifulSoup.get_text(), or an empty string
                 when text is None
        """
        # BeautifulSoup cannot be given None, so treat a missing value as empty text.
        if text is None:
            return ""
        return BeautifulSoup(text, 'html.parser').get_text()