forked from colintj/dedupe-web
/
helpers.py
281 lines (260 loc) · 9.64 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
import sqlite3
import re
import os
import dedupe
from dedupe.serializer import _to_json, dedupe_decoder
import json
import time
from unicodedata import normalize
from unidecode import unidecode
from cStringIO import StringIO
from csvkit.sql import make_table, make_create_table_statement
from csvkit.table import Table
from csvkit.unicsv import UnicodeCSVWriter, UnicodeCSVReader, \
UnicodeCSVDictWriter
from queue import queuefunc
from werkzeug import secure_filename
from operator import itemgetter
from collections import defaultdict, OrderedDict
import xlwt
from openpyxl import Workbook
UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), 'upload_data')
def slugify(text, delim=u'_'):
    """Normalize ``text`` into a lowercase, ASCII-only identifier.

    Runs of punctuation and whitespace become a single ``delim``;
    accented characters are stripped via NFKD decomposition.  Falsy
    input (``None``, empty string) is returned unchanged.
    """
    if not text:
        return text
    punct_re = re.compile(r'[\t !"#$%&\'()*\-/<=>?@\[\\\]^_`{|},.:;]+')
    words = []
    for token in punct_re.split(text.lower()):
        # Decompose accents, then drop anything outside ASCII.
        token = normalize('NFKD', token).encode('ascii', 'ignore')
        if token:
            words.append(token)
    return unicode(delim.join(words))
def preProcess(column):
    """Clean a raw CSV cell value for dedupe comparison.

    Transliterates to ASCII, normalizes whitespace, strips surrounding
    quotes, and lowercases the result.
    """
    column = unidecode(column)
    # Convert newlines to spaces *before* collapsing runs of spaces;
    # previously the order was reversed, so "a \n b" came out with
    # multiple internal spaces instead of one.
    column = re.sub('\n', ' ', column)
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column
def makeRawTable(contents):
    """Load raw CSV text into an in-memory SQLite table and dump it.

    ``contents`` is the full text of an uploaded CSV.  Headers are
    slugified and every cell is run through preProcess, then csvkit
    infers a CREATE TABLE statement which is rewritten to prepend a
    ``record_id INTEGER PRIMARY KEY`` column.  Returns a tuple of
    (SQL dump text recreating ``raw_table``, slugified header list).
    """
    inp = StringIO(contents)
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    header = [slugify(h) for h in header]
    # Re-serialize the cleaned rows so csvkit can type/parse them.
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows([[preProcess(unicode(i)) for i in r] for r in reader])
    outp.seek(0)
    conn = sqlite3.connect(':memory:')
    # infer_types=False keeps every column as text.
    t = Table.from_csv(outp,
        name='raw_table',
        blanks_as_nulls=False,
        infer_types=False)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    # Splice a surrogate primary key into the generated CREATE TABLE
    # by string surgery on the "raw_table (" boundary.
    parts = create_st.split('raw_table (')
    create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    rows = [dict(zip(header, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    # Serialize the whole database; unidecode guards against non-ASCII
    # leaking into the dump text.
    dump = StringIO()
    for line in conn.iterdump():
        dump.write(unidecode(line))
    dump.seek(0)
    return dump.getvalue(), header
def makeDataDict(dump, sample=False, fields=None):
    """Rebuild ``raw_table`` from a SQL ``dump`` and return it as a dict.

    Returns ``{row_index: {column_name: value}}``.  When ``sample`` is
    True, only the row with the highest record_id per distinct
    combination of ``fields`` is returned (those dicts also carry the
    ``record_id`` column, as before).

    Bug fix: the non-sample query excludes ``record_id``, but the keys
    were previously zipped against the *full* header, shifting every
    value one column to the left and dropping the last column.  Keys
    now match the selected columns exactly.
    """
    conn = sqlite3.connect(':memory:')
    curs = conn.cursor()
    curs.executescript(dump)
    curs.execute('select * from raw_table limit 1')
    header = [i[0] for i in curs.description]
    if sample:
        # One representative (max record_id) per distinct `fields` combo.
        sql = '''
            SELECT r.*
            FROM raw_table AS r
            JOIN (
                SELECT MAX(record_id) AS record_id
                FROM raw_table
                GROUP BY {0}
            ) AS s
            ON r.record_id = s.record_id
        '''.format(','.join(fields))
        keys = header
    else:
        keys = [h for h in header if h != 'record_id']
        sql = 'SELECT {0} from raw_table'.format(','.join(keys))
    curs.execute(sql)
    dd = {}
    for index, row in enumerate(curs.fetchall()):
        dd[index] = dict(zip(keys, row))
    return dd
def getDistinct(field, dump):
    """Return the distinct, non-empty values of ``field`` in raw_table.

    ``dump`` is a SQL script recreating ``raw_table`` (as produced by
    makeRawTable); it is replayed into a fresh in-memory database.
    """
    connection = sqlite3.connect(':memory:')
    cursor = connection.cursor()
    cursor.executescript(dump)
    cursor.execute('select distinct {0} from raw_table'.format(field))
    # Falsy values (empty strings, NULLs) are filtered out.
    return [row[0] for row in cursor.fetchall() if row[0]]
def prepareResults(clustered_dupes, fpath):
    """
    Prepare deduplicated file for writing to various formats with
    duplicates clustered.

    Reads '<fpath>-converted.csv', prepends a 'Group ID' column whose
    value is the dedupe cluster id (rows not in any cluster get a fresh
    singleton id), and returns (next_unused_group_id, clustered_rows)
    where clustered_rows is a list of OrderedDicts keyed by the header.
    """
    cluster_membership = {}
    # Start at -1 so an empty `clustered_dupes` yields
    # unique_record_id == 0 instead of crashing on None + 1.
    cluster_id = -1
    for cluster_id, cluster in enumerate(clustered_dupes):
        for record_id, score in zip(*cluster):
            cluster_membership[record_id] = cluster_id
    unique_record_id = cluster_id + 1
    clustered_rows = []
    with open('{0}-converted.csv'.format(fpath), 'rb') as f:
        reader = UnicodeCSVReader(f)
        heading_row = reader.next()
        heading_row.insert(0, 'Group ID')
        rows = []
        for row_id, row in enumerate(reader):
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]
            else:
                # Unclustered record: give it its own singleton group.
                cluster_id = unique_record_id
                unique_record_id += 1
            row.insert(0, cluster_id)
            rows.append(row)
        # Sort by Group ID so cluster members are adjacent; header first.
        rows = sorted(rows, key=itemgetter(0))
        rows.insert(0, heading_row)
        for row in rows:
            d = OrderedDict()
            for k, v in zip(heading_row, row):
                d[k] = v
            clustered_rows.append(d)
    return unique_record_id, clustered_rows
def prepareUniqueResults(clustered_dupes, fpath):
    """Return one representative row per dedupe cluster.

    Reads '<fpath>-converted.csv' and keeps the first row seen for each
    cluster plus every unclustered row.  The result is a list of
    OrderedDicts keyed by the CSV header (the header row itself is the
    first entry, mapped onto itself).
    """
    membership = {}
    for cluster_id, cluster in enumerate(clustered_dupes):
        for record_id, score in zip(*cluster):
            membership[record_id] = cluster_id
    unique_rows = []
    with open('{0}-converted.csv'.format(fpath), 'rb') as f:
        reader = UnicodeCSVReader(f)
        rows = [reader.next()]
        emitted = set()
        for row_id, row in enumerate(reader):
            cluster_id = membership.get(row_id)
            if cluster_id is None:
                # Not part of any cluster: always keep.
                rows.append(row)
            elif cluster_id not in emitted:
                # First row of this cluster wins.
                emitted.add(cluster_id)
                rows.append(row)
    header = rows[0]
    for row in rows:
        unique_rows.append(OrderedDict(zip(header, row)))
    return unique_rows
def writeCSV(unique_rows, clustered_rows, file_path):
    """Write unique and clustered result sets to CSV files.

    ``unique_rows`` and ``clustered_rows`` are lists of OrderedDicts
    whose first entry's keys supply the header.  Returns
    (deduped_path, deduped_unique_path).
    """
    u_path = '%s-deduped_unique.csv' % file_path
    d_path = '%s-deduped.csv' % file_path
    # `with` guarantees the handles close even if a write fails
    # (previously a failed write leaked the open file).
    with open(u_path, 'wb') as unique:
        writer = UnicodeCSVDictWriter(unique, unique_rows[0].keys())
        writer.writeheader()
        writer.writerows(unique_rows)
    with open(d_path, 'wb') as clusters:
        writer = UnicodeCSVDictWriter(clusters, clustered_rows[0].keys())
        writer.writeheader()
        writer.writerows(clustered_rows)
    return d_path, u_path
def iterExcel(rows):
    """Yield (row_index, col_index, value) triples for spreadsheet writes.

    Column order comes from the first row's keys; every row is assumed
    to share those keys.
    """
    columns = rows[0].keys()
    for row_idx, record in enumerate(rows):
        for col_idx, column in enumerate(columns):
            yield row_idx, col_idx, record[column]
def writeXLS(unique_rows, clustered_rows, file_path):
    """Write unique and clustered result sets as legacy .xls workbooks.

    Returns (deduped_path, deduped_unique_path).
    """
    u_path = '%s-deduped_unique.xls' % file_path
    d_path = '%s-deduped.xls' % file_path
    # Clustered results workbook.
    clustered_book = xlwt.Workbook(encoding='utf-8')
    clustered_sheet = clustered_book.add_sheet('Clustered Results')
    for row_idx, col_idx, value in iterExcel(clustered_rows):
        clustered_sheet.write(row_idx, col_idx, label=value)
    clustered_book.save(d_path)
    # Unique results workbook.
    unique_book = xlwt.Workbook(encoding='utf-8')
    unique_sheet = unique_book.add_sheet('Unique Results')
    for row_idx, col_idx, value in iterExcel(unique_rows):
        unique_sheet.write(row_idx, col_idx, label=value)
    unique_book.save(u_path)
    return d_path, u_path
def writeXLSX(unique_rows, clustered_rows, file_path):
    """Write unique and clustered result sets as .xlsx workbooks.

    Returns (deduped_path, deduped_unique_path).

    Bug fix: ``get_column_letter`` was used without ever being imported
    (only ``Workbook`` is imported at module level), so this function
    always raised NameError.  Import it locally, tolerating both the
    old (openpyxl.cell) and new (openpyxl.utils) locations.
    """
    try:
        from openpyxl.utils import get_column_letter
    except ImportError:
        # openpyxl < 2.x kept it in openpyxl.cell.
        from openpyxl.cell import get_column_letter
    u_path = '%s-deduped_unique.xlsx' % file_path
    d_path = '%s-deduped.xlsx' % file_path
    d_book = Workbook()
    d_ws = d_book.active
    d_ws.title = 'Clustered Results'
    for r, c, value in iterExcel(clustered_rows):
        col = get_column_letter(c + 1)
        d_ws.cell('%s%s' % (col, r + 1)).value = value
    d_book.save(filename=d_path)
    u_book = Workbook()
    u_ws = u_book.active
    u_ws.title = 'Unique Results'
    for r, c, value in iterExcel(unique_rows):
        col = get_column_letter(c + 1)
        u_ws.cell('%s%s' % (col, r + 1)).value = value
    u_book.save(filename=u_path)
    return d_path, u_path
@queuefunc
def dedupeit(field_defs,
    training_data,
    sample,
    dump,
    file_path,
    file_type):
    """Train a dedupe model and write deduplicated output files.

    Runs as a queued job (via @queuefunc).  Trains dedupe.Dedupe on
    ``sample`` with the given ``field_defs`` and labelled
    ``training_data``, persists the settings and training JSON next to
    ``file_path``, then matches the full data set rebuilt from ``dump``
    and writes results in ``file_type`` format.  Returns the dict from
    writeResults, extended with 'training' and 'settings' paths.
    """
    d = dedupe.Dedupe(field_defs, sample)
    # Feed the labelled pairs to the model as a JSON stream.
    td = StringIO(json.dumps(training_data, default=_to_json))
    d.readTraining(td)
    d.train()
    settings_path = '{0}-settings.dedupe'.format(file_path)
    training_path = '{0}-training.json'.format(file_path)
    # Persist model state so staticDedupeit can rerun without retraining.
    with open(settings_path, 'wb') as f:
        d.writeSettings(f)
    with open(training_path, 'wb') as f:
        d.writeTraining(f)
    dd = makeDataDict(dump)
    threshold = d.threshold(dd,recall_weight=2)
    clusters = d.match(dd, threshold)
    files = writeResults(clusters, dump, file_type, file_path)
    # NOTE(review): relpath is computed against __file__ (a file, not a
    # directory) — presumably what the web layer expects; confirm.
    files['training'] = os.path.relpath(training_path, __file__)
    files['settings'] = os.path.relpath(settings_path, __file__)
    return files
@queuefunc
def staticDedupeit(settings_path, dump, file_type, file_path, recall_weight):
    """Re-run deduplication using previously-saved settings.

    Runs as a queued job (via @queuefunc).  Loads a StaticDedupe model
    from ``settings_path``, matches the data set rebuilt from ``dump``
    at the given ``recall_weight``, and writes results in ``file_type``
    format.  Returns the files dict from writeResults.
    """
    # The settings handle was previously opened and never closed;
    # StaticDedupe reads it during construction, so close it right after.
    with open(settings_path, 'rb') as settings_file:
        d = dedupe.StaticDedupe(settings_file)
    dd = makeDataDict(dump)
    threshold = d.threshold(dd, recall_weight=float(recall_weight))
    clusters = d.match(dd, threshold)
    files = writeResults(clusters, dump, file_type, file_path)
    return files
def writeResults(clusters, dump, file_type, file_path):
    """Write clustered and unique results in the requested format.

    ``file_type`` must be 'csv', 'xls', or 'xlsx'.  Returns a dict with
    relative paths to the output files plus cluster and line counts.

    Raises ValueError for an unknown ``file_type`` — previously an
    unknown type fell through every branch and crashed later with
    UnboundLocalError on ``deduped``.
    """
    cluster_count, clustered_rows = prepareResults(clusters, file_path)
    unique_rows = prepareUniqueResults(clusters, file_path)
    if file_type == 'csv':
        deduped, deduped_unique = writeCSV(unique_rows, clustered_rows, file_path)
    elif file_type == 'xls':
        deduped, deduped_unique = writeXLS(unique_rows, clustered_rows, file_path)
    elif file_type == 'xlsx':
        deduped, deduped_unique = writeXLSX(unique_rows, clustered_rows, file_path)
    else:
        raise ValueError('Unsupported file type: {0}'.format(file_type))
    conn = sqlite3.connect(':memory:')
    curs = conn.cursor()
    curs.executescript(dump)
    curs.execute('select count(*) from raw_table')
    # Kept as a one-element list for backward compatibility with
    # existing consumers of the 'line_count' key.
    line_count = [i[0] for i in curs.fetchall()]
    files = {
        'deduped': os.path.relpath(deduped, __file__),
        'deduped_unique': os.path.relpath(deduped_unique, __file__),
        'cluster_count': cluster_count,
        'line_count': line_count,
    }
    return files