def upload():
    session_id = unicode(uuid4())
    f = request.files['input_file']
    flask_session['session_name'] = f.filename
    # Split on the last dot so names like "data.2014.csv" still yield "csv"
    file_type = f.filename.rsplit('.', 1)[1]
    u = StringIO(f.read())
    u.seek(0)
    if file_type != 'csv': # pragma: no cover
        file_format = convert.guess_format(flask_session['session_name'])
        u = StringIO(convert.convert(u, file_format))
    fieldnames = [slugify(unicode(i)) for i in u.next().strip('\r\n').split(',')]
    flask_session['fieldnames'] = fieldnames
    user_id = flask_session['user_id']
    user = db_session.query(User).get(user_id)
    group = user.groups[0]
    sess = DedupeSession(
        id=session_id,
        name=request.form.get('name'),
        description=request.form.get('description'),
        filename=f.filename,
        group=group,
        status=STATUS_LIST[0]['machine_name'])
    db_session.add(sess)
    db_session.commit()
    u.seek(0)
    with open('/tmp/%s_raw.csv' % session_id, 'wb') as s:
        s.write(u.getvalue())
    del u
    initializeSession.delay(session_id)
    flask_session['session_id'] = session_id
    return jsonify(ready=True, session_id=session_id)

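# Illustrative client call for the view above. The route and port are
# assumptions, not taken from this file; only the form/file field names and
# the response shape come from the code:
#
#   import requests
#   files = {'input_file': open('contributions.csv', 'rb')}
#   data = {'name': 'Contributions', 'description': 'Campaign contributions'}
#   rv = requests.post('http://localhost:5000/upload/', files=files, data=data)
#   rv.json()  # {"ready": true, "session_id": "<uuid>"}
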
def get_unmatched():
    resp = {
        'status': 'ok',
        'message': '',
        'object': {},
        'remaining': 0,
    }
    status_code = 200
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    resp['remaining'] = dedupe_session.review_count
    raw_fields = list(set([f['field'] for f in json.loads(dedupe_session.field_defs)]))
    raw_fields.append('record_id')
    fields = ', '.join(['r.{0}'.format(f) for f in raw_fields])
    sel = '''
        SELECT {0}
        FROM "raw_{1}" AS r
        LEFT JOIN "entity_{1}" AS e
          ON r.record_id = e.record_id
        WHERE e.record_id IS NULL
        LIMIT 1
    '''.format(fields, session_id)
    engine = db_session.bind
    with engine.begin() as conn:
        rows = [dict(zip(raw_fields, r)) for r in conn.execute(sel)]
    if not rows:
        dedupe_session.status = 'canonical'
        db_session.add(dedupe_session)
        db_session.commit()
    else:
        resp['object'] = rows[0]
    response = make_response(json.dumps(resp), status_code)
    response.headers['Content-Type'] = 'application/json'
    return response

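# A successful response from get_unmatched looks roughly like the following.
# The keys inside "object" depend on the session's data model; "address" here
# is hypothetical:
#
#   {
#       "status": "ok",
#       "message": "",
#       "remaining": 12,
#       "object": {"address": "350 second st", "record_id": 42}
#   }
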
def delete_session():
    session_id = flask_session['session_id']
    data = db_session.query(DedupeSession).get(session_id)
    db_session.delete(data)
    db_session.commit()
    tables = [
        'entity_{0}',
        'entity_{0}_cr',
        'raw_{0}',
        'processed_{0}',
        'processed_{0}_cr',
        'block_{0}',
        'block_{0}_cr',
        'plural_block_{0}',
        'plural_block_{0}_cr',
        'cr_{0}',
        'covered_{0}',
        'covered_{0}_cr',
        'plural_key_{0}',
        'plural_key_{0}_cr',
        'small_cov_{0}',
        'small_cov_{0}_cr',
        'canon_{0}',
        'exact_match_{0}',
        'match_blocks_{0}',
    ]
    cleanupTables.delay(session_id, tables=tables)
    resp = make_response(json.dumps({
        'session_id': session_id,
        'status': 'ok'
    }))
    resp.headers['Content-Type'] = 'application/json'
    return resp

def mark_pair():
    action = request.args['action']
    flask_session['last_interaction'] = datetime.now()
    counter = flask_session.get('counter')
    sess = db_session.query(DedupeSession).get(flask_session['session_id'])
    deduper = flask_session['deduper']

    # Attempt to cast the training input appropriately
    # TODO: Figure out LatLong type
    field_defs = json.loads(sess.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]
    current_pair = flask_session['current_pair']
    left, right = current_pair
    l_d = {}
    r_d = {}
    for k, v in left.items():
        if 'Price' in fds[k]:
            l_d[k] = float(v)
        else:
            l_d[k] = v
    for k, v in right.items():
        if 'Price' in fds[k]:
            r_d[k] = float(v)
        else:
            r_d[k] = v
    current_pair = [l_d, r_d]

    if sess.training_data:
        labels = json.loads(sess.training_data)
    else:
        labels = {'distinct': [], 'match': []}
    if action == 'yes':
        labels['match'].append(current_pair)
        counter['yes'] += 1
        resp = {'counter': counter}
    elif action == 'no':
        labels['distinct'].append(current_pair)
        counter['no'] += 1
        resp = {'counter': counter}
    elif action == 'finish':
        dedupeRaw.delay(flask_session['session_id'])
        resp = {'finished': True}
        flask_session['dedupe_start'] = time.time()
    else:
        counter['unsure'] += 1
        resp = {'counter': counter}
    # Persist the updated counter for every action, not only 'unsure'
    flask_session['counter'] = counter
    sess.training_data = json.dumps(labels, default=_to_json)
    db_session.add(sess)
    db_session.commit()
    deduper.markPairs(labels)
    if resp.get('finished'):
        del flask_session['deduper']
    resp = make_response(json.dumps(resp))
    resp.headers['Content-Type'] = 'application/json'
    return resp

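# mark_pair serializes the labeled pairs with json.dumps(labels,
# default=_to_json). _to_json is defined elsewhere; a minimal sketch, assuming
# it only needs to coerce values json can't serialize natively (e.g. tuples
# coming back from dedupe), might look like:
#
#   def _to_json(obj):
#       if isinstance(obj, tuple):
#           return list(obj)
#       return unicode(obj)
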
def clear_error():
    work_id = request.args['work_id']
    work = db_session.query(WorkTable).get(work_id)
    work.cleared = True
    db_session.add(work)
    db_session.commit()
    response = make_response(json.dumps({'status': 'ok'}))
    response.headers['Content-Type'] = 'application/json'
    return response

def getCluster(session_id, entity_pattern, raw_pattern):
    ent_name = entity_pattern.format(session_id)
    raw_name = raw_pattern.format(session_id)
    sess = app_session.query(DedupeSession).get(session_id)
    app_session.refresh(sess)
    cluster_list = []
    prediction = None
    machine = cPickle.loads(sess.review_machine)
    entity_id = machine.get_next()
    sess.review_machine = cPickle.dumps(machine)
    app_session.add(sess)
    app_session.commit()
    engine = app_session.bind
    model_fields = list(set([f['field'] for f in json.loads(sess.field_defs)]))
    raw_cols = ', '.join(['r.{0}'.format(f) for f in model_fields])
    sel = text('''
        SELECT e.confidence, {0}, r.record_id
        FROM "{1}" AS r
        JOIN "{2}" AS e
          ON r.record_id = e.record_id
        WHERE e.entity_id = :entity_id
        ORDER BY e.confidence
    '''.format(raw_cols, raw_name, ent_name))
    records = list(engine.execute(sel, entity_id=entity_id))
    if records:
        raw_fields = ['confidence'] + model_fields + ['record_id']
        max_confidence = max([r['confidence'] for r in records])
        cluster_length = len(records)
        prediction = machine.predict([max_confidence, cluster_length])
        for thing in records:
            d = {}
            for k, v in zip(raw_fields, thing):
                d[k] = v
            # d['confidence'] = formatPercentage(d['confidence'])
            cluster_list.append(d)
        one_minute = datetime.now() + timedelta(minutes=1)
        upd = text('''
            UPDATE "{0}"
            SET checked_out = TRUE,
                checkout_expire = :one_minute
            WHERE entity_id = :entity_id
        '''.format(ent_name))
        with engine.begin() as c:
            c.execute(upd, entity_id=entity_id, one_minute=one_minute)
        return entity_id, cluster_list, prediction
    else:
        return None, None, None

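# getCluster is parameterized on table-name patterns so the same checkout
# logic can serve more than one review flow. Judging from the patterns used
# elsewhere in this file, callers likely look like:
#
#   entity_id, cluster, prediction = getCluster(session_id, 'entity_{0}', 'raw_{0}')
#   entity_id, cluster, prediction = getCluster(session_id, 'entity_{0}_cr', 'cr_{0}')
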
def add_user():
    form = AddUserForm()
    if form.validate_on_submit():
        user_info = {
            'name': form.name.data,
            'email': form.email.data,
            'password': form.password.data,
        }
        user = User(**user_info)
        db_session.add(user)
        db_session.commit()
        user.roles = form.roles.data
        user.groups = form.groups.data
        db_session.add(user)
        db_session.commit()
        flash('User %s added' % user.name)
        return redirect(url_for('admin.user_list'))
    return render_template('add_user.html', form=form)

def select_field_types():
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])
    errors = db_session.query(WorkTable)\
        .filter(WorkTable.session_id == dedupe_session.id)\
        .filter(WorkTable.cleared == False)\
        .all()
    errors = [e.value for e in errors]
    field_list = flask_session['field_list']
    if request.method == 'POST':
        field_defs = []
        form = {}
        for k in request.form.keys():
            if k != 'csrf_token':
                form[k] = request.form.getlist(k)
        ftypes = sorted(form.items())
        for k, g in groupby(ftypes, key=lambda x: x[0].rsplit('_', 1)[0]):
            vals = list(g)
            has_missing = False
            for ftype, val in vals:
                if ftype == '{0}_missing'.format(k):
                    has_missing = True
            fs = []
            for field, val in vals:
                fs.extend([{'field': k, 'type': val[i]}
                           for i in range(len(val)) if field.endswith('type')])
            for f in fs:
                if has_missing:
                    f.update({'has_missing': True})
            field_defs.extend(fs)
        dedupe_session = db_session.query(DedupeSession).get(
            flask_session['session_id'])
        dedupe_session.field_defs = json.dumps(field_defs)
        dedupe_session.status = 'model defined'
        db_session.add(dedupe_session)
        db_session.commit()
        if not errors:
            initializeModel.delay(dedupe_session.id)
            return redirect(url_for('trainer.training_run'))
    return render_template('dedupe_session/select_field_types.html',
                           field_list=field_list,
                           dedupe_session=dedupe_session,
                           errors=errors)

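# The grouping logic above keys form fields by a "<field>_<suffix>" naming
# convention. For an illustrative submission of:
#
#   address_type = ['Address', 'String']
#   address_missing = ['on']
#
# the loop produces field_defs entries like:
#
#   [{'field': 'address', 'type': 'Address', 'has_missing': True},
#    {'field': 'address', 'type': 'String', 'has_missing': True}]
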
def train():
    try:
        post = json.loads(request.data)
    except ValueError:
        post = json.loads(request.form.keys()[0])
    r, status_code, sess = validate_post(post)
    # TODO: Check if model fields are present in matches
    if not post.get('matches'):
        r['status'] = 'error'
        r['message'] = 'List of matches is required'
        status_code = 400
    if r['status'] != 'error':
        api_key = post['api_key']
        session_id = post['session_id']
        obj = post['object']
        positive = []
        negative = []
        for match in post['matches']:
            # Compare with ==, not "is": small-int identity is an
            # implementation detail
            if match['match'] == 1:
                positive.append(match)
            else:
                negative.append(match)
            for k, v in match.items():
                match[k] = preProcess(unicode(v))
            del match['match']
        if len(positive) > 1:
            r['status'] = 'error'
            r['message'] = 'A maximum of 1 matching record can be sent. ' \
                'More indicates a non-canonical dataset'
            status_code = 400
        else:
            training_data = json.loads(sess.training_data)
            if positive:
                training_data['match'].append([positive[0], obj])
            for n in negative:
                training_data['distinct'].append([n, obj])
            sess.training_data = json.dumps(training_data)
            db_session.add(sess)
            db_session.commit()
    # Return the computed status code so error responses aren't sent as 200
    resp = make_response(json.dumps(r), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp

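# An illustrative payload for train. The field names inside "object" and
# "matches" are hypothetical; only the top-level keys and the 0/1 "match"
# flag come from the code above:
#
#   {
#       "api_key": "6bf73c41-404e-47ae-bc2d-051e935c298e",
#       "session_id": "<session uuid>",
#       "object": {"name": "knight & fisher, llp"},
#       "matches": [
#           {"name": "Knight & Fisher", "match": 1},
#           {"name": "Fisher Bakery", "match": 0}
#       ]
#   }
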
def delete_data_model():
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    dedupe_session.field_defs = None
    dedupe_session.training_data = None
    dedupe_session.sample = None
    dedupe_session.status = 'dataset uploaded'
    db_session.add(dedupe_session)
    db_session.commit()
    tables = [
        'entity_{0}',
        'block_{0}',
        'plural_block_{0}',
        'covered_{0}',
        'plural_key_{0}',
        'small_cov_{0}',
    ]
    engine = db_session.bind
    for table in tables: # pragma: no cover
        try:
            data_table = Table(table.format(session_id),
                               Base.metadata,
                               autoload=True,
                               autoload_with=engine)
            data_table.drop(engine)
        except NoSuchTableError:
            pass
        except ProgrammingError:
            pass
    resp = {
        'status': 'ok',
        'message': 'Data model for session {0} deleted'.format(session_id)
    }
    status_code = 200
    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp

def mark_canon_cluster():
    session_id = flask_session['session_id']
    if not request.args.get('entity_id'):
        resp = {
            'status': 'error',
            'message': '"entity_id" is a required parameter'
        }
        status_code = 400
    else:
        entity_id = request.args.get('entity_id')
        match_ids = request.args.get('match_ids')
        distinct_ids = request.args.get('distinct_ids')
        user = db_session.query(User).get(flask_session['api_key'])
        engine = db_session.bind
        if match_ids:
            match_ids = tuple([d for d in match_ids.split(',')])
            upd = text('''
                UPDATE "entity_{0}"
                SET entity_id = :entity_id,
                    clustered = :clustered,
                    checked_out = :checked_out,
                    last_update = :last_update,
                    reviewer = :user_name
                WHERE entity_id IN (
                    SELECT record_id
                    FROM "entity_{0}_cr"
                    WHERE entity_id = :entity_id
                      AND record_id IN :record_ids
                )
            '''.format(session_id))
            upd_cr = text('''
                UPDATE "entity_{0}_cr"
                SET target_record_id = :entity_id,
                    clustered = :clustered,
                    checked_out = :checked_out,
                    last_update = :last_update,
                    reviewer = :user_name
                WHERE record_id IN :record_ids
            '''.format(session_id))
            last_update = datetime.now().replace(tzinfo=TIME_ZONE)
            with engine.begin() as c:
                c.execute(upd,
                          entity_id=entity_id,
                          last_update=last_update,
                          user_name=user.name,
                          record_ids=match_ids,
                          clustered=True,
                          checked_out=False)
                c.execute(upd_cr,
                          entity_id=entity_id,
                          last_update=last_update,
                          user_name=user.name,
                          record_ids=match_ids,
                          clustered=True,
                          checked_out=False)
        if distinct_ids:
            distinct_ids = tuple([d for d in distinct_ids.split(',')])
            delete = text('''
                DELETE FROM "entity_{0}_cr"
                WHERE entity_id = :entity_id
                  AND record_id IN :record_ids
            '''.format(session_id))
            with engine.begin() as c:
                c.execute(delete, entity_id=entity_id, record_ids=distinct_ids)
        dedupe_session = db_session.query(DedupeSession).get(session_id)
        machine = loads(dedupe_session.review_machine)
        if distinct_ids:
            machine.label(entity_id, 0)
        else:
            machine.label(entity_id, 1)
        dedupe_session.review_machine = dumps(machine)
        dedupe_session.review_count = dedupe_session.review_count - 1
        db_session.add(dedupe_session)
        db_session.commit()
        resp = {
            'session_id': session_id,
            'entity_id': entity_id,
            'match_ids': match_ids,
            'distinct_ids': distinct_ids,
            'status': 'ok',
            'message': ''
        }
        status_code = 200
    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp

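# mark_canon_cluster is driven entirely by query-string arguments, e.g.
# (the URL path here is an assumption; the parameter names are not):
#
#   /mark-canon-cluster?entity_id=<uuid>&match_ids=1,2,3&distinct_ids=4,5
#
# Records in match_ids are folded into the entity; records in distinct_ids
# are removed from the canonical review table, and the review machine is
# labeled 0 or 1 accordingly.
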
def mark_cluster():
    resp = {'status': 'ok', 'message': ''}
    status_code = 200
    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    user = db_session.query(User).get(flask_session['api_key'])
    engine = db_session.bind
    entity_table = Table('entity_{0}'.format(session_id),
                         Base.metadata,
                         autoload=True,
                         autoload_with=engine)
    # TODO: Return an error if these args are not present.
    entity_id = request.args.get('entity_id')
    match_ids = request.args.get('match_ids')
    distinct_ids = request.args.get('distinct_ids')
    training_data = json.loads(dedupe_session.training_data)
    if match_ids:
        match_ids = tuple([int(m) for m in match_ids.split(',')])
        upd_vals = {
            'entity_id': entity_id,
            'record_ids': match_ids,
            'user_name': user.name,
            'clustered': True,
            'match_type': 'clerical review',
            'last_update': datetime.now().replace(tzinfo=TIME_ZONE),
            'match_ids': match_ids,
        }
        upd = text('''
            UPDATE "entity_{0}"
            SET entity_id = :entity_id,
                reviewer = :user_name,
                clustered = :clustered,
                match_type = :match_type,
                last_update = :last_update
            WHERE entity_id = :entity_id
              AND record_id IN :match_ids
        '''.format(session_id))
        with engine.begin() as conn:
            conn.execute(upd, **upd_vals)
        update_existing = text('''
            UPDATE "entity_{0}"
            SET entity_id = :entity_id,
                clustered = :clustered,
                reviewer = :user_name,
                match_type = :match_type,
                last_update = :last_update
            FROM (
                SELECT e.record_id
                FROM "entity_{0}" AS e
                JOIN (
                    SELECT record_id
                    FROM "entity_{0}"
                    WHERE entity_id = :entity_id
                      AND record_id IN :record_ids
                ) AS s
                  ON e.target_record_id = s.record_id
            ) AS subq
            WHERE "entity_{0}".record_id = subq.record_id
        '''.format(dedupe_session.id))
        with engine.begin() as c:
            c.execute(update_existing, **upd_vals)
        # training_data['match'].extend(pairs)
    if distinct_ids:
        distinct_ids = tuple([int(d) for d in distinct_ids.split(',')])
        delete = entity_table.delete()\
            .where(entity_table.c.entity_id == entity_id)\
            .where(entity_table.c.record_id.in_(distinct_ids))
        with engine.begin() as c:
            c.execute(delete)
        # training_data['distinct'].append(pairs)
    machine = loads(dedupe_session.review_machine)
    if distinct_ids:
        machine.label(entity_id, 0)
    else:
        machine.label(entity_id, 1)
    dedupe_session.review_machine = dumps(machine)
    dedupe_session.review_count = dedupe_session.review_count - 1
    db_session.add(dedupe_session)
    db_session.commit()
    resp = {
        'session_id': session_id,
        'entity_id': entity_id,
        'match_ids': match_ids,
        'distinct_ids': distinct_ids,
        'status': 'ok',
        'message': ''
    }
    status_code = 200
    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp

def add_entity():
    '''
    Add an entry to the entity map.
    POST data should be a string encoded JSON object which looks like:

    {
        "object": {
            "city": "Macon",
            "cont_name": "Kinght & Fisher, LLP",
            "zip": "31201",
            "firstname": null,
            "employer": null,
            "address": "350 Second St",
            "record_id": 3,
            "type": "Monetary",
            "occupation": null
        },
        "api_key": "6bf73c41-404e-47ae-bc2d-051e935c298e",
        "match_id": 100,
    }

    The object key should contain a mapping of fields that are in the data
    model. If the record_id field is present, an attempt will be made to look
    up the record in the raw / processed table before making the entry. If
    match_id is present, the record will be added as a member of the entity
    referenced by the id.
    '''
    r = {
        'status': 'ok',
        'message': ""
    }
    status_code = 200
    session_id = flask_session['session_id']
    try:
        post = json.loads(request.data)
    except ValueError:
        r = {
            'status': 'error',
            'message': 'The content of your request should be a string encoded JSON object.',
            'object': request.data,
        }
        resp = make_response(json.dumps(r), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp
    obj = post['object']
    record_id = obj.get('record_id')
    if record_id:
        del obj['record_id']
    match_id = json.loads(request.data).get('match_id')
    sess = db_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]
    if not set(fds.keys()) == set(obj.keys()):
        r['status'] = 'error'
        r['message'] = 'The fields in the object do not match the fields in the model'
        status_code = 400
    else:
        engine = db_session.bind
        proc_table = Table('processed_{0}'.format(session_id),
                           Base.metadata,
                           autoload=True,
                           autoload_with=engine,
                           keep_existing=True)
        row = db_session.query(proc_table)\
            .filter(proc_table.c.record_id == record_id)\
            .first()
        if not row: # pragma: no cover
            raw_table = Table('raw_{0}'.format(session_id),
                              Base.metadata,
                              autoload=True,
                              autoload_with=engine,
                              keep_existing=True)
            # "proc_table_name" was undefined here; the processed table is
            # keyed off the session id like every other session table
            proc_ins = 'INSERT INTO "processed_{0}" (SELECT record_id, '\
                .format(session_id)
            for idx, field in enumerate(fds.keys()):
                try:
                    field_types = fds[field]
                except KeyError:
                    field_types = ['String']
                # TODO: Need to figure out how to parse a LatLong field type
                if 'Price' in field_types:
                    col_def = 'COALESCE(CAST("{0}" AS DOUBLE PRECISION), 0.0) AS {0}'.format(field)
                else:
                    col_def = 'CAST(TRIM(COALESCE(LOWER("{0}"), \'\')) AS VARCHAR) AS {0}'.format(field)
                if idx < len(fds.keys()) - 1:
                    proc_ins += '{0}, '.format(col_def)
                else:
                    proc_ins += '{0} '.format(col_def)
            proc_ins += 'FROM "raw_{0}" WHERE record_id = :record_id)'\
                .format(session_id)
            with engine.begin() as conn:
                # insert().returning() yields a result set; fetch the
                # generated record_id rather than keeping the raw result
                record_id = conn.execute(raw_table.insert()\
                    .returning(raw_table.c.record_id), **obj)\
                    .first().record_id
                conn.execute(text(proc_ins), record_id=record_id)
        hash_me = ';'.join([preProcess(unicode(obj[i])) for i in fds.keys()])
        md5_hash = md5(unidecode(hash_me)).hexdigest()
        entity = {
            'entity_id': unicode(uuid4()),
            'record_id': record_id,
            'source_hash': md5_hash,
            'clustered': True,
            'checked_out': False,
        }
        entity_table = Table('entity_{0}'.format(session_id),
                             Base.metadata,
                             autoload=True,
                             autoload_with=engine,
                             keep_existing=True)
        if match_id:
            entity['target_record_id'] = match_id
            entity_id = db_session.query(entity_table.c.entity_id)\
                .filter(entity_table.c.record_id == match_id)\
                .first()
            entity['entity_id'] = entity_id.entity_id
        with engine.begin() as conn:
            conn.execute(entity_table.insert(), **entity)
        deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
        for k, v in obj.items():
            obj[k] = preProcess(unicode(v))
        block_keys = [{'record_id': b[1], 'block_key': b[0]}
                      for b in list(deduper.blocker([(record_id, obj)]))]
        with engine.begin() as conn:
            conn.execute(text('''
                INSERT INTO "match_blocks_{0}" (
                    block_key,
                    record_id
                ) VALUES (:block_key, :record_id)
            '''.format(sess.id)), *block_keys)
        if sess.review_count:
            sess.review_count = sess.review_count - 1
            db_session.add(sess)
            db_session.commit()
    resp = make_response(json.dumps(r), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp

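# For a data model with a 'name' (String) and an 'amount' (Price) field, the
# proc_ins statement built above works out to roughly the following (column
# order follows dict iteration order; the field names are illustrative):
#
#   INSERT INTO "processed_<session_id>" (
#       SELECT record_id,
#              CAST(TRIM(COALESCE(LOWER("name"), '')) AS VARCHAR) AS name,
#              COALESCE(CAST("amount" AS DOUBLE PRECISION), 0.0) AS amount
#       FROM "raw_<session_id>" WHERE record_id = :record_id)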