Example #1
def upload():
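    '''
    Handle an uploaded data file: convert it to CSV if necessary, store the
    slugified field names in the Flask session, create a DedupeSession row
    for the current user's group, write the raw CSV to /tmp and kick off the
    initializeSession background task.
    '''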
    session_id = unicode(uuid4())
    f = request.files['input_file']
    flask_session['session_name'] = f.filename
    file_type = f.filename.rsplit('.', 1)[1]
    u = StringIO(f.read())
    u.seek(0)
    if file_type != 'csv':  # pragma: no cover
        file_format = convert.guess_format(flask_session['session_name'])
        u = StringIO(convert.convert(u, file_format))
    fieldnames = [
        slugify(unicode(i)) for i in u.next().strip('\r\n').split(',')
    ]
    flask_session['fieldnames'] = fieldnames
    user_id = flask_session['user_id']
    user = db_session.query(User).get(user_id)
    group = user.groups[0]
    sess = DedupeSession(id=session_id,
                         name=request.form.get('name'),
                         description=request.form.get('description'),
                         filename=f.filename,
                         group=group,
                         status=STATUS_LIST[0]['machine_name'])
    db_session.add(sess)
    db_session.commit()
    u.seek(0)
    with open('/tmp/%s_raw.csv' % session_id, 'wb') as s:
        s.write(u.getvalue())
    del u
    initializeSession.delay(session_id)
    flask_session['session_id'] = session_id
    return jsonify(ready=True, session_id=session_id)
Example #2
def get_unmatched():
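    '''
    Return the next raw record that has no entry in the entity table, along
    with the number of records still awaiting review. When nothing is left to
    match, mark the session as 'canonical'.
    '''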
    resp = {
        'status': 'ok',
        'message': '',
        'object': {},
        'remaining': 0,
    }
    status_code = 200
    session_id = flask_session['session_id']

    dedupe_session = db_session.query(DedupeSession).get(session_id)
    resp['remaining'] = dedupe_session.review_count
    raw_fields = list(set([f['field'] for f in json.loads(dedupe_session.field_defs)]))
    raw_fields.append('record_id')
    fields = ', '.join(['r.{0}'.format(f) for f in raw_fields])
    sel = ''' 
      SELECT {0}
      FROM "raw_{1}" as r
      LEFT JOIN "entity_{1}" as e
        ON r.record_id = e.record_id
      WHERE e.record_id IS NULL
      LIMIT 1
    '''.format(fields, session_id)
    engine = db_session.bind
    with engine.begin() as conn:
        rows = [dict(zip(raw_fields, r)) for r in conn.execute(sel)]
    if not rows:
        dedupe_session.status = 'canonical'
        db_session.add(dedupe_session)
        db_session.commit()
    else:
        resp['object'] = rows[0]
    response = make_response(json.dumps(resp), status_code)
    response.headers['Content-Type'] = 'application/json'
    return response
Example #3
def delete_session():
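    '''
    Delete the current DedupeSession row and schedule asynchronous cleanup of
    all of its per-session working tables.
    '''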

    session_id = flask_session['session_id']
    data = db_session.query(DedupeSession).get(session_id)
    db_session.delete(data)
    db_session.commit()
    tables = [
        'entity_{0}',
        'entity_{0}_cr',
        'raw_{0}',
        'processed_{0}',
        'processed_{0}_cr',
        'block_{0}',
        'block_{0}_cr',
        'plural_block_{0}',
        'plural_block_{0}_cr',
        'cr_{0}',
        'covered_{0}',
        'covered_{0}_cr',
        'plural_key_{0}',
        'plural_key_{0}_cr',
        'small_cov_{0}',
        'small_cov_{0}_cr',
        'canon_{0}',
        'exact_match_{0}',
        'match_blocks_{0}',
    ]
    cleanupTables.delay(session_id, tables=tables)
    resp = make_response(json.dumps({
        'session_id': session_id,
        'status': 'ok'
    }))
    resp.headers['Content-Type'] = 'application/json'
    return resp
Example #4
def mark_pair():
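    '''
    Record the reviewer's yes/no/unsure/finish decision for the current
    training pair, persist the updated training data on the session and feed
    the labels to the active deduper. On 'finish', start the dedupeRaw task.
    '''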
    action = request.args['action']
    flask_session['last_interaction'] = datetime.now()
    counter = flask_session.get('counter')
    sess = db_session.query(DedupeSession).get(flask_session['session_id'])
    deduper = flask_session['deduper']

    # Attempt to cast the training input appropriately
    # TODO: Figure out LatLong type
    field_defs = json.loads(sess.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]
    current_pair = flask_session['current_pair']
    left, right = current_pair
    l_d = {}
    r_d = {}
    for k, v in left.items():
        if 'Price' in fds[k]:
            l_d[k] = float(v)
        else:
            l_d[k] = v
    for k, v in right.items():
        if 'Price' in fds[k]:
            r_d[k] = float(v)
        else:
            r_d[k] = v
    current_pair = [l_d, r_d]
    if sess.training_data:
        labels = json.loads(sess.training_data)
    else:
        labels = {'distinct': [], 'match': []}
    if action == 'yes':
        labels['match'].append(current_pair)
        counter['yes'] += 1
        flask_session['counter'] = counter
        resp = {'counter': counter}
    elif action == 'no':
        labels['distinct'].append(current_pair)
        counter['no'] += 1
        flask_session['counter'] = counter
        resp = {'counter': counter}
    elif action == 'finish':
        dedupeRaw.delay(flask_session['session_id'])
        resp = {'finished': True}
        flask_session['dedupe_start'] = time.time()
    else:
        counter['unsure'] += 1
        flask_session['counter'] = counter
        resp = {'counter': counter}
    sess.training_data = json.dumps(labels, default=_to_json)
    db_session.add(sess)
    db_session.commit()
    deduper.markPairs(labels)
    if resp.get('finished'):
        del flask_session['deduper']
    resp = make_response(json.dumps(resp))
    resp.headers['Content-Type'] = 'application/json'
    return resp
Example #5
def clear_error():
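    '''
    Mark the WorkTable error identified by the 'work_id' query argument as
    cleared.
    '''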
    work_id = request.args['work_id']
    work = db_session.query(WorkTable).get(work_id)
    work.cleared = True
    db_session.add(work)
    db_session.commit()
    response = make_response(json.dumps({'status': 'ok'}))
    response.headers['Content-Type'] = 'application/json'
    return response
Example #6
def getCluster(session_id, entity_pattern, raw_pattern):
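    '''
    Check the next cluster out of the session's review machine and return its
    entity_id, member records (ordered by confidence) and a match prediction,
    or (None, None, None) when the cluster has no records. Checked-out
    clusters expire after one minute.
    '''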
    ent_name = entity_pattern.format(session_id)
    raw_name = raw_pattern.format(session_id)
    sess = app_session.query(DedupeSession).get(session_id)
    app_session.refresh(sess)
    
    cluster_list = []
    prediction = None
    machine = cPickle.loads(sess.review_machine)
    entity_id = machine.get_next()
    sess.review_machine = cPickle.dumps(machine)
    app_session.add(sess)
    app_session.commit()
    engine = app_session.bind
    model_fields = list(set([f['field'] for f in json.loads(sess.field_defs)]))
    raw_cols = ', '.join(['r.{0}'.format(f) for f in model_fields])
    sel = text('''
        SELECT 
            e.confidence,
            {0},
            r.record_id
        FROM "{1}" AS r
        JOIN "{2}" as e 
            ON r.record_id = e.record_id
        WHERE e.entity_id = :entity_id
        ORDER BY e.confidence
        '''.format(raw_cols, raw_name, ent_name))
    records = list(engine.execute(sel, entity_id=entity_id))

    if records:
        raw_fields = ['confidence'] + model_fields + ['record_id']
        max_confidence = max([r['confidence'] for r in records])
        cluster_length = len(records)
        prediction = machine.predict([max_confidence, cluster_length])
        for thing in records:
            d = {}
            for k,v in zip(raw_fields, thing):
                d[k] = v

            # d['confidence'] = formatPercentage(d['confidence'])
            cluster_list.append(d)
        one_minute = datetime.now() + timedelta(minutes=1)
        upd = text(''' 
            UPDATE "{0}" SET
              checked_out = TRUE,
              checkout_expire = :one_minute
            WHERE entity_id = :entity_id
            '''.format(ent_name))
        with engine.begin() as c:
            c.execute(upd, entity_id=entity_id, one_minute=one_minute)
        return entity_id, cluster_list, prediction
    else:
        return None, None, None
Example #7
def add_user():
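    '''
    Admin view: create a new User from the submitted form, assign roles and
    groups, then redirect back to the user list.
    '''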
    form = AddUserForm()
    if form.validate_on_submit():
        user_info = {
            'name': form.name.data,
            'email': form.email.data,
            'password': form.password.data,
        }
        user = User(**user_info)
        db_session.add(user)
        db_session.commit()
        user.roles = form.roles.data
        user.groups = form.groups.data
        db_session.add(user)
        db_session.commit()
        flash('User %s added' % user.name)
        return redirect(url_for('admin.user_list'))
    return render_template('add_user.html', form=form)
Example #8
def select_field_types():
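    '''
    On POST, build the data model's field definitions from the submitted
    form, save them on the session and, if there are no outstanding errors,
    start the initializeModel task. On GET, render the field type form.
    '''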
    dedupe_session = db_session.query(DedupeSession).get(
        flask_session['session_id'])
    errors = db_session.query(WorkTable)\
            .filter(WorkTable.session_id == dedupe_session.id)\
            .filter(WorkTable.cleared == False)\
            .all()
    errors = [e.value for e in errors]
    field_list = flask_session['field_list']
    if request.method == 'POST':
        field_defs = []
        form = {}
        for k in request.form.keys():
            if k != 'csrf_token':
                form[k] = request.form.getlist(k)
        ftypes = sorted(form.items())
        for k, g in groupby(ftypes, key=lambda x: x[0].rsplit('_', 1)[0]):
            vals = list(g)
            has_missing = False
            for ftype, val in vals:
                if ftype == '{0}_missing'.format(k):
                    has_missing = True
            fs = []
            for field, val in vals:
                fs.extend([{'field': k, 'type': val[i]} \
                    for i in range(len(val)) if field.endswith('type')])
            for f in fs:
                if has_missing:
                    f.update({'has_missing': True})
            field_defs.extend(fs)
        dedupe_session = db_session.query(DedupeSession).get(
            flask_session['session_id'])
        dedupe_session.field_defs = json.dumps(field_defs)
        dedupe_session.status = 'model defined'
        db_session.add(dedupe_session)
        db_session.commit()
        if not errors:
            initializeModel.delay(dedupe_session.id)
        return redirect(url_for('trainer.training_run'))
    return render_template('dedupe_session/select_field_types.html',
                           field_list=field_list,
                           dedupe_session=dedupe_session,
                           errors=errors)
Example #9
def train():
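    '''
    API endpoint that accepts labelled matches for a record and appends them
    to the session's training data. At most one positive match is allowed.
    '''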
    try:
        post = json.loads(request.data)
    except ValueError:
        post = json.loads(request.form.keys()[0])
    r, status_code, sess = validate_post(post)
    # TODO: Check if model fields are present in matches
    if not post.get('matches'):
        r['status'] = 'error'
        r['message'] = 'List of matches is required'
        status_code = 400
    if r['status'] != 'error':
        api_key = post['api_key']
        session_id = post['session_id']
        obj = post['object']
        positive = []
        negative = []
        for match in post['matches']:
            if match['match'] == 1:
                positive.append(match)
            else:
                negative.append(match)
            for k,v in match.items():
                match[k] = preProcess(unicode(v))
            del match['match']
        if len(positive) > 1:
            r['status'] = 'error'
            r['message'] = ('A maximum of 1 matching record can be sent. '
                            'More indicates a non-canonical dataset')
            status_code = 400
        else:
            training_data = json.loads(sess.training_data)
            if positive:
                training_data['match'].append([positive[0],obj])
            for n in negative:
                training_data['distinct'].append([n,obj])
            sess.training_data = json.dumps(training_data)
            db_session.add(sess)
            db_session.commit()
    resp = make_response(json.dumps(r))
    resp.headers['Content-Type'] = 'application/json'
    return resp
Example #10
def delete_data_model():
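    '''
    Reset the session's data model: clear the field definitions, training
    data and sample, set the status back to 'dataset uploaded' and drop the
    derived per-session tables.
    '''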

    session_id = flask_session['session_id']
    dedupe_session = db_session.query(DedupeSession).get(session_id)
    dedupe_session.field_defs = None
    dedupe_session.training_data = None
    dedupe_session.sample = None
    dedupe_session.status = 'dataset uploaded'
    db_session.add(dedupe_session)
    db_session.commit()
    tables = [
        'entity_{0}',
        'block_{0}',
        'plural_block_{0}',
        'covered_{0}',
        'plural_key_{0}',
        'small_cov_{0}',
    ]
    engine = db_session.bind
    for table in tables:  # pragma: no cover
        try:
            data_table = Table(table.format(session_id),
                               Base.metadata,
                               autoload=True,
                               autoload_with=engine)
            data_table.drop(engine)
        except NoSuchTableError:
            pass
        except ProgrammingError:
            pass
    resp = {
        'status': 'ok',
        'message': 'Data model for session {0} deleted'.format(session_id)
    }
    status_code = 200

    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
Example #11
def mark_canon_cluster():
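    '''
    Apply a reviewer's decision for a canonical-record cluster: attach the
    matched records to the entity, delete the distinct ones and update the
    session's review machine and review count.
    '''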
    session_id = flask_session['session_id']

    if not request.args.get('entity_id'):
        resp = {
            'status': 'error',
            'message': '"entity_id" is a required parameter'
        }
        status_code = 400
    else:
        entity_id = request.args.get('entity_id')
        match_ids = request.args.get('match_ids')
        distinct_ids = request.args.get('distinct_ids')
        user = db_session.query(User).get(flask_session['api_key'])
        engine = db_session.bind
        if match_ids:
            match_ids = tuple([d for d in match_ids.split(',')])
            upd = text('''
                UPDATE "entity_{0}" SET 
                    entity_id = :entity_id,
                    clustered = :clustered,
                    checked_out = :checked_out,
                    last_update = :last_update,
                    reviewer = :user_name
                WHERE entity_id in (
                    SELECT record_id 
                        FROM "entity_{0}_cr"
                    WHERE entity_id = :entity_id
                        AND record_id IN :record_ids
                )
                '''.format(session_id))
            upd_cr = text(''' 
                UPDATE "entity_{0}_cr" SET
                    target_record_id = :entity_id,
                    clustered = :clustered,
                    checked_out = :checked_out,
                    last_update = :last_update,
                    reviewer = :user_name
                WHERE record_id IN :record_ids
            '''.format(session_id))
            last_update = datetime.now().replace(tzinfo=TIME_ZONE)
            with engine.begin() as c:
                c.execute(upd,
                          entity_id=entity_id,
                          last_update=last_update,
                          user_name=user.name,
                          record_ids=match_ids,
                          clustered=True,
                          checked_out=False)
                c.execute(upd_cr,
                          entity_id=entity_id,
                          last_update=last_update,
                          user_name=user.name,
                          record_ids=match_ids,
                          clustered=True,
                          checked_out=False)
        if distinct_ids:
            distinct_ids = tuple([d for d in distinct_ids.split(',')])
            delete = text(''' 
                DELETE FROM "entity_{0}_cr"
                WHERE entity_id = :entity_id
                    AND record_id IN :record_ids
            '''.format(session_id))
            with engine.begin() as c:
                c.execute(delete, entity_id=entity_id, record_ids=distinct_ids)
        dedupe_session = db_session.query(DedupeSession).get(session_id)
        machine = loads(dedupe_session.review_machine)
        if distinct_ids:
            machine.label(entity_id, 0)
        else:
            machine.label(entity_id, 1)
        dedupe_session.review_machine = dumps(machine)
        dedupe_session.review_count = dedupe_session.review_count - 1
        db_session.add(dedupe_session)
        db_session.commit()
        resp = {
            'session_id': session_id,
            'entity_id': entity_id,
            'match_ids': match_ids,
            'distinct_ids': distinct_ids,
            'status': 'ok',
            'message': ''
        }
        status_code = 200
    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
Example #12
def mark_cluster():
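    '''
    Apply a reviewer's decision for an entity cluster: re-assign the matched
    records to the entity, delete the distinct ones and update the session's
    review machine and review count.
    '''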
    resp = {'status': 'ok', 'message': ''}
    status_code = 200
    session_id = flask_session['session_id']

    dedupe_session = db_session.query(DedupeSession).get(session_id)
    user = db_session.query(User).get(flask_session['api_key'])
    engine = db_session.bind
    entity_table = Table('entity_{0}'.format(session_id),
                         Base.metadata,
                         autoload=True,
                         autoload_with=engine)
    # TODO: Return an error if these args are not present.
    entity_id = request.args.get('entity_id')
    match_ids = request.args.get('match_ids')
    distinct_ids = request.args.get('distinct_ids')
    training_data = json.loads(dedupe_session.training_data)
    if match_ids:
        match_ids = tuple([int(m) for m in match_ids.split(',')])
        upd_vals = {
            'entity_id': entity_id,
            'record_ids': match_ids,
            'user_name': user.name,
            'clustered': True,
            'match_type': 'clerical review',
            'last_update': datetime.now().replace(tzinfo=TIME_ZONE),
            'match_ids': match_ids,
        }
        upd = text(''' 
            UPDATE "entity_{0}" SET
              entity_id = :entity_id,
              reviewer = :user_name,
              clustered = :clustered,
              match_type = :match_type,
              last_update = :last_update
            WHERE entity_id = :entity_id
              AND record_id IN :match_ids
        '''.format(session_id))
        with engine.begin() as conn:
            conn.execute(upd, **upd_vals)
        update_existing = text('''
            UPDATE "entity_{0}" SET 
                entity_id = :entity_id, 
                clustered = :clustered,
                reviewer = :user_name,
                match_type = :match_type,
                last_update = :last_update
                FROM (
                    SELECT e.record_id 
                        FROM "entity_{0}" AS e 
                        JOIN (
                            SELECT record_id 
                                FROM "entity_{0}"
                                WHERE entity_id = :entity_id
                                    AND record_id IN :record_ids
                        ) AS s 
                        ON e.target_record_id = s.record_id
                ) AS subq 
            WHERE "entity_{0}".record_id = subq.record_id
            '''.format(dedupe_session.id))
        with engine.begin() as c:
            c.execute(update_existing, **upd_vals)
        # training_data['match'].extend(pairs)
    if distinct_ids:
        distinct_ids = tuple([int(d) for d in distinct_ids.split(',')])
        delete = entity_table.delete()\
            .where(entity_table.c.entity_id == entity_id)\
            .where(entity_table.c.record_id.in_(distinct_ids))
        with engine.begin() as c:
            c.execute(delete)
        #training_data['distinct'].append(pairs)

    machine = loads(dedupe_session.review_machine)
    if distinct_ids:
        machine.label(entity_id, 0)
    else:
        machine.label(entity_id, 1)
    dedupe_session.review_machine = dumps(machine)
    dedupe_session.review_count = dedupe_session.review_count - 1
    db_session.add(dedupe_session)
    db_session.commit()
    resp = {
        'session_id': session_id,
        'entity_id': entity_id,
        'match_ids': match_ids,
        'distinct_ids': distinct_ids,
        'status': 'ok',
        'message': ''
    }
    status_code = 200
    resp = make_response(json.dumps(resp), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
Example #13
def add_entity():
    ''' 
    Add an entry to the entity map. 
    POST data should be a string encoded JSON object which looks like:
    
    {
        "object": {
            "city":"Macon",
            "cont_name":"Kinght & Fisher, LLP",
            "zip":"31201",
            "firstname":null,
            "employer":null,
            "address":"350 Second St",
            "record_id":3,
            "type":"Monetary",
            "occupation":null
        },
        "api_key":"6bf73c41-404e-47ae-bc2d-051e935c298e",
        "match_id": 100,
    }

    The object key should contain a mapping of fields that are in the data
    model. If the record_id field is present, an attempt will be made to look
    up the record in the raw / processed table before making the entry. If
    match_id is present, the record will be added as a member of the entity
    referenced by the id.
    '''
    r = {
        'status': 'ok',
        'message': ""
    }
    status_code = 200
    session_id = flask_session['session_id']

    try:
        post = json.loads(request.data)
    except ValueError:
        r = {
            'status': 'error',
            'message': ''' 
                The content of your request should be a 
                string encoded JSON object.
            ''',
            'object': request.data,
        }
        resp = make_response(json.dumps(r), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp
    obj = post['object']
    record_id = obj.get('record_id')
    if record_id:
        del obj['record_id']
    match_id = json.loads(request.data).get('match_id')
    sess = db_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]
    if not set(fds.keys()) == set(obj.keys()):
        r['status'] = 'error'
        r['message'] = "The fields in the object do not match the fields in the model"
        status_code = 400
    else:
        engine = db_session.bind
        proc_table = Table('processed_{0}'.format(session_id), Base.metadata, 
            autoload=True, autoload_with=engine, keep_existing=True)
        row = db_session.query(proc_table)\
            .filter(proc_table.c.record_id == record_id)\
            .first()
        if not row: # pragma: no cover
            raw_table = Table('raw_{0}'.format(session_id), Base.metadata, 
                autoload=True, autoload_with=engine, keep_existing=True)
            proc_ins = 'INSERT INTO "processed_{0}" (SELECT record_id, '\
                .format(session_id)
            for idx, field in enumerate(fds.keys()):
                try:
                    field_types = fds[field]
                except KeyError:
                    field_types = ['String']
                # TODO: Need to figure out how to parse a LatLong field type
                if 'Price' in field_types:
                    col_def = 'COALESCE(CAST("{0}" AS DOUBLE PRECISION), 0.0) AS {0}'.format(field)
                else:
                    col_def = 'CAST(TRIM(COALESCE(LOWER("{0}"), \'\')) AS VARCHAR) AS {0}'.format(field)
                if idx < len(fds.keys()) - 1:
                    proc_ins += '{0}, '.format(col_def)
                else:
                    proc_ins += '{0} '.format(col_def)
            else:
                proc_ins += 'FROM "raw_{0}" WHERE record_id = :record_id)'\
                    .format(session_id)

            with engine.begin() as conn:
                # insert().returning() yields a result proxy; fetch the new id
                record_id = conn.execute(raw_table.insert()\
                    .returning(raw_table.c.record_id), **obj).first()[0]
                conn.execute(text(proc_ins), record_id=record_id)
        hash_me = ';'.join([preProcess(unicode(obj[i])) for i in fds.keys()])
        md5_hash = md5(unidecode(hash_me)).hexdigest()
        entity = {
            'entity_id': unicode(uuid4()),
            'record_id': record_id,
            'source_hash': md5_hash,
            'clustered': True,
            'checked_out': False,
        }
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata, 
            autoload=True, autoload_with=engine, keep_existing=True)
        if match_id:
            entity['target_record_id'] = match_id
            entity_id = db_session.query(entity_table.c.entity_id)\
                .filter(entity_table.c.record_id == match_id)\
                .first()
            entity['entity_id'] = entity_id.entity_id
        with engine.begin() as conn:
            conn.execute(entity_table.insert(), **entity)
        deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
        for k,v in obj.items():
            obj[k] = preProcess(unicode(v))
        block_keys = [{'record_id': b[1], 'block_key': b[0]} \
                for b in list(deduper.blocker([(record_id, obj)]))]
        with engine.begin() as conn:
            conn.execute(text(''' 
                INSERT INTO "match_blocks_{0}" (
                    block_key,
                    record_id
                ) VALUES (:block_key, :record_id)
            '''.format(sess.id)), *block_keys)
    if sess.review_count:
        sess.review_count = sess.review_count - 1
        db_session.add(sess)
        db_session.commit()
    resp = make_response(json.dumps(r), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
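Example #13's docstring describes the JSON payload that add_entity expects. Below is a minimal client-side sketch, assuming the view is mounted at a hypothetical /add-entity URL and that the requests library is available; the payload simply mirrors the documented example. Note that the view also reads flask_session['session_id'], so a real call would additionally need a valid session cookie, which is omitted here.

import json
import requests

# Hypothetical URL; the actual route decorator for add_entity is not shown above.
url = 'http://localhost:5000/add-entity'
payload = {
    'object': {
        'city': 'Macon',
        'cont_name': 'Kinght & Fisher, LLP',
        'zip': '31201',
        'firstname': None,
        'employer': None,
        'address': '350 Second St',
        'record_id': 3,
        'type': 'Monetary',
        'occupation': None
    },
    'api_key': '6bf73c41-404e-47ae-bc2d-051e935c298e',
    'match_id': 100
}
# The view reads request.data directly, so send the JSON string as the body.
resp = requests.post(url, data=json.dumps(payload))
print(resp.json())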