Example #1
0
def upload_file():
    """Store a single uploaded file and queue it for asynchronous validation.

    Reads ``jurisdiction`` and ``eventType`` from the query string and checks
    them with ``can_upload_file``. When allowed, the one uploaded file is
    saved under ``<cwd>/tmp`` and a ``validate_async`` job is enqueued for it.

    Returns:
        A JSON response whose ``status`` is ``'not authorized'`` when the
        permission check fails, ``'error'`` when the request does not carry
        exactly one usable file, or ``'validating'`` (together with
        ``jobKey``) once the job has been enqueued.
    """
    jurisdiction = request.args.get('jurisdiction')
    event_type = request.args.get('eventType')
    if not can_upload_file(jurisdiction, event_type):
        return jsonify(status='not authorized', exampleRows=[])

    filenames = list(request.files.keys())
    if len(filenames) != 1:
        return jsonify(
            status='error',
            message='Exactly one file must be uploaded at a time')

    uploaded_file = request.files[filenames[0]]
    filename = secure_filename(uploaded_file.filename or '')
    if not filename:
        # secure_filename() may strip every character (e.g. for '../' or an
        # empty name); joining '' onto tmp_dir would yield the directory
        # itself and make save() fail.
        return jsonify(status='error',
                       message='Uploaded file must have a valid filename')

    tmp_dir = os.path.join(os.getcwd(), 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)
    full_filename = os.path.join(tmp_dir, filename)
    uploaded_file.save(full_filename)

    upload_id = unique_upload_id()
    q = get_q(get_redis_connection())
    # The job receives the original (unsanitised) filename alongside the
    # on-disk path; the job meta carries the sanitised one.
    job = q.enqueue_call(func=validate_async,
                         args=(uploaded_file.filename, jurisdiction,
                               full_filename, event_type, current_user.id,
                               upload_id),
                         result_ttl=5000,
                         timeout=3600,
                         meta={
                             'event_type': event_type,
                             'filename': filename,
                             'upload_id': upload_id
                         })
    logger.info('Job id %s', job.get_id())
    return jsonify(status='validating',
                   jobKey=job.get_id(),
                   message='Validating data!')
Example #2
0
def can_access_file(upload_id):
    """Return whether the current user may work with the given upload.

    Looks up the ``Upload`` row for ``upload_id`` and delegates the decision
    to ``can_upload_file`` using that row's jurisdiction and event type slugs.

    Args:
        upload_id: Primary key of the ``Upload`` row to check.

    Returns:
        Whatever ``can_upload_file`` returns for the upload's jurisdiction
        and event type.

    Raises:
        ValueError: If no ``Upload`` row exists for ``upload_id``.
    """
    upload = db_session.query(Upload).get(upload_id)
    if not upload:
        # Exceptions do not interpolate %-style arguments the way logging
        # does, so the message has to be formatted before it is raised.
        raise ValueError(
            f'upload_id: {upload_id} not present in metadata database')
    logger.info('Found jurisdiction %s and event type %s for upload id %s',
                upload.jurisdiction_slug, upload.event_type_slug, upload_id)
    return can_upload_file(upload.jurisdiction_slug, upload.event_type_slug)
Example #3
0
def get_last_upload_date():
    """Return the most recent upload date as a ``YYYY-MM-DD`` JSON response.

    Expects ``query.last_upload_date()`` to yield exactly one row with an
    ``upload_start_time`` value that supports ``strftime``.

    Returns:
        ``jsonify(results=<date string>)`` on success, otherwise
        ``jsonify("no valid upload date")``.
    """
    last_upload = query.last_upload_date()
    try:
        # Explicit check instead of ``assert`` so the validation still runs
        # when Python is started with -O.
        if len(last_upload) != 1:
            logger.info('Expected exactly one last-upload row, found %s',
                        len(last_upload))
            return jsonify("no valid upload date")
        last_upload_date = last_upload[0]['upload_start_time']
        logger.info(type(last_upload_date))
        formatted_date = last_upload_date.strftime('%Y-%m-%d')
    except Exception:
        # Best effort: any malformed result falls back to the placeholder
        # response, but SystemExit/KeyboardInterrupt are no longer swallowed.
        logger.exception('Could not determine last upload date')
        return jsonify("no valid upload date")
    return jsonify(results=formatted_date)
Example #4
0
def match_finished(matched_results_paths,
                   match_job_id,
                   match_start_at,
                   match_complete_at,
                   match_status,
                   match_runtime,
                   upload_id=None):
    """Record a finished match job and load its result files into the db.

    First writes one match-log entry from the ``match_*`` arguments, then
    walks ``matched_results_paths`` (event type -> file path), opens each
    path with ``open_sesame`` and hands the file handle to
    ``write_matches_to_db``. The jurisdiction is taken from the third-to-last
    ``/``-separated component of each path.

    Any exception is logged and swallowed; nothing is returned.
    """
    log_fields = dict(match_job_id=match_job_id,
                      match_start_at=match_start_at,
                      match_complete_at=match_complete_at,
                      match_status=match_status,
                      match_runtime=match_runtime,
                      upload_id=upload_id)
    try:
        logger.info('Writing to match log')
        write_match_log(db_session=db_session, **log_fields)

        logger.info('Writing matches to db')
        for event_type, results_path in matched_results_paths.items():
            path_parts = results_path.split('/')
            jurisdiction = path_parts[-3]
            logger.info(
                'Writing matches from event type %s and filename %s to db. Parsed jurisdiction %s out of filename',
                event_type, results_path, jurisdiction)
            with open_sesame(results_path, 'rb') as handle:
                write_matches_to_db(db_engine=engine,
                                    event_type=event_type,
                                    jurisdiction=jurisdiction,
                                    matches_filehandle=handle)
    except Exception as err:
        logger.error('Error encountered during match_finished: %s', str(err))
    finally:
        logger.info('All done!')
Example #5
0
def notify_matcher(jurisdiction, upload_id=None):
    """Enqueue a ``matcher.do_match`` job on the ``matching`` queue.

    The job is given the jurisdiction's data directory (the configured
    ``base_data_path`` template formatted with ``jurisdiction``), the schema
    primary-key lookup built from ``SCHEMA_DIRECTORY``, and ``upload_id``.
    ``upload_id`` is also stored in the job's meta.
    """
    schema_pk_lookup = list_all_schemas_primary_keys(SCHEMA_DIRECTORY)
    directory_to_pass = app_config['base_data_path'].format(
        jurisdiction=jurisdiction)

    matching_queue = Queue('matching',
                           connection=Redis(host='redis', port=6379))
    logger.info('Enqueueing do_match job')

    job_args = (directory_to_pass, schema_pk_lookup, upload_id)
    job_meta = {'upload_id': upload_id}
    job = matching_queue.enqueue(f="matcher.do_match",
                                 args=job_args,
                                 result_ttl=5000,
                                 timeout=100000,
                                 meta=job_meta)
    logger.info("Enqueued job %s", job)
Example #6
0
def get_records_by_time():
    """Return records for the requested time window as a JSON response.

    All filters come from the query string: ``startDate``, ``endDate``,
    ``jurisdiction``, ``limit`` (default 10), ``offset`` (default 0),
    ``orderColumn``, ``order`` and ``setStatus``. They are passed through
    unchanged to ``query.get_records_by_time``.
    """
    params = request.args
    start_date = params.get('startDate')
    end_date = params.get('endDate')
    logger.info(f'Pulling data from {start_date} to {end_date}')

    # Positional order must match query.get_records_by_time's signature.
    query_args = (
        start_date,
        end_date,
        params.get('jurisdiction'),
        params.get('limit', 10),
        params.get('offset', 0),
        params.get('orderColumn'),
        params.get('order'),
        params.get('setStatus'),
    )
    return jsonify(results=query.get_records_by_time(*query_args))
Example #7
0
def get_contact_dist(data, bins=None):
    """Build a histogram of how many rows each ``matched_id`` has.

    Args:
        data: DataFrame with a ``matched_id`` column.
        bins: Optional bin specification passed to ``np.histogram`` for the
            ids with more than one row. When ``None``, ``'auto'`` bins are
            computed, their edges rounded and de-duplicated, and reduced to
            4 bins if more than 4 edges remain.

    Returns:
        A ``(df_hist, groups)`` tuple. ``df_hist`` has a single ``contacts``
        column indexed by bucket label. ``groups`` is the array of bin edges
        used, or ``1`` in the two degenerate cases (every id has exactly one
        row, or all remaining ids share one count).
    """
    # ``.values`` replaces Series.as_matrix(), which was removed in pandas 1.0.
    data = data.groupby('matched_id').matched_id.count().values
    data = data.astype(int)
    one_contact = int(np.count_nonzero(data == 1))
    rest = data[data != 1]
    if one_contact == len(data):
        df_hist = pd.DataFrame({'contacts': [one_contact]},
                               index=['1 contact'])
        logger.info("all ones!")
        return df_hist, 1

    distinct_rest = np.unique(rest)
    if len(distinct_rest) == 1:
        df_hist = pd.DataFrame(
            {'contacts': [one_contact, len(rest)]},
            index=['1 contact', f"{distinct_rest[0]} contacts"])
        return df_hist, 1

    if bins is not None:
        num, groups = np.histogram(rest, bins)
    else:
        # Start from numpy's automatic binning, then snap the edges to
        # distinct rounded values.
        num, groups = np.histogram(rest, 'auto')
        num, groups = np.histogram(rest, np.unique(groups.round()))
        if len(groups) > 4:
            # Too many buckets: fall back to 4 bins, snapped the same way.
            bins = 4
            num, groups = np.histogram(rest, bins)
            num, groups = np.histogram(rest, np.unique(groups.round()))
    hist = [one_contact] + list(num)
    # The single-contact bucket comes first, followed by one left-closed
    # interval per consecutive pair of bin edges.
    index = [pd.Interval(1, 2, 'left')] + [
        pd.Interval(int(b[0]),
                    int(b[1]) + 1, 'left')
        for b in list(window(list(groups), 2))
    ]
    df_hist = pd.DataFrame({'contacts': hist},
                           index=contacts_interval_to_text(index))
    logger.info(num)
    logger.info(groups)
    logger.info(index)
    logger.info(df_hist)
    return df_hist, groups
Example #8
0
def merge_file():
    """Merge a validated upload's raw table into master and notify the matcher.

    Reads ``uploadId`` from the query string. After ``can_access_file``
    approves it, the ``raw_<uploadId>`` table is upserted into the master
    table, master tables are bootstrapped, the merged file is synced to
    storage and ``notify_matcher`` is called before the session is committed.

    Returns:
        A JSON response with ``status`` of ``'invalid'`` (no ``uploadId``),
        ``'not authorized'``, ``'success'`` (with the merge log's row
        counts), or a 500 response with ``status='error'`` when notifying
        the matcher fails or a ``ValueError`` is raised during the merge.
    """
    upload_id = request.args.get('uploadId', None)
    if not upload_id:
        return jsonify(status='invalid', reason='uploadId not present')
    try:
        if not can_access_file(upload_id):
            return jsonify(status='not authorized')

        upload_log = db_session.query(Upload).get(upload_id)
        logger.info('Retrieved upload log, merging raw table to master')
        raw_table_name = 'raw_{}'.format(upload_id)
        logger.info('Merging raw table to master')
        merge_id = upsert_raw_table_to_master(raw_table_name,
                                              upload_log.jurisdiction_slug,
                                              upload_log.event_type_slug,
                                              upload_id, db_session)
        logger.info('Syncing merged file to s3')

        bootstrap_master_tables(upload_log.jurisdiction_slug, db_session)

        sync_merged_file_to_storage(upload_log.jurisdiction_slug,
                                    upload_log.event_type_slug,
                                    db_session.get_bind())
        merge_log = db_session.query(MergeLog).get(merge_id)
        try:
            logger.info('Merge succeeded. Now querying matcher')
            notify_matcher(upload_log.jurisdiction_slug, upload_id)
        except Exception as e:
            # The placeholder is required: without it logging cannot
            # interpolate ``e`` and reports a formatting error instead.
            logger.error('Error matching: %s', e)
            db_session.rollback()
            return make_response(jsonify(status='error'), 500)
        db_session.commit()
        return jsonify(status='success',
                       new_unique_rows=merge_log.new_unique_rows,
                       total_unique_rows=merge_log.total_unique_rows)
    except ValueError as e:
        logger.error('Error merging: %s', e)
        db_session.rollback()
        return make_response(jsonify(status='error'), 500)
Example #9
0
def get_histogram_bar_chart_data(data, distribution_function, shared_ids,
                                 data_name):
    """Build paired percentage bars for a dataset and its shared-id subset.

    Args:
        data: DataFrame with a ``matched_id`` column.
        distribution_function: Callable invoked as ``f(data)`` and
            ``f(subset, groups)``; each call returns a
            ``(distribution, groups)`` pair.
        shared_ids: ``matched_id`` values that define the intersection subset.
        data_name: Label used as the ``x`` value for the full-dataset bars.

    Returns:
        ``[bins, labels]`` where ``bins`` is a list of
        ``(of_status, all_status)`` dict pairs, one per row of the full
        distribution, and ``labels`` is the full distribution's index as a
        list. Each ``y`` is the bin's count as a percentage of the unique
        ``matched_id`` count, or ``0`` when it cannot be computed.
    """
    intersection_data = data[data.matched_id.isin(shared_ids)]
    distribution, groups = distribution_function(data)
    distribution_intersection, _ = distribution_function(
        intersection_data, groups)

    # Loop-invariant denominators: computed once instead of once per bin.
    total_ids = len(data.matched_id.unique())
    intersection_ids = len(intersection_data.matched_id.unique())

    bins = []
    logger.info(data_name)
    logger.info(distribution_intersection)
    logger.info(total_ids)
    for bin_index in range(len(distribution)):
        try:
            of_status = {
                "x": data_name,
                "y": int(distribution.iloc[bin_index]) / total_ids * 100
            }
        except ZeroDivisionError:
            of_status = {"x": data_name, "y": 0}
        try:
            all_status = {
                "x":
                "Jail & Homeless",
                "y":
                int(distribution_intersection.iloc[bin_index]) /
                intersection_ids * 100
            }
        except Exception as e:
            # Best effort: the intersection distribution may have fewer bins
            # than the full one, or an empty denominator; report 0 for it.
            logger.error(
                'Error encountered while calculating intersection distribution: %s',
                e)
            all_status = {"x": "Jail & Homeless", "y": 0}
        bins.append((of_status, all_status))
    return [bins, list(distribution.index)]