def upload_file():
    """Accept a single uploaded file and queue it for asynchronous validation.

    Reads ``jurisdiction`` and ``eventType`` from ``request.args``. When
    ``can_upload_file`` rejects that pair, a ``not authorized`` JSON payload
    is returned. When the request does not carry exactly one file, an
    ``error`` JSON payload is returned. Otherwise the file is saved under
    ``<cwd>/tmp`` using its sanitised name and a ``validate_async`` job is
    enqueued; the JSON response carries the job id as ``jobKey``.
    """
    jurisdiction = request.args.get('jurisdiction')
    event_type = request.args.get('eventType')

    # Guard: the caller must be allowed to upload for this pair.
    if not can_upload_file(jurisdiction, event_type):
        return jsonify(status='not authorized', exampleRows=[])

    # Guard: exactly one file field may be present in the request.
    field_names = list(request.files.keys())
    if len(field_names) != 1:
        return jsonify(
            status='error',
            message='Exactly one file must be uploaded at a time')

    uploaded_file = request.files[field_names[0]]
    safe_name = secure_filename(uploaded_file.filename)

    # Persist the upload under ./tmp so the queued job can read it from disk.
    tmp_dir = os.path.join(os.getcwd(), 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)
    saved_path = os.path.join(tmp_dir, safe_name)
    uploaded_file.save(saved_path)

    upload_id = unique_upload_id()
    queue = get_q(get_redis_connection())
    # The job receives the client-supplied filename; the sanitised one is
    # what was written to disk and what is recorded in the job meta.
    job = queue.enqueue_call(
        func=validate_async,
        args=(
            uploaded_file.filename,
            jurisdiction,
            saved_path,
            event_type,
            current_user.id,
            upload_id,
        ),
        result_ttl=5000,
        timeout=3600,
        meta={
            'event_type': event_type,
            'filename': safe_name,
            'upload_id': upload_id
        })
    job_key = job.get_id()
    logger.info(f"Job id {job_key}")
    return jsonify(status='validating',
                   jobKey=job_key,
                   message='Validating data!')
def can_access_file(upload_id):
    """Return whether the current caller may access the given upload.

    Looks up the ``Upload`` row for ``upload_id`` and delegates the decision
    to ``can_upload_file`` using the row's jurisdiction and event type slugs.

    Raises:
        ValueError: if no ``Upload`` row exists for ``upload_id``.
    """
    upload = db_session.query(Upload).get(upload_id)
    if not upload:
        # Exceptions do not %-interpolate their arguments the way logging
        # calls do, so the message has to be formatted before raising.
        raise ValueError(
            f'upload_id: {upload_id} not present in metadata database')
    logger.info('Found jurisdiction %s and event type %s for upload id %s',
                upload.jurisdiction_slug, upload.event_type_slug, upload_id)
    return can_upload_file(upload.jurisdiction_slug, upload.event_type_slug)
def get_last_upload_date():
    """Return the most recent upload date as JSON.

    Calls ``query.last_upload_date()`` and expects exactly one row with an
    ``upload_start_time`` value. On success the response is
    ``jsonify(results='YYYY-MM-DD')``. If the row count is not one, or the
    row cannot be read or formatted, the response is
    ``jsonify("no valid upload date")``.
    """
    last_upload = query.last_upload_date()
    try:
        # Explicit check rather than ``assert``: assertions are stripped when
        # Python runs with -O, which would silently skip this validation.
        if len(last_upload) != 1:
            return jsonify("no valid upload date")
        last_upload_date = last_upload[0]['upload_start_time']
        logger.info(type(last_upload_date))
        formatted_date = last_upload_date.strftime('%Y-%m-%d')
    except Exception:
        # Best-effort endpoint: keep the fallback response, but record why it
        # was used. Unlike a bare ``except:``, this does not swallow
        # SystemExit or KeyboardInterrupt.
        logger.exception('Could not determine last upload date')
        return jsonify("no valid upload date")
    return jsonify(results=formatted_date)
def match_finished(matched_results_paths, match_job_id, match_start_at,
                   match_complete_at, match_status, match_runtime,
                   upload_id=None):
    """Record a finished match job and load its result files into the db.

    First writes one match-log entry via ``write_match_log``. Then, for each
    ``event_type -> filename`` pair in ``matched_results_paths``, opens the
    file with ``open_sesame`` and hands it to ``write_matches_to_db``. The
    jurisdiction is taken from the third-from-last ``/``-separated segment
    of the filename.

    Any ``Exception`` raised along the way is logged and swallowed; nothing
    is re-raised and nothing is returned.
    """
    try:
        logger.info('Writing to match log')
        log_fields = {
            'db_session': db_session,
            'match_job_id': match_job_id,
            'match_start_at': match_start_at,
            'match_complete_at': match_complete_at,
            'match_status': match_status,
            'match_runtime': match_runtime,
            'upload_id': upload_id,
        }
        write_match_log(**log_fields)

        logger.info('Writing matches to db')
        for event_type, results_path in matched_results_paths.items():
            path_segments = results_path.split('/')
            jurisdiction = path_segments[-3]
            logger.info(
                'Writing matches from event type %s and filename %s to db. Parsed jurisdiction %s out of filename',
                event_type, results_path, jurisdiction)
            with open_sesame(results_path, 'rb') as results_handle:
                write_matches_to_db(db_engine=engine,
                                    event_type=event_type,
                                    jurisdiction=jurisdiction,
                                    matches_filehandle=results_handle)
    except Exception as err:
        logger.error('Error encountered during match_finished: %s', str(err))
    finally:
        logger.info('All done!')
def notify_matcher(jurisdiction, upload_id=None):
    """Enqueue a ``matcher.do_match`` job on the ``matching`` queue.

    The job is given the jurisdiction's data directory (the configured
    ``base_data_path`` template formatted with ``jurisdiction``), the schema
    primary-key lookup built from ``SCHEMA_DIRECTORY``, and ``upload_id``.
    ``upload_id`` is also stored in the job's ``meta``.
    """
    schema_pk_lookup = list_all_schemas_primary_keys(SCHEMA_DIRECTORY)
    path_template = app_config['base_data_path']
    directory_to_pass = path_template.format(jurisdiction=jurisdiction)

    matching_queue = Queue('matching',
                           connection=Redis(host='redis', port=6379))

    logger.info('Enqueueing do_match job')
    # NOTE(review): if Queue is rq.Queue, newer RQ releases spell the job
    # timeout keyword ``job_timeout`` -- confirm against the pinned version.
    job_args = (directory_to_pass, schema_pk_lookup, upload_id)
    job = matching_queue.enqueue(f="matcher.do_match",
                                 args=job_args,
                                 result_ttl=5000,
                                 timeout=100000,
                                 meta={'upload_id': upload_id})
    logger.info("Enqueued job %s", job)
def get_records_by_time():
    """Return records for the requested time window as JSON.

    Reads ``startDate``, ``endDate``, ``jurisdiction``, ``limit`` (default
    10), ``offset`` (default 0), ``orderColumn``, ``order`` and ``setStatus``
    from ``request.args`` and forwards them, in that order, to
    ``query.get_records_by_time``. The result is wrapped as
    ``jsonify(results=...)``.
    """
    params = request.args

    start_date = params.get('startDate')
    end_date = params.get('endDate')
    jurisdiction = params.get('jurisdiction')
    # Values are forwarded exactly as received; only the defaults are ints.
    limit = params.get('limit', 10)
    offset = params.get('offset', 0)
    order_column = params.get('orderColumn')
    order = params.get('order')
    set_status = params.get('setStatus')

    logger.info(f'Pulling data from {start_date} to {end_date}')

    query_args = (
        start_date,
        end_date,
        jurisdiction,
        limit,
        offset,
        order_column,
        order,
        set_status,
    )
    records = query.get_records_by_time(*query_args)
    return jsonify(results=records)
def get_contact_dist(data, bins=None):
    """Build a histogram of how many rows each ``matched_id`` has.

    Args:
        data: DataFrame with a ``matched_id`` column; rows are counted per
            distinct ``matched_id``.
        bins: optional bin specification passed straight to
            ``np.histogram`` for the ids with more than one row. When
            omitted, bins are chosen automatically, rounded to whole
            numbers, and reduced to at most four bins.

    Returns:
        A ``(df_hist, groups)`` tuple. ``df_hist`` is a DataFrame with a
        single ``contacts`` column whose index labels describe each bucket.
        ``groups`` holds the bin edges from ``np.histogram``, or the scalar
        ``1`` in the two degenerate cases (every id has exactly one row, or
        all remaining ids share the same count).
    """
    # Series.as_matrix() was removed in pandas 1.0; ``.values`` returns the
    # same ndarray on both old and new pandas versions.
    counts = data.groupby('matched_id').matched_id.count().values
    counts = counts.astype(int)

    # Ids with exactly one row get their own bucket; the rest are binned.
    one_contact = list(counts).count(1)
    rest = np.delete(counts, np.argwhere(counts == 1))

    if one_contact == len(counts):
        df_hist = pd.DataFrame({'contacts': [one_contact]},
                               index=['1 contact'])
        logger.info("all ones!")
        return df_hist, 1

    distinct_rest = np.unique(rest)
    if len(distinct_rest) == 1:
        df_hist = pd.DataFrame(
            {'contacts': [one_contact, len(rest)]},
            index=['1 contact', f"{distinct_rest[0]} contacts"])
        return df_hist, 1

    if bins is not None:
        # Caller-supplied bins are used untouched.
        num, groups = np.histogram(rest, bins)
    else:
        # Start from automatic edges, then snap them to distinct integers.
        num, groups = np.histogram(rest, 'auto')
        num, groups = np.histogram(rest, np.unique(groups.round()))
        if len(groups) > 4:
            # Too many buckets: fall back to four bins, again snapped to
            # distinct integer edges.
            bins = 4
            num, groups = np.histogram(rest, bins)
            num, groups = np.histogram(rest, np.unique(groups.round()))

    hist = [one_contact] + list(num)
    # Left-closed integer intervals; the upper edge is pushed out by one.
    index = [pd.Interval(1, 2, closed='left')] + [
        pd.Interval(int(low), int(high) + 1, closed='left')
        for low, high in window(list(groups), 2)
    ]
    df_hist = pd.DataFrame({'contacts': hist},
                           index=contacts_interval_to_text(index))
    logger.info(num)
    logger.info(groups)
    logger.info(index)
    logger.info(df_hist)
    return df_hist, groups
def merge_file():
    """Merge a previously uploaded raw table into master and notify the matcher.

    Reads ``uploadId`` from ``request.args``. Responses:

    * ``status='invalid'`` when ``uploadId`` is missing or empty.
    * ``status='not authorized'`` when ``can_access_file`` returns falsy.
    * ``status='success'`` with ``new_unique_rows`` / ``total_unique_rows``
      from the merge log after the session is committed.
    * HTTP 500 with ``status='error'`` (after a session rollback) when
      notifying the matcher raises, or when a ``ValueError`` is raised
      anywhere in the merge (including by ``can_access_file``).
    """
    upload_id = request.args.get('uploadId', None)
    if not upload_id:
        return jsonify(status='invalid', reason='uploadId not present')

    try:
        if not can_access_file(upload_id):
            return jsonify(status='not authorized')

        upload_log = db_session.query(Upload).get(upload_id)
        logger.info('Retrieved upload log, merging raw table to master')
        raw_table_name = 'raw_{}'.format(upload_id)
        logger.info('Merging raw table to master')
        merge_id = upsert_raw_table_to_master(raw_table_name,
                                              upload_log.jurisdiction_slug,
                                              upload_log.event_type_slug,
                                              upload_id, db_session)
        logger.info('Syncing merged file to s3')
        bootstrap_master_tables(upload_log.jurisdiction_slug, db_session)
        sync_merged_file_to_storage(upload_log.jurisdiction_slug,
                                    upload_log.event_type_slug,
                                    db_session.get_bind())
        merge_log = db_session.query(MergeLog).get(merge_id)

        try:
            logger.info('Merge succeeded. Now querying matcher')
            notify_matcher(upload_log.jurisdiction_slug, upload_id)
        except Exception as e:
            # The %s placeholder is required: without it logging cannot
            # merge the argument into the message and the error is lost.
            logger.error('Error matching: %s', e)
            db_session.rollback()
            return make_response(jsonify(status='error'), 500)

        # Commit only once the matcher has been notified successfully.
        db_session.commit()
        return jsonify(status='success',
                       new_unique_rows=merge_log.new_unique_rows,
                       total_unique_rows=merge_log.total_unique_rows)
    except ValueError as e:
        logger.error('Error merging: %s', e)
        db_session.rollback()
        return make_response(jsonify(status='error'), 500)
def get_histogram_bar_chart_data(data, distribution_function, shared_ids,
                                 data_name):
    """Build paired percentage bars for a dataset and its shared-id subset.

    ``distribution_function`` is called once on ``data`` and once on the
    rows of ``data`` whose ``matched_id`` is in ``shared_ids`` (the second
    call reuses the groups returned by the first). For every row position
    of the first distribution, two ``{"x", "y"}`` dicts are produced: one
    labelled ``data_name`` and one labelled ``"Jail & Homeless"``. Each
    ``y`` is that bucket's value as a percentage of the distinct
    ``matched_id`` count in the respective frame, or 0 when it cannot be
    computed.

    Returns:
        ``[bins, labels]`` where ``bins`` is a list of
        ``(of_status, all_status)`` tuples and ``labels`` is the index of
        the first distribution as a list.
    """
    shared_rows = data[data.matched_id.isin(shared_ids)]
    distribution, groups = distribution_function(data)
    shared_distribution, _ = distribution_function(shared_rows, groups)

    logger.info(data_name)
    logger.info(shared_distribution)
    logger.info(len(data.matched_id.unique()))

    bins = []
    for position in range(len(distribution)):
        # Share of all distinct ids in this bucket.
        try:
            overall_pct = (int(distribution.iloc[position]) /
                           len(data.matched_id.unique()) * 100)
            of_status = {"x": data_name, "y": overall_pct}
        except ZeroDivisionError:
            of_status = {"x": data_name, "y": 0}

        # Share of the shared ids in this bucket; any failure counts as 0.
        try:
            shared_pct = (int(shared_distribution.iloc[position]) /
                          len(shared_rows.matched_id.unique()) * 100)
            all_status = {"x": "Jail & Homeless", "y": shared_pct}
        except Exception as err:
            logger.error(
                'Error encountered while calculating intersection distribution: %s',
                err)
            all_status = {"x": "Jail & Homeless", "y": 0}

        bins.append((of_status, all_status))

    labels = list(distribution.index)
    return [bins, labels]