def test_multiple_persons_by_npa():
    persons = 100
    timestamp = int(time.time())
    npa = 3240
    for _ in range(0, persons):
        doc_id = str(uuid.uuid4())[0:10]
        print(doc_id)
        report = IndividualReportModel(
            document_id=doc_id,
            diagnostic=random.randint(0, 4),
            locator=npa,
            session_id=str(uuid.uuid4()),
            timestamp=timestamp,
            analysis_done=False
        )
        session.add(report)
        session.commit()

    analysis_next_report(persons)

    _date = datetime.fromtimestamp(timestamp)
    print(_date)
    q = session.query(DailyDiagnosticChangeModel)
    daily_change = q.filter_by(locator=npa,
                               year=_date.year,
                               month=_date.month,
                               day=_date.day).first()
    total = daily_change.diagnostic_0 + \
        daily_change.diagnostic_1 + \
        daily_change.diagnostic_2 + \
        daily_change.diagnostic_3 + \
        daily_change.diagnostic_4
    assert persons == total
def add_person():
    if not request.json:
        raise InvalidUsage("No data supplied")

    param_list = ['id', 'data']
    check_param(request.json, param_list)
    data_list = ['diagnostic', 'locator', 'sessionId', 'timestamp']
    check_param(request.json['data'], data_list)
    timestamp_list = ["_seconds", "_nanoseconds"]
    check_param(request.json['data']['timestamp'], timestamp_list)

    try:
        report = IndividualReportFactory.build(request.json)
        try:
            session.add(report)
            session.commit()
        except Exception:
            session.rollback()
            abort(500, "Could not insert into database")
    except TypeError:
        raise InvalidUsage(
            "Some parameter was wrongly typed (string, int, array).")
    except Exception:
        message = ("Could not create Individual Report. "
                   "Probably malformed JSON: {}".format(request.json))
        abort(400, message)
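# Example request body accepted by add_person() above -- a sketch inferred from the
# check_param() calls; the concrete values are illustrative only:
#
# {
#     "id": "a1b2c3d4e5",
#     "data": {
#         "diagnostic": 2,
#         "locator": 1203,
#         "sessionId": "9f3c7a52-...(uuid)...",
#         "timestamp": {"_seconds": 1584892474, "_nanoseconds": 0}
#     }
# }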
def upload_geo_data():
    """
    csv headers: country_code,postal_code,latitude,longitude,region_id
    :return:
    """
    geo_locations = {}
    with open(str(GEOCODING_RAW_FILE), 'r') as file:
        reader = csv.reader(file)
        for i, row in enumerate(reader):
            if i == 0:
                continue  # skip header
            location = {
                'country_code': row[0],
                'postal_code': row[1],
                # column order follows the documented header:
                # latitude is column 2, longitude is column 3
                'latitude': float(row[2]),
                'longitude': float(row[3]),
                'region_id': row[4],
            }
            # keyed by postal code so later rows overwrite duplicates
            geo_locations[location['postal_code']] = location

    for key, loc in geo_locations.items():
        location = LocationModel(
            postal_code=loc['postal_code'],
            country_code=loc['country_code'],
            region_id=loc['region_id'],
            longitude=loc['longitude'],
            latitude=loc['latitude'],
        )
        session.add(location)
        session.commit()
def test_one_person_update():
    nb_update = 1000
    timestamps = range(1584892474, 1584892474 + nb_update)
    session_id = str(uuid.uuid4())
    npa = 1246
    for timestamp in timestamps:
        doc_id = str(uuid.uuid4())[0:10]
        print(doc_id)
        report = IndividualReportModel(
            document_id=doc_id,
            diagnostic=random.randint(0, 4),
            locator=npa,
            session_id=session_id,
            timestamp=timestamp,
            analysis_done=False
        )
        session.add(report)
        session.commit()

    analysis_next_report(nb_update)

    _date = datetime.fromtimestamp(timestamps[0])
    q = session.query(DailyDiagnosticChangeModel)
    daily_change = q.filter_by(locator=npa,
                               year=_date.year,
                               month=_date.month,
                               day=_date.day).first()
    total = daily_change.diagnostic_0 + \
        daily_change.diagnostic_1 + \
        daily_change.diagnostic_2 + \
        daily_change.diagnostic_3 + \
        daily_change.diagnostic_4
    print(total)
    assert total == 1
def upload_geo_data():
    geo_locations = {}
    with open(str(GEO_DATA_FILE_FIRST), 'r') as file:
        reader = csv.reader(file)
        for i, row in enumerate(reader):
            if i == 0:
                continue  # skip header
            location = {
                'npa': int(row[5]),
                'town': row[4],
                'state': row[3],
                'longitude': float(row[2]),
                'latitude': float(row[1]),
            }
            # keyed by NPA so duplicates are overwritten
            geo_locations[location['npa']] = location

    with open(str(GEO_DATA_FILE_SECOND), 'r') as file:
        reader = csv.reader(file)
        for i, row in enumerate(reader):
            if i == 0:
                continue  # skip header
            location = {
                'npa': int(row[0]),
                'town': row[1],
                'state': row[2],
                'longitude': float(row[3]),
                'latitude': float(row[4]),
            }
            if location['npa'] in geo_locations:
                # second file takes precedence for town and coordinates
                geo_locations[location['npa']]['town'] = location['town']
                geo_locations[
                    location['npa']]['longitude'] = location['longitude']
                geo_locations[
                    location['npa']]['latitude'] = location['latitude']
            else:
                # add it
                geo_locations[location['npa']] = location

    for key, loc in geo_locations.items():
        location = LocationModel(
            npa=loc['npa'],
            town=loc['town'],
            state=loc['state'],
            longitude=loc['longitude'],
            latitude=loc['latitude'],
        )
        session.add(location)
        session.commit()
def test_insert_speed():
    timestamps = range(1584892474, 1584892474 + 1000)
    npa = 1240
    start_time = time.time()
    for timestamp in timestamps:
        doc_id = str(uuid.uuid4())[0:10]
        report = IndividualReportModel(
            document_id=doc_id,
            diagnostic=random.randint(0, 4),
            locator=npa,
            session_id=str(uuid.uuid4()),
            timestamp=timestamp
        )
        session.add(report)
        session.commit()
    print('Insertion time:', time.time() - start_time)
def map_calculate(collection_size: int):
    """Calculate symptom factor S for the whole DB where analysis_done = 0"""
    start_time_analysis = time.time()

    # load the next collection of reports to analyse
    next_reports = pd.read_sql(
        ("SELECT * FROM individual_report WHERE analysis_done = 0 "
         "ORDER BY timestamp LIMIT ") + str(collection_size),
        con=engine,
        index_col="document_id",
    )

    S = next_reports.apply(calculate, axis=1)

    query = """
    UPDATE individual_report AS old, temp_table AS new
    SET old.S = new.0, old.analysis_done = 1
    WHERE old.document_id = new.document_id
    """

    with engine.begin() as con:
        S.to_sql(
            "temp_table",
            con,
            if_exists="replace",
            dtype={
                "document_id": sa.String(30),
                "S": sa.Integer,
            },
        )
        try:
            con.execute(query)
        except Exception:
            print("ERROR: while executing query: {}".format(query))
        try:
            con.execute("DROP TABLE temp_table")
        except InternalError:
            print("WARNING: no temp_table to drop")

    session.commit()
    spend_time = time.time() - start_time_analysis
    print("Analysed {} samples in {} s".format(collection_size, spend_time))
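# `calculate` is imported from elsewhere and is not shown in this file. Below is a
# minimal per-row sketch of the shape it is expected to have (it must return an
# integer symptom factor S); the column names and scoring rule are assumptions,
# not the real implementation:
def calculate(row: pd.Series) -> int:
    # Hypothetical scoring: count how many of these symptom columns are filled in.
    symptom_columns = ["temp", "cough", "breathless", "energy"]  # illustrative
    return int(sum(1 for col in symptom_columns if pd.notna(row.get(col))))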
def insert_fake_inidividual_reports():
    from_day = '2020-03-10'
    to_day = '2020-03-15'
    sample_per_day = 1000
    person_list = gen_fake_person_list(sample_per_day)

    from_date = datetime.strptime(from_day, DAY_FORMAT)
    to_date = datetime.strptime(to_day, DAY_FORMAT)
    _from = int(datetime.strptime(from_day, DAY_FORMAT).timestamp())
    _to = int(datetime.strptime(to_day, DAY_FORMAT).timestamp())

    current_day = from_date
    while current_day <= to_date:
        timestamp = current_day.timestamp()
        start_time = time.time()
        for sample in range(0, sample_per_day):
            person = random_list_element(person_list)
            doc_id = str(uuid.uuid4())[0:20]
            report = IndividualReportModel(
                document_id=doc_id,
                diagnostic=random.randint(0, 4),
                locator=person['npa'],
                session_id=person['session_id'],
                timestamp=(timestamp + sample) * 1000,  # to millisecond
                analysis_done=False)
            # print(report.timestamp)
            session.add(report)
            session.commit()
        spend_time = time.time() - start_time
        print(
            str(current_day) + ': uploaded ' + str(sample_per_day) +
            ' samples in ' + str(spend_time) + 's')
        current_day = current_day + timedelta(days=1)
def report():
    data = request.json
    print(data)

    # if token submitted
    #     if token valid
    #         insert report into database
    # else
    #     generate new token
    #     insert report into database
    #     return token

    if 'token' in data.keys():
        submitted_token = str(data['token'])
        if len(submitted_token) != 7:
            raise InvalidUsage("Invalid token")

        q = session.query(Token).filter_by(token=submitted_token).first()
        if q:
            try:
                report = ReportFactory.build(data)
                try:
                    session.add(report)
                    session.commit()
                except Exception:
                    session.rollback()
                    abort(500, "Could not insert into database")
            except TypeError:
                raise InvalidUsage(
                    "Some parameter was wrongly typed (string, int, array).")

            if report.has_comorbid:
                data['report']['comorbid']['parent_id'] = report.document_id
                try:
                    comorbid = ComorbidFactory.build(
                        data['report']['comorbid'])
                    try:
                        session.add(comorbid)
                        session.commit()
                    except Exception:
                        session.rollback()
                        abort(500, "Could not insert into database")
                except TypeError:
                    raise InvalidUsage(
                        "Some parameter was wrongly typed (string, int, array)."
                    )
            return make_response("", 201)
        else:
            raise InvalidUsage("Provided token doesn't exist")

    elif 'report' in data.keys():
        generated_token = generate_token()
        data['token'] = generated_token
        token = TokenFactory.build(generated_token)
        try:
            session.add(token)
            session.commit()
        except Exception:
            session.rollback()
            abort(500, "Could not insert a new token into database")

        try:
            report = ReportFactory.build(data)
            try:
                session.add(report)
                session.commit()
            except Exception:
                session.rollback()
                abort(500, "Could not insert into database")
        except TypeError:
            raise InvalidUsage(
                "Some parameter was wrongly typed (string, int, array).")

        if report.has_comorbid:
            data['report']['comorbid']['parent_id'] = report.document_id
            try:
                comorbid = ComorbidFactory.build(data['report']['comorbid'])
                try:
                    session.add(comorbid)
                    session.commit()
                except Exception:
                    session.rollback()
                    abort(500, "Could not insert into database")
            except TypeError:
                raise InvalidUsage(
                    "Some parameter was wrongly typed (string, int, array).")
        return make_response(jsonify({"token": generated_token}), 201)
    else:
        raise InvalidUsage("Required parameters are missing")
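# Example request bodies accepted by report() above -- a sketch inferred from the
# branching on 'token' / 'report'; the fields inside 'report' are defined by
# ReportFactory and are only hinted at here:
#
# First submission (no token yet), returns {"token": "<7-char token>"} with 201:
#   {"report": {..., "comorbid": {...}}}
#
# Follow-up submission with a previously issued token, returns 201:
#   {"token": "ab12cd3", "report": {..., "comorbid": {...}}}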
def download_hours_frame(_from_hour: str,
                         _to_hour: str,
                         worker_frame_minute: int = 10):
    if 60 % worker_frame_minute != 0:
        raise Exception("wrong worker period")

    _from = int(datetime.strptime(_from_hour, DATE_HOUR_PARAM_FORMAT).timestamp())
    _to = int(datetime.strptime(_to_hour, DATE_HOUR_PARAM_FORMAT).timestamp())

    now = datetime.now()  # current date and time
    now_str = now.strftime("%Y-%m-%d-%H-%M-%S")
    backup_dir = BACKUP_DOCUMENTS_PATH / ('download-time-' + now_str)
    if not backup_dir.exists():
        backup_dir.mkdir()

    # iterate over hours
    _previous = _from
    _next = _previous + 60 * 60
    while _next <= _to:
        _prev_str = datetime.fromtimestamp(_previous).strftime(DATE_HOUR_PARAM_FORMAT)
        _next_str = datetime.fromtimestamp(_next).strftime(DATE_HOUR_PARAM_FORMAT)
        print("Download from " + _prev_str + " to " + _next_str)

        minutes = list()
        # iterate over minutes
        _previous_minute = _previous
        _to_minute = _next
        _next_minute = _previous_minute + 60 * worker_frame_minute
        while _next_minute <= _to_minute:
            minutes.append((_previous_minute, _next_minute))
            _previous_minute = _next_minute
            _next_minute = _previous_minute + worker_frame_minute * 60

        hour_data = []
        # launch pool
        process_count = int(60 / worker_frame_minute)
        with Pool(processes=process_count) as pool:
            res = pool.map(download_worker, minutes)
            for worker_results in res:
                if len(worker_results) > 0:
                    for data in worker_results:
                        hour_data.append(data)

        # save results to json file
        if len(hour_data) > 0:
            file_path = backup_dir / ('data-' + _prev_str + '_' + _next_str + '.json')
            with open(str(file_path), 'w') as outfile:
                json.dump(hour_data, outfile)

            try:
                # try batch commit
                for report_json in hour_data:
                    report = IndividualReportFactory.build(report_json)
                    session.add(report)
                session.commit()
            except IntegrityError:
                session.rollback()
                # duplication, then one by one
                print('Warn: Duplicate in batch commit')
                for report_json in hour_data:
                    try:
                        report = IndividualReportFactory.build(report_json)
                        session.add(report)
                        session.commit()
                    except IntegrityError:
                        session.rollback()
                        print('Warn: Duplicate doc id: ' + report_json['id'])

        _previous = _next
        _next = _previous + (60 * 60)
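# Usage sketch (assumption: DATE_HOUR_PARAM_FORMAT is an hour-resolution format such
# as "%Y-%m-%d-%H"; the exact format string is defined elsewhere):
#
# download_hours_frame('2020-03-25-08', '2020-03-25-18', worker_frame_minute=10)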
def insert_fake_inidividual_reports():
    from_day = '2020-03-10'
    to_day = '2020-03-15'
    sample_per_day = 1000
    person_list = gen_fake_person_list(sample_per_day)

    from_date = datetime.strptime(from_day, DAY_FORMAT)
    to_date = datetime.strptime(to_day, DAY_FORMAT)
    _from = int(datetime.strptime(from_day, DAY_FORMAT).timestamp())
    _to = int(datetime.strptime(to_day, DAY_FORMAT).timestamp())

    def random_bool():
        return random.randint(0, 1)

    current_day = from_date
    while current_day <= to_date:
        timestamp = current_day.timestamp()
        start_time = time.time()
        for sample in range(0, sample_per_day):
            person = random_list_element(person_list)
            doc_id = str(uuid.uuid4())[0:20]
            has_comorbid = random_bool()
            comorbid = Comorbid(
                hypertension=random_bool(),
                cardiovascular=random_bool(),
                pulmonary=random_bool(),
                cancer=random_bool(),
                diabetes=random_bool(),
                renal=random_bool(),
                neurological=random_bool(),
                respiratory=random_bool(),
            )
            report = IndividualReportModel(
                # Metadata
                document_id=doc_id,
                diagnostic=random.randint(0, 4),
                locator=person['npa'],
                session_id=person['session_id'],
                timestamp=(timestamp + sample) * 1000,  # to millisecond
                analysis_done=False,
                # Actual data
                temp=enum.Scale3(random.randint(1, 3)).name,
                cough=enum.Scale4(random.randint(0, 3)).name,
                breathless=enum.Scale4(random.randint(0, 3)).name,
                energy=enum.Energy(random.randint(0, 4)).name,
                exposure=enum.Exposure(random.randint(0, 2)).name,
                has_comorbid=has_comorbid,
                comorbid=comorbid,
                compromised_immune=random_bool(),
                age=enum.Scale3(random.randint(1, 3)).name,
            )
            # print(report.timestamp)
            session.add(report)
            session.commit()
        spend_time = time.time() - start_time
        print(str(current_day) + ': uploaded ' + str(sample_per_day) +
              ' samples in ' + str(spend_time) + 's')
        current_day = current_day + timedelta(days=1)
def analysis_next_report(collection_size: int):
    start_time_analysis = time.time()

    # load all previous reports
    report_done_df = pd.read_sql(
        "SELECT * FROM individual_report WHERE analysis_done = 1",
        con=engine,
        index_col="document_id")

    # load the next collection of reports to analyse
    next_reports = pd.read_sql(
        "SELECT * FROM individual_report WHERE analysis_done = 0 "
        "ORDER BY timestamp LIMIT " + str(collection_size),
        con=engine,
        index_col="document_id")

    # timestamp from millisecond to second
    report_done_df["timestamp"] = report_done_df["timestamp"] / 1000
    next_reports["timestamp"] = next_reports["timestamp"] / 1000

    # Hold the reports to update in the end
    report_to_update = list()
    # Hold the daily diagnostic deltas to upload in the end
    daily_delta_df = pd.DataFrame(
        columns=['locator', 'year', 'month', 'day', '0', '1', '2', '3', '4', '5'])

    sample = 0
    start_time = time.time()
    # iterate over all the reports to analyse
    for doc_id, next_report in next_reports.iterrows():
        sample += 1
        _date = datetime.fromtimestamp(next_report['timestamp'])  # to seconds
        delta = [0, 0, 0, 0, 0, 0]

        # search for last status
        same_session_id_df = report_done_df.loc[
            report_done_df['session_id'] == next_report['session_id']]
        if same_session_id_df.empty:
            delta[next_report['diagnostic']] = 1
            # print('Delta (new user)', str(delta))
        else:
            last_report = same_session_id_df.sort_values(
                by=['timestamp'], ascending=False).iloc[0]
            if last_report['diagnostic'] != next_report['diagnostic']:
                delta[last_report['diagnostic']] = -1
                delta[next_report['diagnostic']] = 1
                # print('Delta (existing user)', str(delta))

        # check if a daily delta already exists
        previous_daily = daily_delta_df.loc[
            (daily_delta_df['year'] == _date.year) &
            (daily_delta_df['month'] == _date.month) &
            (daily_delta_df['day'] == _date.day) &
            (daily_delta_df['locator'] == next_report['locator'])]
        if previous_daily.empty:
            # then create it
            new_daily = dict()
            new_daily['locator'] = next_report['locator']
            new_daily['year'] = _date.year
            new_daily['month'] = _date.month
            new_daily['day'] = _date.day
            new_daily['0'] = delta[0]
            new_daily['1'] = delta[1]
            new_daily['2'] = delta[2]
            new_daily['3'] = delta[3]
            new_daily['4'] = delta[4]
            new_daily['5'] = delta[5]
            new_daily[str(next_report['diagnostic'])] = 1
            daily_delta_df = daily_delta_df.append(new_daily, ignore_index=True)
        else:
            # then update it
            # convert the one-row dataframe to a series by taking its first row
            previous_daily = previous_daily.iloc[0]
            # print('new diagnostic:', next_report['diagnostic'])
            daily_delta_df.at[previous_daily.name, "0"] = previous_daily["0"] + delta[0]
            daily_delta_df.at[previous_daily.name, "1"] = previous_daily["1"] + delta[1]
            daily_delta_df.at[previous_daily.name, "2"] = previous_daily["2"] + delta[2]
            daily_delta_df.at[previous_daily.name, "3"] = previous_daily["3"] + delta[3]
            daily_delta_df.at[previous_daily.name, "4"] = previous_daily["4"] + delta[4]
            daily_delta_df.at[previous_daily.name, "5"] = previous_daily["5"] + delta[5]

        if sample % 100 == 0:
            spend_time = time.time() - start_time
            print('Analysed ' + str(sample) + ' samples in ' + str(spend_time) + 's')
            start_time = time.time()

        # update report status
        next_report['analysis_done'] = 1
        # keep trace of the report to update in DB
        # report_to_update_df = report_to_update_df.append(next_report)
        report_to_update.append(doc_id)
        # update the done report list
        report_done_df = report_done_df.append(next_report)

    # print('====== Report to update ========')
    # print(report_to_update)
    print('====== Daily change ========')
    print(daily_delta_df)

    # upload daily changes
    for doc_id, daily_delta in daily_delta_df.iterrows():
        locator = daily_delta['locator']
        year = daily_delta['year']
        month = daily_delta['month']
        day = daily_delta['day']
        q = session.query(DailyDiagnosticChangeModel)
        daily_change = q.filter_by(locator=locator,
                                   year=year,
                                   month=month,
                                   day=day).first()
        if daily_change is None:
            # then create it
            daily_change = DailyDiagnosticChangeModel(
                locator=locator,
                year=year,
                month=month,
                day=day,
                diagnostic_0=daily_delta['0'],
                diagnostic_1=daily_delta['1'],
                diagnostic_2=daily_delta['2'],
                diagnostic_3=daily_delta['3'],
                diagnostic_4=daily_delta['4'],
                diagnostic_5=daily_delta['5'],
            )
            session.add(daily_change)
        else:
            # update diagnostic counters
            daily_change.diagnostic_0 += daily_delta['0']
            daily_change.diagnostic_1 += daily_delta['1']
            daily_change.diagnostic_2 += daily_delta['2']
            daily_change.diagnostic_3 += daily_delta['3']
            daily_change.diagnostic_4 += daily_delta['4']
            daily_change.diagnostic_5 += daily_delta['5']
        session.commit()

    # build query to update individual reports
    ids = '('
    idx = 0
    for doc_id in report_to_update:
        if idx != 0:
            ids += ','
        ids += '"' + doc_id + '"'
        idx += 1
    ids += ')'
    query = ('UPDATE individual_report SET analysis_done=1 '
             'WHERE individual_report.document_id IN ' + ids + '; ')
    with engine.connect() as con:
        con.execute(query)

    print('Analysis of ' + str(collection_size) + ' in ' +
          str(time.time() - start_time_analysis) + 's')
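# Minimal driver sketch for running the analysis in batches (an assumption, not part
# of the original code): keep calling analysis_next_report() until no report is left
# with analysis_done = 0.
def analyse_all_reports(batch_size: int = 1000):
    while True:
        remaining = session.query(IndividualReportModel).filter_by(
            analysis_done=False).count()
        if remaining == 0:
            break
        analysis_next_report(min(batch_size, remaining))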
def group_reports_by_location():
    start_time_analysis = time.time()

    # load all analysed reports
    df = pd.read_sql(
        "SELECT * FROM individual_report WHERE analysis_done = 1",
        con=engine,
        index_col="document_id",
    )

    # diagnosis using S factors
    # healthy = df.query('S == 0')
    # sick_guess_no_corona = df.query('0 < S < 4')
    # sick_guess_corona = df.query('S >= 4')

    # diagnosis from report
    # healthy = df.query('diagnostic == 0')
    # sick_guess_no_corona = df.query('diagnostic == 1')
    # sick_guess_corona = df.query('diagnostic == 2')
    # sick_corona_confirmed = df.query('diagnostic == 3')
    # recovered_confirmed = df.query('diagnostic == 4')
    # recovered_not_confirmed = df.query('diagnostic == 5')

    def group(df_diagnosis):
        """Get number of reports by location"""
        return df_diagnosis.groupby("locator").sum()

    def to_sql(totals, column):
        query = """
        UPDATE locations AS old, temp_totals AS new
        SET old.{column} = new.analysis_done
        WHERE old.postal_code = new.locator
        """.format(column=column)
        with engine.begin() as con:
            totals.to_sql(
                "temp_totals",
                con,
                if_exists="replace",
                dtype={
                    "locator": sa.Integer,
                },
            )
            try:
                con.execute(query)
            except Exception:
                print("ERROR: while executing query: {}".format(query))
            try:
                con.execute("DROP TABLE temp_totals")
            except InternalError:
                print("WARNING: no temp_totals to drop")

    # total_healthy = group(healthy)
    # print(total_healthy)
    # to_sql(total_healthy, "total_healthy")

    for diagnostic, column in enumerate(COLUMNS):
        df_diagnosis = df.query("diagnostic == {}".format(diagnostic))
        df_totals = group(df_diagnosis)
        to_sql(df_totals, "total_{}".format(column))

    session.commit()
    spend_time = time.time() - start_time_analysis
    print("Grouped {} samples by location in {} s".format(len(df), spend_time))
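# COLUMNS is imported from elsewhere; a sketch of the value it is expected to hold,
# derived from the diagnostic mapping in the comments above (the index must match the
# diagnostic code 0..5, and each entry must correspond to a total_<name> column in
# the locations table):
#
# COLUMNS = (
#     "healthy",
#     "sick_guess_no_corona",
#     "sick_guess_corona",
#     "sick_corona_confirmed",
#     "recovered_confirmed",
#     "recovered_not_confirmed",
# )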
""" Created by nuffer at 3/25/20 """ import json from analysis.utils.factory import IndividualReportFactory from analysis.utils.db import session from analysis import BACKUP_DOCUMENTS_PATH if __name__ == '__main__': for dir in BACKUP_DOCUMENTS_PATH.glob('*'): for file in dir.glob('*.json'): print('Loading file: ' + str(file)) with open(str(file)) as json_file: data = json.load(json_file) for report in data: report = IndividualReportFactory.build(report) session.add(report) session.commit()