def run(self):
    """Build scan results end-to-end: preprocess readings, forecast,
    scan, and convert to database-ready results.

    Pipeline (each stage reads configuration from attributes set elsewhere
    on this object):
      1. data_preprocessor - cleans self.readings using the threshold
         settings and drop_* switches.
      2. count_baseline    - builds a forecast from the processed counts.
      3. scan              - spatial scan over the forecast grid.
      4. database_results  - flattens scan output to the storage format.
    """
    # Stage 1: clean the raw readings. Arguments are positional - keep
    # this order in sync with the data_preprocessor signature.
    self.processed = data_preprocessor(
        self.readings,
        self.percentage_missing,
        self.max_anom_per_day,
        self.N_sigma,
        self.repeats,
        self.rolling_hours,
        self.fap_threshold,
        self.consecutive_missing_threshold,
        self.global_threshold,
        self.drop_sparse,
        self.drop_anomalous,
        self.drop_aperiodic,
        self.drop_consecutives,
    )
    # Stage 2: forecast counts over the configured past/future windows.
    self.forecast = count_baseline(
        self.processed,
        self.days_in_past,
        self.days_in_future,
        self.ts_method,
        alpha=self.alpha,
        beta=self.beta,
        gamma=self.gamma,
        kern=self.kernel,
    )
    # Stage 3: spatial scan; Stage 4: convert to database rows.
    self.all_results = scan(self.forecast, self.grid_resolution)
    self.grid_results = database_results(self.all_results)
def results_builder(
    outbreak_df: pd.DataFrame,
    outbreak_detectors: pd.DataFrame,
    outbreak_start,
    hist_sim_data,
    days_in_past: int,
    days_in_future: int,
    method: str,
    grid_partition: int,
    scan_type: str,
    show_plots=True,
):
    """Builds daily results from the scan up over a number of days,
    determined by the size of outbreak_df.

    e.g. if outbreak_df has N days worth of data, the first
    `days_in_past` number of days will be dedicated to forecasting. This
    function will then return scan results for the remaining
    N - `days_in_past` days, using the settings described below.
    Basically - simulating what will be stored in the database over a
    period of time.

    Args:
        outbreak_df: simulated from `simulate_outbreak()`
        outbreak_detectors: dataframe of affected detectors
        outbreak_start: outbreak_start time
        hist_sim_data: Dataframe of F(S) scores from historic data
        days_in_past: as in count_baseline
        days_in_future: as in count_baseline
        method: as in count_baseline
        grid_partition: as in scan()
        scan_type: as in scan()
        show_plots: if True, plot each day's forecast and its
            highest-scoring region

    Returns:
        Dataframe of results spanning (len(outbreak_df) - days_in_past)
        days worth of analysis
        Dataframe of highest scoring regions per day
    """
    t_min = outbreak_df["measurement_start_utc"].min()
    t_max = outbreak_df["measurement_end_utc"].max()

    # Get outbreak characteristics: detector count and spatial bounding
    # box of the true (simulated) outbreak, used for precision/recall.
    num_outbreak_detectors = len(set(outbreak_detectors["detector_id"]))
    ob_x_min = outbreak_detectors["lon"].min()
    ob_x_max = outbreak_detectors["lon"].max()
    ob_y_min = outbreak_detectors["lat"].min()
    ob_y_max = outbreak_detectors["lat"].max()

    total_num_days = (t_max - t_min).days
    print("Total number of days in dataframe: ", total_num_days)

    # The first day we can analyse needs a full training window
    # (days_in_past) plus a full forecast window (days_in_future) of
    # data behind it.
    first_analysis_day = (t_min + np.timedelta64(days_in_past, "D")
                          + np.timedelta64(days_in_future, "D"))
    num_forecast_days = (t_max - first_analysis_day).days + 1
    print("Producing forecasts and scans for {} days in total.\n".format(
        num_forecast_days))
    print("Outbreak begins at {}.".format(outbreak_start))

    # False-Positive rates to check, and the EBP score each rate implies
    # as a detection threshold given the historic simulation scores.
    fps = [0.01, 0.05, 0.10, 0.25, 0.50]
    threshs = [
        np.percentile(hist_sim_data["l_score_EBP"], 100 * (1 - x)) for x in fps
    ]

    dataframe_list = []
    daily_highest_scoring_regions = {}
    today = first_analysis_day
    for i in range(num_forecast_days):
        print("\nAnalysis day: {}. Looking back at last {} hours.".format(
            today, 24 * days_in_future))

        # Only data observed up to `today` is visible to the detector.
        available_today = outbreak_df[
            outbreak_df["measurement_end_utc"] <= today].copy()

        forecast_df = count_baseline(
            available_today,
            days_in_past=days_in_past,
            days_in_future=days_in_future,
            method=method,
        )
        forecast_df = cleanse_forecast_data(forecast_df)
        if show_plots:
            CB_plot(forecast_df)

        res_df = scan(forecast_df, grid_partition=grid_partition,
                      scan_type=scan_type)
        if show_plots:
            plot_region_by_rank(0, res_df, forecast_df, plot_type="count",
                                add_legend=False)

        # Highest scoring region from today's scan (res_df is assumed
        # sorted best-first - confirm against scan()).
        highest_region = res_df.iloc[0][[
            "x_min",
            "x_max",
            "y_min",
            "y_max",
            "t_min",
            "t_max",
            "l_score_EBP",
            "l_score_000",
            "l_score_025",
            "l_score_050",
            "posterior_bbayes",
        ]].to_dict()

        # Spatial analysis: how many detectors fall inside the flagged
        # region, and how many of those are also in the true outbreak box?
        x_min = highest_region["x_min"]
        x_max = highest_region["x_max"]
        y_min = highest_region["y_min"]
        y_max = highest_region["y_max"]
        num_detectors_in_highest_region = len(
            set(outbreak_df[
                (outbreak_df["lon"].between(x_min, x_max))
                & (outbreak_df["lat"].between(y_min, y_max))].detector_id))

        # Intersection of the flagged region with the true outbreak box.
        overlap_x_min = max([x_min, ob_x_min])
        overlap_x_max = min([x_max, ob_x_max])
        overlap_y_min = max([y_min, ob_y_min])
        overlap_y_max = min([y_max, ob_y_max])
        num_detectors_in_highest_region_and_true = len(
            set(outbreak_df[
                (outbreak_df["lon"].between(overlap_x_min, overlap_x_max))
                & (outbreak_df["lat"].between(overlap_y_min, overlap_y_max))].
                detector_id))

        # Calculate Spatial Precision and Recall.
        # NOTE(review): raises ZeroDivisionError if the flagged region
        # contains no detectors - assumed impossible for a scored region;
        # confirm against scan().
        precision = (num_detectors_in_highest_region_and_true /
                     num_detectors_in_highest_region)
        recall = num_detectors_in_highest_region_and_true / num_outbreak_detectors
        highest_region["precision"] = precision
        highest_region["recall"] = recall
        highest_region["day"] = today

        # =================================
        # How significant is today's score?
        # =================================
        highest_region["days_since_outbreak"] = (today - outbreak_start).days
        detections = [
            is_outbreak_detected(hist_sim_data, highest_region["l_score_EBP"],
                                 fp_rate=x) for x in fps
        ]
        # One threshold/detected column pair per false-positive rate
        # (replaces ten hand-written assignments; keys are identical,
        # e.g. "F_thresh_fp=0.01" / "detected_fp=0.01").
        for fp_rate, thresh, detected in zip(fps, threshs, detections):
            highest_region["F_thresh_fp={:.2f}".format(fp_rate)] = thresh
            highest_region["detected_fp={:.2f}".format(fp_rate)] = detected

        # Append to list of dataframes
        daily_highest_scoring_regions[i] = highest_region

        # =========================
        # Simulate Database Storage
        # =========================
        database_df = database_results(res_df)

        # Updates data correctly with most reliable average likelihood
        # scores. i.e. today is wednesday, and days_in_future = 2.
        # We are getting scores for monday and tuesday, and append it in
        # a list. Now, the next day, we get scores for tuesday and
        # wednesday. We throw away the old tuesday, and keep the new one.
        if len(dataframe_list) - (days_in_future - 1) >= 0:
            dataframe_list = dataframe_list[:len(dataframe_list) -
                                            (days_in_future - 1)]

        # NOTE(review): keyed by day-of-month - only safe while
        # days_in_future stays well below ~28, since day numbers repeat
        # across month boundaries.
        days_dict = dict(
            iter(database_df.groupby(database_df["start_time_utc"].dt.day)))

        for j in range(days_in_future):
            forecast_day = (today - np.timedelta64(days_in_future - j, "D")).day
            dataframe_list.append(days_dict[forecast_day])

        today += np.timedelta64(1, "D")

    # Return list of highest scoring regions too - add to plot?
    return (
        pd.concat(dataframe_list, ignore_index=True),
        pd.DataFrame.from_dict(daily_highest_scoring_regions, "index"),
    )
def rerun_scan(self):
    """Re-run only the scan stage, reusing the cached forecast.

    Skips preprocessing and forecasting entirely; useful when only the
    scan configuration (e.g. self.grid_resolution) has changed.
    """
    # Assumes everything remains the same up to scanning:
    # self.forecast must already be populated by a prior run().
    print('Using cached processed and forecast data to rebuild scan')
    self.all_results = scan(self.forecast, self.grid_resolution)
    self.grid_results = database_results(self.all_results)
def run(self):
    """Build scan results.

    Runs the spatial scan over the precomputed self.forecast and
    converts the output to database-ready rows.
    """
    # Unlike the preprocessing pipeline's run(), this variant also
    # passes self.grid_dict to scan() - presumably a precomputed
    # spatial grid; confirm against the scan() signature.
    self.all_results = scan(self.forecast, self.grid_resolution, self.grid_dict)
    self.grid_results = database_results(self.all_results)