def __init__(self, incidents, deployments, vehicle_types, location_ids,
             start_time=None, end_time=None, predictor="basic",
             fc_dir="/data", verbose=True):
    """ Initialize all properties by extracting probabilities from the data. """
    self.incidents = incidents[np.in1d(
        incidents["hub_vak_bk"].fillna(0).astype(int).astype(str),
        location_ids)]
    self.deployments = deployments
    self.vehicle_types = vehicle_types

    if not isinstance(location_ids[0], str):
        raise ValueError("'location_ids' must be an iterable of strings")
    self.location_ids = location_ids
    self.verbose = verbose

    self.types = self._infer_incident_types()

    self._assign_predictor(predictor, fc_dir)
    self._set_sampling_dict(start_time, end_time, incident_types=self.types)
    self._create_incident_types()
    self._create_demand_locations()
    self.reset_time()

    progress("IncidentSampler ready for simulation.", verbose=self.verbose)
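# Hedged usage sketch: constructing the sampler from raw data. The csv paths,
# their contents, and the example location ids are assumptions for
# illustration; only the constructor signature comes from the code above.
import pandas as pd

incidents = pd.read_csv("data/incidents.csv", low_memory=False)
deployments = pd.read_csv("data/deployments.csv", low_memory=False)
sampler = IncidentSampler(incidents, deployments,
                          vehicle_types=["TS", "RV", "HV", "WO"],
                          location_ids=["13780001", "13780002"],
                          predictor="basic", fc_dir="data")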
def _assign_predictor(self, predictor, fc_dir):
    """ Initialize incident rate predictor and assign to property.

    Parameters
    ----------
    predictor: str, one of ['prophet', 'basic']
        The predictor to use to forecast the incident rates.
    """
    if predictor == "prophet":
        progress("Initializing ProphetIncidentPredictor...",
                 verbose=self.verbose)
        predictor_cls = ProphetIncidentPredictor
    elif predictor == "basic":
        progress("Initializing BasicLambdaForecaster...",
                 verbose=self.verbose)
        predictor_cls = BasicLambdaForecaster
    else:
        raise ValueError("'predictor' must be one of {}."
                         .format(["prophet", "basic"]))

    self.predictor = predictor_cls(load_forecast=True, fc_dir=fc_dir,
                                   verbose=self.verbose)
def _prep_data_for_fitting(self, incidents, deployments, stations,
                           vehicles, osrm_host, save):
    """Perform basic preprocessing and calculate OSRM estimates for
    travel time. Prepared data is stored under self.data. Nothing is
    returned.

    Parameters
    ----------
    incidents: pd.DataFrame
        The incident data.
    deployments: pd.DataFrame
        The deployment data.
    stations: pd.DataFrame
        The station information including coordinates and station names.
    vehicles: array-like of strings
        The types of vehicles to use. Defaults to ["TS", "RV", "HV", "WO"].
    osrm_host: str
        The URL to the OSRM API.
    save: boolean
        Whether to save the data to a csv file after preparing it.
    """
    progress("Preprocessing and merging datasets.", verbose=self.verbose)
    data = prepare_data_for_response_time_analysis(incidents, deployments,
                                                   stations, vehicles)

    progress("Adding OSRM distance and duration.", verbose=self.verbose)
    self.data = add_osrm_distance_and_duration(data, osrm_host=osrm_host)

    if save:
        progress("Saving file.", verbose=self.verbose)
        self.data.to_csv(os.path.join(self.data_dir, self.file_name),
                         index=False)

    progress("Data prepared for fitting.", verbose=self.verbose)
def fit(self, data, types=None):
    """ Perform time series decomposition using Prophet.

    This function first prepares the data and saves the prepared data
    as 'self.incidents'. It then creates a dictionary of Prophet()
    objects, where the keys equal the incident types and the
    corresponding model is fitted to the data of that type. The
    dictionary of models is stored as 'self.models_dict' and used when
    predict is called.

    Notes
    -----
    This function does not return anything.

    Parameters
    ----------
    data: pd.DataFrame
        The incidents to train the models on.
    types: Sequence(str)
        The incident types to fit models for. If None, uses all
        incident types in the data, except 'nan' and 'NVT'. Defaults
        to None.
    """
    if types is not None:
        self.types = types
    else:
        progress("No incident types given, using all types in data.",
                 verbose=self.verbose)
        self.types = [t for t in data["dim_incident_incident_type"].unique()
                      if t not in ["nan", "NVT", np.nan]]

    progress("Preparing incident data for analysis...",
             verbose=self.verbose)
    self.incidents = self._prep_data_for_prediction(data)
    self.incidents["hourly_datetime"] = self._create_date_hour_column(
        self.incidents, datetime_col="dim_incident_start_datumtijd")

    start = self.incidents["hourly_datetime"].min()
    end = self.incidents["hourly_datetime"].max()
    self.time_index = self._create_complete_hourly_index(start,
                                                         end_datetime=end)

    self.models_dict = dict()
    for type_ in self.types:
        progress("Fitting model for type {}...".format(type_),
                 verbose=self.verbose)
        m = Prophet()
        dfprophet = self._create_prophet_data(self.incidents,
                                              self.time_index, type_=type_)
        m.fit(dfprophet)
        self.models_dict[type_] = m

    self.fitted = True
    progress("Models fitted.", verbose=self.verbose)
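# Hedged sketch of the per-type frame Prophet expects: a 'ds' timestamp column
# and a 'y' value column (hourly incident counts here). '_create_prophet_data'
# presumably produces something equivalent; this stand-alone helper is an
# illustrative assumption, not the module's implementation.
import pandas as pd

def make_prophet_frame(incidents, time_index, type_,
                       type_col="dim_incident_incident_type",
                       time_col="hourly_datetime"):
    """Count incidents of one type per hour, reindexed on the full range."""
    counts = (incidents.loc[incidents[type_col] == type_]
                       .groupby(time_col).size()
                       .reindex(time_index, fill_value=0))
    return pd.DataFrame({"ds": counts.index, "y": counts.values})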
def _apply_filters(self, log, metric_set):
    """Apply all the filtering specified in a metric set and return the
    resulting observations in the simulation log. Also adds relevant
    performance measures.

    Parameters
    ----------
    log: pd.DataFrame
        The simulation log.
    metric_set: dict
        The metric set as created by this class.

    Returns
    -------
    data: pd.DataFrame
        The filtered log.
    y_col: str
        The name of the column that describes the measure of the
        metric set.
    """
    data = log.copy()

    # apply filters
    for f in self.filters:
        if metric_set[f] is not None:
            progress("Filtering on {}.".format(f), verbose=self.verbose)
            data = self._filter_data(data, self.filter_column_map[f],
                                     metric_set[f])

    if metric_set["first_only"]:
        progress("Keeping only first vehicle per incident.",
                 verbose=self.verbose)
        data.sort_values(self.response_time_col, inplace=True)
        data.drop_duplicates(subset=[self.run_col, self.incident_id_col],
                             inplace=True)
        data.sort_values([self.run_col, self.incident_id_col], inplace=True)

    # add relevant performance measures to data
    if metric_set["measure"] == "response_time":
        y_col = self.response_time_col
    elif metric_set["measure"] == "on_time":
        data["on_time"] = (data[self.response_time_col]
                           <= data[self.target_col])
        y_col = "on_time"
    elif metric_set["measure"] == "delay":
        data["delay"] = data[self.response_time_col] - data[self.target_col]
        y_col = "delay"

    return data, y_col
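# A toy illustration of how the three measures relate, on a fabricated
# two-row log (column names and values are made up for this example).
import pandas as pd

toy_log = pd.DataFrame({"response_time": [540.0, 720.0],
                        "target": [600.0, 600.0]})
toy_log["on_time"] = toy_log["response_time"] <= toy_log["target"]  # True, False
toy_log["delay"] = toy_log["response_time"] - toy_log["target"]     # -60.0, 120.0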
def _create_demand_locations(self):
    """ Initialize demand locations and their building function
    distributions.

    Creates a dictionary of DemandLocation objects. Each such object
    has its own distribution over building functions that is used
    during sampling.
    """
    progress("Getting building function probabilities.",
             verbose=self.verbose)
    building_probs = get_building_function_probabilities(
        self.incidents, locations=self.location_ids)

    progress("Initializing demand locations", verbose=self.verbose)
    self.locations = {l: DemandLocation(l, building_probs[l])
                      for l in building_probs.keys()}
def _get_travel_durations(self):
    """ Use OSRM to find the travel durations between every pair of
    demand locations and stations.
    """
    progress("Creating matrix of travel times...", verbose=self.verbose)
    coord_list = (list(self.demand_locs.values())
                  + list(self.station_locs.values()))
    id_list = (list(self.demand_locs.keys())
               + list(self.station_locs.keys()))

    time_matrix, _, _ = osrm.table(coord_list,
                                   coords_dest=coord_list,
                                   ids_origin=id_list,
                                   ids_dest=id_list,
                                   output='dataframe',
                                   url_config=self.osrm_config)
    return time_matrix
def predict(self, periods=365 * 24, freq="H", save=False, future=None):
    """ Forecast the incident rate using Prophet.

    Notes
    -----
    Can only be called after calling '.fit()'; throws an assertion
    error otherwise. Does not return anything, since its main use
    cases are sampling directly from this predictor and saving
    predictions to file. The result of this method can be obtained by
    calling 'get_forecast()' afterwards.

    Parameters
    ----------
    periods: int
        The number of periods to forecast.
    freq: str
        The frequency to predict the incident rates at. Accepts any
        valid frequency for pd.date_range, such as 'H' (default), 'D',
        or 'M'.
    save: boolean
        Whether to save the forecast to a csv file. Optional, defaults
        to False.
    future: pd.DataFrame, optional (default: None)
        A custom future dataframe with a 'ds' column to predict on. If
        None, one is created with 'make_future_dataframe' using
        'periods' and 'freq'.
    """
    assert self.fitted, "First use 'fit()' to fit a model before predicting."

    if future is None:
        future = self.models_dict[self.types[0]].make_future_dataframe(
            periods=periods, freq=freq, include_history=False)

    forecast_dict = dict(ds=future["ds"].tolist())
    for type_ in self.types:
        progress("Predicting incident rates for {}".format(type_),
                 verbose=self.verbose)
        forecast_dict[type_] = np.maximum(
            0.0, self.models_dict[type_].predict(future)["yhat"].tolist())

    self.forecast = pd.DataFrame(forecast_dict)
    progress("Forecast made.", verbose=self.verbose)

    if save:
        self.save_forecast()
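# Hedged usage sketch: fit and forecast with the Prophet-based predictor.
# 'incidents' is assumed to be the raw incident DataFrame used elsewhere in
# this module; the constructor kwargs mirror those seen in _assign_predictor.
predictor = ProphetIncidentPredictor(load_forecast=False, verbose=True)
predictor.fit(incidents)
predictor.predict(periods=7 * 24, freq="H", save=False)  # one week ahead, hourly
forecast = predictor.get_forecast()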
def _filter_data(self, data, remove_unfinished_month=True, last_n_years=5):
    """Filter the data down to a clean period for rate estimation:
    optionally drop the last, unfinished month and keep only the most
    recent 'last_n_years' years.
    """
    data[self.date_col] = pd.to_datetime(data[self.date_col], dayfirst=True)
    end = data[self.date_col].max()

    if remove_unfinished_month:
        cutoff = pd.Timestamp(year=end.year, month=end.month, day=1, hour=0)
        progress("Cutting off at {}.".format(cutoff), verbose=self.verbose)
        data = data[data[self.date_col] < cutoff].copy()
    else:
        cutoff = end

    if last_n_years:
        start = pd.Timestamp(year=(cutoff.year - last_n_years),
                             month=cutoff.month, day=cutoff.day,
                             hour=cutoff.hour)
        progress("Using incidents after {}.".format(start),
                 verbose=self.verbose)
        data = data[data[self.date_col] >= start].copy()

    progress("Data filtered.", verbose=self.verbose)
    return data
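# Toy check of the cutoff arithmetic above (dates fabricated): with data
# ending mid-March 2019, the unfinished month is dropped and exactly five
# full years remain.
import pandas as pd

end = pd.Timestamp("2019-03-17 14:00")
cutoff = pd.Timestamp(year=end.year, month=end.month, day=1, hour=0)
start = pd.Timestamp(year=cutoff.year - 5, month=cutoff.month,
                     day=cutoff.day, hour=cutoff.hour)
print(cutoff, start)  # 2019-03-01 00:00:00 2014-03-01 00:00:00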
def _evaluate_metric_set(self, log, metric_set):
    """Evaluate a set of metrics relating to a single measure.

    Parameters
    ----------
    log: pd.DataFrame
        The log of simulation outputs.
    metric_set: dict
        The description of the metrics to calculate as created in
        :code:`.add_metric()`.

    Returns
    -------
    result: pd.DataFrame
        The calculated metrics.
    """
    data, y_col = self._apply_filters(log, metric_set)

    # calculate metrics
    progress("Calculating requested metrics.", verbose=self.verbose)
    if self.by_run:
        results_per_run = self._calculate_descriptors_by_run(
            data, y_col=y_col,
            count=metric_set["count"],
            mean=metric_set["mean"],
            std=metric_set["std"],
            missing=metric_set["missing"],
            quantiles=metric_set["quantiles"])
        results_per_run.drop(self.run_col, axis=1, inplace=True)
        return results_per_run
    else:
        results = self._calculate_descriptors(
            data[y_col],
            count=metric_set["count"],
            mean=metric_set["mean"],
            std=metric_set["std"],
            missing=metric_set["missing"],
            quantiles=metric_set["quantiles"])
        return results
def __init__(self, demand_locs=None, station_locs=None,
             osrm_host="http://192.168.56.101:5000", load_matrix=True,
             save_matrix=False, data_dir="data", verbose=True):
    """ Create the matrix of travel durations with OSRM. """
    self.osrm_host = osrm_host
    self.demand_locs = demand_locs
    self.station_locs = station_locs
    self.verbose = verbose
    self.path = os.path.join(data_dir, "time_matrix.csv")

    if load_matrix:
        self.time_matrix_df = self.load_time_matrix(self.path)
    else:
        try:
            global osrm
            import osrm
            osrm.RequestConfig.host = self.osrm_host
            self.osrm_config = osrm.RequestConfig
            self.time_matrix_df = self._get_travel_durations()
        except ImportError:
            raise ImportError(
                "If load_matrix=False, OSRM is required to calculate the "
                "travel durations. Either use load_matrix=True or install"
                " the osrm Python package.")

    self._prepare_dispatch_information()

    if save_matrix:
        self.save_time_matrix(self.path)

    progress("Dispatcher ready to go.", verbose=self.verbose)
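# Hedged construction sketch: the enclosing class name is not shown in this
# excerpt, so 'Dispatcher' is a placeholder. The coordinate dicts are assumed
# to map ids to (longitude, latitude) pairs; the sample values are fabricated.
demand_locs = {"13780001": (4.895, 52.370)}
station_locs = {"VICTOR": (4.892, 52.373)}
dispatcher = Dispatcher(demand_locs=demand_locs, station_locs=station_locs,
                        load_matrix=True, data_dir="data", verbose=True)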
def _create_incident_types(self):
    """ Initialize incident types with their characteristics.

    Creates a dictionary of IncidentType objects. Every such object
    holds type-specific distributions about priority, required
    vehicles, and demand locations.
    """
    progress("Getting priority probabilities.", verbose=self.verbose)
    prio_probs = get_prio_probabilities_per_type(self.incidents)

    progress("Getting vehicle requirement probabilities.",
             verbose=self.verbose)
    vehicle_probs = get_vehicle_requirements_probabilities(
        self.incidents, self.deployments, self.vehicle_types)

    progress("Getting spatial distributions.", verbose=self.verbose)
    location_probs = get_spatial_distribution_per_type(
        self.incidents, locations=self.location_ids)

    progress("Initializing incident types.", verbose=self.verbose)
    self.incident_types = {
        t: IncidentType(prio_probs[t], vehicle_probs[t], location_probs[t])
        for t in self.types
    }
def evaluate(self, log):
    """Evaluate a given simulation output on all set metrics.

    Parameters
    ----------
    log: pd.DataFrame
        The raw simulation output/log.

    Returns
    -------
    result_dict: dict
        A dictionary with one pd.DataFrame of calculated metrics per
        metric set, keyed by metric set name.
    """
    progress("Evaluating {} sets of metrics.".format(
        len(self.metric_set_names)), verbose=self.verbose)

    result_dict = {}
    for name in self.metric_set_names:
        progress("Evaluating {}.".format(name), verbose=self.verbose)
        result_dict[name] = self._evaluate_metric_set(
            log, self.metric_sets[name])

    progress("Evaluation completed.", verbose=self.verbose)
    return result_dict
def predict(self, start, end, predict_nye=True, save=False):
    """Forecast arrival rates for a given future period and save the
    result under 'self.forecast'.

    Parameters
    ----------
    start, end: datetime object
        The start and end dates and times (rounded to the whole hour)
        of the period to forecast.
    predict_nye: boolean, optional (default: True)
        Whether to predict NYE with high activity like in reality
        (True) or to ignore it and forecast a regular day instead
        (False).
    save: boolean, optional (default: False)
        Whether to save the forecast to a csv file.
    """
    assert self.fitted, "First use the 'fit' method before making predictions."

    def replace_with_other(df1, df2, match_cols, fill_cols):
        """Fill one dataframe with values from another, based on
        specified columns."""
        assert len(match_cols) == 3, \
            "This function needs three columns to match on."
        for i in range(len(df2)):
            mask = ((df1[match_cols[0]] == df2[match_cols[0]].iloc[i])
                    & (df1[match_cols[1]] == df2[match_cols[1]].iloc[i])
                    & (df1[match_cols[2]] == df2[match_cols[2]].iloc[i]))
            df1.loc[mask, fill_cols] = df2[fill_cols].iloc[i, :].values
        return df1

    # create dataframe with requested date range
    indx = pd.date_range(start=start, end=end, freq="H")
    df = pd.DataFrame({"ds": pd.Series(indx)})
    df[self.month_col] = df["ds"].dt.month
    df[self.day_col] = df["ds"].apply(lambda x: x.isoweekday())
    df[self.month_day_col] = df["ds"].dt.day
    df[self.hour_col] = df["ds"].dt.hour

    types = self.lambdas.columns
    for type_ in types:
        df[type_] = np.nan

    lambdas = self.lambdas.copy()
    lambdas.reset_index(drop=False, inplace=True)

    # fill with the overall patterns/lambdas
    progress("Filling future DataFrame..", verbose=self.verbose)
    cols = [self.month_col, self.day_col, self.hour_col]
    df = replace_with_other(df, lambdas, cols, types)
    progress("DataFrame filled with general patterns (shape: {}).".format(
        df.shape), verbose=self.verbose)

    # fill NYEs with high activity if requested
    if predict_nye:
        progress("Filling future New Year's Eves.", verbose=self.verbose)
        cols = [self.month_col, self.month_day_col, self.hour_col]
        nye = self.nye_lambdas.copy()
        nye.reset_index(drop=False, inplace=True)
        df = replace_with_other(df, nye, cols, types)
        msg = ("New Year's Eve forecasts added to DataFrame (shape: {})."
               .format(df.shape))
        progress(msg, verbose=self.verbose)

    # remove added columns
    df.drop([self.month_col, self.day_col, self.month_day_col,
             self.hour_col], axis=1, inplace=True)

    self.forecast = df
    progress("Forecast created.", verbose=self.verbose)

    if save:
        self.save_forecast()
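# Hedged usage sketch: forecast January 2020 at hourly resolution with a
# previously fitted forecaster; the 'forecaster' variable is illustrative.
import datetime

forecaster.predict(start=datetime.datetime(2020, 1, 1, 0),
                   end=datetime.datetime(2020, 1, 31, 23),
                   predict_nye=True, save=False)
print(forecaster.forecast.head())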
def fit(self, data, last_n_years=8, fit_nye=True):
    """Obtain arrival rates from the data.

    Fits arrival rates per incident type, month, day of the week, and
    hour of the day. Saves the results under self.lambdas and
    self.nye_lambdas (if fit_nye == True). Sets self.fitted = True
    when the fit procedure is completed.

    Parameters
    ----------
    data: pd.DataFrame
        The incident data.
    last_n_years: int, optional (default: 8)
        How many years to use to estimate the arrival rates. It uses
        the latest 'last_n_years' years.
    fit_nye: boolean, optional (default: True)
        Whether to fit New Year's Eve separately (True) or to treat it
        as a regular day (False).
    """
    progress("Start fitting arrival rates.", verbose=self.verbose)

    # prepare data
    data = self._filter_data(data, last_n_years=last_n_years)
    # map Dutch day names to ISO weekday numbers
    data[self.day_col] = data[self.day_name_col].map(
        {"Maandag": 1, "Dinsdag": 2, "Woensdag": 3, "Donderdag": 4,
         "Vrijdag": 5, "Zaterdag": 6, "Zondag": 7})
    for col in [self.month_col, self.day_col, self.hour_col,
                self.month_day_col]:
        data[col] = data[col].astype(float).astype(int)

    # obtain lambdas
    progress("Obtaining lambdas..", verbose=self.verbose)
    lambdas = (data.groupby([self.type_col, self.month_col])
                   .apply(lambda x: self._get_incidents_per_hour_of_week(
                       x, x.name[1])))

    # reindex on a complete set of types, months, and weekdays
    new_index = pd.MultiIndex.from_product(
        [data[self.type_col].unique(), np.arange(1, 13), np.arange(1, 8)],
        names=[self.type_col, self.month_col, self.day_col])
    lambdas = lambdas.reindex(new_index, fill_value=0)

    # stack the hour columns and use types as columns instead
    self.lambdas = lambdas.stack().unstack(self.type_col, fill_value=0)
    progress("Lambdas obtained.", verbose=self.verbose)

    if fit_nye:
        progress("Fitting New Year's Eve.", verbose=self.verbose)
        self.nye_lambdas = self._get_incidents_at_nye(data)
        progress("New Year's Eve arrival rates fitted.",
                 verbose=self.verbose)

    progress("Fit completed.", verbose=self.verbose)
    self.fitted = True
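# Shape intuition for the stack/unstack step above, on a fabricated frame:
# rows indexed by (type, month, weekday) with one column per hour become rows
# indexed by (month, weekday, hour) with one column per incident type.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [["Binnenbrand", "Buitenbrand"], [1], [1, 2]],
    names=["type", "month", "weekday"])
hours = pd.DataFrame(np.ones((4, 24)), index=idx,
                     columns=pd.RangeIndex(24, name="hour"))
result = hours.stack().unstack("type", fill_value=0)
print(result.shape)  # (48, 2): (month, weekday, hour) rows x 2 type columns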
def fit(self, incidents=None, deployments=None, stations=None,
        loc_coords=None, vehicle_types=["TS", "RV", "HV", "WO"],
        osrm_host="http://192.168.56.101:5000", save_prepared_data=False,
        location_col="hub_vak_bk",
        volunteer_stations=["DRIEMOND", "DUIVENDRECHT",
                            "AMSTELVEEN VRIJWILLIG"]):
    """ Fit random variables related to response time.

    Parameters
    ----------
    incidents: pd.DataFrame (optional)
        The incident data. Only required when no prepared data is
        loaded.
    deployments: pd.DataFrame (optional)
        The deployment data. Only required when no prepared data is
        loaded.
    stations: pd.DataFrame (optional)
        The station information including coordinates and station
        names. Only required when no prepared data is loaded.
    loc_coords: dict (optional)
        A map from location id to coordinates. If None, location
        coordinates are extracted from the prepared data.
    vehicle_types: array-like of strings
        The types of vehicles to use. Defaults to
        ["TS", "RV", "HV", "WO"].
    osrm_host: str
        The URL to the OSRM API. Required when the object is
        initialized with load_data=False or when no prepared data was
        found.
    save_prepared_data: boolean
        Whether to write the preprocessed data to a csv file so that
        it can be loaded the next time. Defaults to False.
    location_col: str
        The name of the column that specifies the demand locations.
        Defaults to "hub_vak_bk".
    volunteer_stations: array-like of str, optional
        The names of the stations that are run by volunteers. Turn-out
        times are fitted separately for these stations, since
        volunteers have to travel to the station first.

    Notes
    -----
    Performs the following steps:

    - Prepares data (merges and adds OSRM distance and duration per
      deployment).
    - Fits lognormal random variables to dispatch times per incident
      type.
    - Fits Gamma random variables to turn-out time per station and
      type.
    - Models the travel time as
      :math:`\\alpha + \\beta \\cdot \\gamma(\\theta, k) \\cdot \\hat{t}`
      per vehicle type. Here :math:`\\hat{t}` represents the OSRM
      estimate of the travel time and :math:`\\gamma` is a random
      noise factor.
    - Saves the station and demand location coordinates in
      dictionaries.
    """
    self.location_col = location_col
    if self.data is None:
        if (incidents is not None and deployments is not None
                and stations is not None):
            progress("No data loaded, preprocess with OSRM.",
                     verbose=self.verbose)
            self._prep_data_for_fitting(incidents=incidents,
                                        deployments=deployments,
                                        stations=stations,
                                        vehicles=vehicle_types,
                                        osrm_host=osrm_host,
                                        save=save_prepared_data)
        else:
            raise ValueError("No prepared data loaded and not all data "
                             "fed to 'fit()'.")

    if loc_coords is not None:
        progress("Location coordinates provided. "
                 "Extracting station coordinates.", verbose=self.verbose)
        self.location_coords = loc_coords
        _, self.station_coords = get_coordinates_locations_stations(
            self.data, location_col=location_col)
    else:
        progress("Extracting station and location coordinates.",
                 verbose=self.verbose)
        self.location_coords, self.station_coords = \
            get_coordinates_locations_stations(self.data,
                                               location_col=location_col)

    progress("Fitting random variables on response time...",
             verbose=self.verbose)
    self.high_prio_data = (self.data[
        (self.data["dim_prioriteit_prio"] == 1)
        & (self.data["inzet_terplaatse_volgnummer"] == 1)].copy())

    self.dispatch_rv_dict = fit_dispatch_times(self.high_prio_data)
    self.turnout_time_rv_dict = fit_turnout_times(
        self.data, vehicle_types=vehicle_types,
        volunteer_stations=volunteer_stations)
    self.travel_time_dict = model_travel_time_per_vehicle(
        self.high_prio_data)
    self.onscene_time_rv_dict = fit_onscene_times(self.data)

    progress("Creating response time generators.", verbose=self.verbose)
    self._create_response_time_generators()

    progress("Response time variables fitted.", verbose=self.verbose)
    self.fitted = True
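# A minimal sketch (assumed, not the module's code) of how the travel time
# model from the Notes could be sampled once alpha, beta, and the Gamma noise
# parameters are known for a vehicle type.
import numpy as np

def sample_travel_time(osrm_estimate, alpha, beta, shape, scale, rng=None):
    """Draw alpha + beta * Gamma(shape, scale) * t_hat."""
    if rng is None:
        rng = np.random.default_rng()
    noise = rng.gamma(shape, scale)
    return alpha + beta * noise * osrm_estimate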
def save_forecast(self):
    """ Save forecasted incident rate to csv. """
    path = os.path.join(self.fc_dir, self.file_name)
    self.forecast.to_csv(path, index=False)
    progress("Forecast saved to {}.".format(path), verbose=self.verbose)
def create_sampling_dict(self, start_time=None, end_time=None,
                         incident_types=None):
    """Create a dictionary that can conveniently be used for sampling
    random incidents based on the forecast.

    Parameters
    ----------
    start_time: Timestamp or str convertible to Timestamp
        The earliest time that should be included in the dictionary.
    end_time: Timestamp or str convertible to Timestamp
        The latest time that should be included in the dictionary.
    incident_types: array-like of strings
        The incident types to forecast for. Defaults to None. If None,
        uses all incident types in the forecast.

    Returns
    -------
    sampling_dict: dict
        The sampling dictionary as described below.

    Notes
    -----
    Stores three results:

    - self.sampling_dict, a dictionary like:
      `{t -> {'type_distribution' -> probs,
              'beta' -> expected interarrival time in minutes,
              'lambda' -> expected number of incidents per time unit,
              'time' -> the timestamp corresponding to start_time + t}}`
      where t is an integer representing the number of time units
      since start_time.
    - self.sampling_start_time, timestamp of the earliest time in the
      dictionary.
    - self.sampling_end_time, timestamp of the latest time in the
      dictionary.
    """
    assert self.forecast is not None, \
        ("No forecast available, initiate with load_forecast=True "
         "or use .fit() and .predict() to create one.")

    # determine incident types
    if incident_types is not None:
        fc = self.forecast[["ds"] + list(incident_types)].copy()
    else:
        fc = self.forecast.copy()

    # determine start and end times
    fc["ds"] = pd.to_datetime(fc["ds"], dayfirst=True)
    if start_time is None:
        start_time = fc["ds"].min()
    if end_time is None:
        end_time = fc["ds"].max()

    msg = "Creating a sampling dictionary from {} to {}.".format(
        start_time, end_time)
    progress(msg, verbose=self.verbose)

    # process date time range and remove it from the forecast
    fc = fc[(fc["ds"] >= start_time) & (fc["ds"] <= end_time)]
    timestamps = fc["ds"].copy()
    del fc["ds"]

    # create the dictionary
    rates_dict = fc.reset_index(drop=True).T.to_dict(orient="list")
    self.sampling_dict = {}
    for i, rts in rates_dict.items():
        self.sampling_dict[i] = {
            "type_distribution": np.array(rts) / np.sum(rts),
            "beta": 1 / np.sum(rts) * 60,
            "lambda": np.sum(rts),
            "time": timestamps.iloc[i]
        }

    # save start and end time for future reference
    self.sampling_start_time = start_time
    self.sampling_end_time = end_time
    progress("Sampling dictionary created.", verbose=self.verbose)
    return self.sampling_dict
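# Hedged sketch: drawing one interarrival time and incident type from a
# single entry of the dictionary returned above. Assumes hourly forecasts,
# so 'beta' is the expected gap in minutes within that time unit.
import numpy as np

rng = np.random.default_rng()
entry = sampling_dict[0]                       # first simulated time unit
gap_minutes = rng.exponential(entry["beta"])   # Poisson-process interarrival
incident_type = rng.choice(len(entry["type_distribution"]),
                           p=entry["type_distribution"])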
def add_metric(self, measure, name=None, description=None, count=True,
               mean=True, std=True, missing=True,
               quantiles=[0.5, 0.75, 0.90, 0.95, 0.98, 0.99],
               prios=None, locations=None, vehicles=None,
               incident_types=None, objects=None, hours=None,
               days_of_week=None, first_only=False):
    """Add metrics that should be evaluated.

    Parameters
    ----------
    measure: str, one of ["response_time", "on_time", "delay"]
        The measure to evaluate.
    name: str, optional, default=None
        How to name the set of metrics for reference in outputs. If
        None, a standard name is given (i.e., 'metric_set_1',
        'metric_set_2').
    description: str, optional, default=None
        A description of the set of evaluation metrics. This can be
        used to explain, e.g., the applied filtering in a more
        elaborate way, whereas the 'name' property should be kept
        concise.
    count, mean, std, missing: boolean, optional, default=True
        Whether to describe the measure by its count, mean, standard
        deviation, and proportion of missing (NaN) values. Note that a
        missing response time means the response was carried out by an
        external vehicle.
    quantiles: array(float), optional, default=[0.5, 0.75, 0.90, 0.95, 0.98, 0.99]
        Which quantiles to describe the measure with. Set to None to
        not use any quantiles.
    prios: int or array-like of ints, optional, default=None
        Which priority levels to include during evaluation. If None,
        uses all levels.
    locations, vehicles, incident_types, objects: array(str), optional, default=None
        Which locations, vehicle types, incident types, and object
        functions to include during evaluation. If None, uses all
        values.
    hours: array-like of ints or None, optional, default=None
        Which hours of the day to incorporate during evaluation.
        Values must be integers in [0, 23].
    days_of_week: array-like of ints or None, optional, default=None
        Which days of the week to incorporate during evaluation.
        Monday = 0, ..., Sunday = 6.
    first_only: boolean, optional, default=False
        Whether to calculate the metrics for only the first arriving
        vehicle per incident (True) or to evaluate all vehicles
        (False).
    """
    if name is None:
        if len(self.metric_set_names) == 0:
            i = 1
        else:
            # parse the index from names like 'metric_set_12'
            i = int(np.max([int(n.split("_")[-1])
                            for n in self.metric_set_names]) + 1)
        name = "metric_set_{}".format(i)

    assert measure in self.measures, \
        "'measure' must be one of {}. Received {}".format(self.measures,
                                                          measure)
    self.metric_set_measures[name] = measure

    if locations is not None:
        locations = np.array(locations, dtype=str)

    self.metric_sets[name] = {
        "count": count,
        "mean": mean,
        "std": std,
        "missing": missing,
        "quantiles": quantiles,
        "locations": locations,
        "prios": prios,
        "vehicles": vehicles,
        "incident_types": incident_types,
        "objects": objects,
        "hours": hours,
        "days_of_week": days_of_week,
        "first_only": first_only,
        "description": description,
        "measure": measure
    }

    self.metric_set_names.append(name)
    progress("Set of metrics '{}' added.".format(name),
             verbose=self.verbose)
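# Hedged usage sketch: an evaluator instance (its class name is not shown in
# this excerpt) configured with two metric sets and run on a simulation log.
evaluator.add_metric("on_time", name="prio1_on_time", prios=1,
                     first_only=True, quantiles=None,
                     description="fraction of prio-1 incidents reached on time")
evaluator.add_metric("response_time", name="all_response_times",
                     quantiles=[0.5, 0.95])
results = evaluator.evaluate(simulation_log)  # dict of DataFrames, keyed by name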