def compute_station_river_distances():
    """Compute and store the nearest station of each source for every run.

    For each run returned by ``Repository.get_all_runs`` the distance from
    the run's put-in coordinates to every station returned by
    ``Repository.get_all_stations`` is computed with
    ``get_distance_between_geo_points``. The closest USGS, NOAA and SNOW
    stations are then written as ``StationRiverDistance`` records through
    ``Repository.put_station_river_distances`` (one call per run).

    A source that has no stations is skipped for that run instead of
    raising ``IndexError``, and nothing is written when there are no
    stations at all.
    """
    repo = Repository()
    runs = repo.get_all_runs()
    stations = repo.get_all_stations()

    # Without stations there is nothing to measure against, and the
    # 'distance' column used for sorting below would not exist.
    if stations.empty:
        return

    for _, run in runs.iterrows():
        # One row per station. The helper's result is expanded with
        # pd.Series; it is assumed to provide 'distance', 'station' and
        # 'source' entries because those columns are read below.
        distances = stations.apply(
            lambda station: get_distance_between_geo_points(
                run.put_in_latitude,
                run.put_in_longitude,
                station.latitude,
                station.longitude,
                run.run_id,
                station.station_id,
                station.source,
            ),
            axis=1,
        ).apply(pd.Series)
        distances = distances.sort_values('distance')

        nearest = []
        for source in ('USGS', 'NOAA', 'SNOW'):
            candidates = distances[distances.source == source]
            if candidates.empty:
                # No station of this source; nothing to record for it.
                continue
            # Rows are sorted by distance, so the first one is the closest.
            closest = candidates.iloc[0]
            nearest.append(
                StationRiverDistance(
                    station_id=closest.station,
                    run_id=run.run_id,
                    distance=round(float(closest.distance), 2),
                )
            )

        if nearest:
            repo.put_station_river_distances(nearest)
class Arima:
    """Creates predictions for future flow rate using ARIMA model

    Args:
        session: (Session) db session
    """

    def __init__(self, session):
        self.repo = Repository(session)

    def get_data(self, run_id, metric_ids=None):
        """Retrieve roughly four years of measurements for a run.

        The window ends at midnight (local, naive) of the current day and
        starts ``4 * 365`` days earlier. The query itself is delegated to
        ``Repository.get_measurements``.

        Args:
            run_id (int): id of run for which model will be created
            metric_ids ([str]) - optional: list of metric ids to include

        Returns:
            DataFrame: whatever ``Repository.get_measurements`` returns for
            the run within that window
        """
        now = datetime.datetime.now()
        # Truncate to midnight so the window ends at the start of today.
        end = datetime.datetime(now.year, now.month, now.day)
        start = end - datetime.timedelta(days=4 * 365)

        return self.repo.get_measurements(
            run_id=run_id,
            start_date=start,
            end_date=end,
            metric_ids=metric_ids,
        )

    @staticmethod
    def _daily_metric(time_series, metric_id, how):
        """Resample the rows of one metric to daily values.

        Args:
            time_series (DataFrame): measurements with ``metric_id`` and
                ``date_time`` columns
            metric_id (str): metric to select
            how (str): name of the resampler aggregation, 'sum' or 'mean'

        Returns:
            DataFrame: daily aggregate indexed by UTC date
        """
        # Work on an explicit copy: assigning a column / index on a
        # boolean-filtered slice triggers pandas' chained-assignment warning.
        metric = time_series[time_series.metric_id == metric_id].copy()
        metric['date_time'] = pd.to_datetime(metric['date_time'], utc=True)
        metric.index = metric['date_time']
        return getattr(metric.resample('D'), how)()

    @staticmethod
    def _repeat_last_flow(measures, days=7):
        """Return the most recent flow value repeated ``days`` times."""
        last_day = measures.iloc[-1, :].to_frame().T
        return pd.concat([last_day] * days, ignore_index=True)['flow']

    def daily_avg(self, run_id):
        """Creates dataframe needed for modelling

        Calls Arima.get_data for metrics '00003', '00060' and '00001' and
        builds one daily frame: precipitation ('00003') is summed per day,
        flow ('00060') and temperature ('00001') are averaged per day. The
        three daily frames are inner-joined on date and rows containing NaN
        are dropped.

        Args:
            run_id (int): id of run for which model will be created

        Returns:
            DataFrame: daily measurements with columns 'temp', 'flow' and
            'precip', or None when no measurements were found
        """
        time_series = self.get_data(
            run_id=run_id, metric_ids=['00003', '00060', '00001'])
        if len(time_series) == 0:
            return None

        precip_daily = self._daily_metric(time_series, '00003', 'sum')
        flow_daily = self._daily_metric(time_series, '00060', 'mean')
        temp_daily = self._daily_metric(time_series, '00001', 'mean')

        time_series_daily = (
            temp_daily
            .merge(flow_daily, how='inner', left_index=True, right_index=True)
            .merge(precip_daily, how='inner', left_index=True,
                   right_index=True)
        )
        # NOTE(review): this rename assumes each resampled frame contributes
        # exactly one column to the merge - confirm against the schema
        # returned by Repository.get_measurements.
        time_series_daily.columns = ['temp', 'flow', 'precip']
        return time_series_daily.dropna()

    def arima_model(self, run_id):
        """Creates flow rate predictions using ARIMA model.

        Calls Arima.daily_avg to retrieve data for given run, selects the
        ARMA order with statsmodels' arma_order_select_ic (AIC) and fits an
        ARIMA model on flow with temperature and precipitation as exogenous
        predictors. The future exogenous values are the mean of the last
        seven available days. If order selection raises ValueError or the
        fit/forecast raises any exception, the most recent flow value is
        repeated for seven days instead.

        Args:
            run_id (int): id of run for which model will be created

        Returns:
            DataFrame: seven forecast values preceded by recent historical
            flow values. An empty DataFrame is returned when fewer than two
            daily rows are available.

            NOTE(review): on the fallback path the forecast is a Series, so
            the concatenated result may be a Series rather than a DataFrame -
            confirm callers handle both.
        """
        # Retrieve data for modelling
        measures = self.daily_avg(run_id)

        # Don't try to compute without data. At least two rows are needed
        # because the prediction dates are anchored on measures.index[-2].
        if measures is None or len(measures) < 2:
            return pd.DataFrame()

        # Take past 7-day average of exogenous predictors to use for
        # future prediction
        recent_mean = measures.iloc[-7:, :].mean(axis=0).to_frame().T
        exog_future_predictors = pd.concat([recent_mean] * 7,
                                           ignore_index=True)

        try:
            # Find optimal order for model
            params = arma_order_select_ic(measures['flow'], ic='aic')
        except ValueError:
            # If order fitting doesn't converge, return "prediction"
            # of most recent day
            prediction = self._repeat_last_flow(measures)
        else:
            try:
                # Build and fit model
                mod = ARIMA(
                    measures['flow'],
                    order=(params.aic_min_order[0], 0,
                           params.aic_min_order[1]),
                    exog=measures[['temp', 'precip']],
                ).fit()
                # Only the first element of the forecast result is used.
                prediction = pd.DataFrame([
                    mod.forecast(
                        steps=7,
                        exog=exog_future_predictors[['temp', 'precip']],
                        alpha=0.05)[0]
                ]).T
            except Exception:
                # Deliberate best effort: if the model doesn't converge (or
                # fails for any other reason), return "prediction" of most
                # recent day
                prediction = self._repeat_last_flow(measures)

        # Add dates and return recent history for plotting.
        # NOTE(review): dates start at the second-to-last observation and
        # only 20 historical rows (positions -22..-3) are kept; confirm this
        # offset is intended.
        prediction_dates = [
            measures.index[-2] + datetime.timedelta(days=x)
            for x in range(0, 7)
        ]
        prediction.index = prediction_dates

        past = measures['flow'][-22:-1]
        return pd.concat([past[:-1], prediction], axis=0)

    def get_min_max(self, run_id):
        """Gets min and max runnable flow rate for river run to use for plots

        Args:
            run_id: id of run for which model will be created

        Returns:
            DataFrame: the 'min_level' and 'max_level' columns of the rows
            from ``Repository.get_all_runs`` whose 'run_id' equals ``run_id``
        """
        runs = self.repo.get_all_runs()
        return runs[['min_level', 'max_level']][runs['run_id'] == run_id]