def upload_data_from_file(csv_file, from_csv=False):
    """ insert all records contained in file to database

    Args:
        csv_file (str): full path of CSV file containing records
        from_csv (bool): whether to insert into database using CSV or ORM (CSV scales better)

    Returns:
        bool: success/exception
    """
    r = Repository()

    if from_csv:
        success = r.put_measurements_from_csv(csv_file=csv_file)

    else:
        measurements = []
        with open(csv_file, "r") as f:
            for line in f:
                site_id, param_code, date_time, value = line.strip().split(",")
                measurement = Measurement(
                    station_id=site_id,
                    metric_id=param_code,
                    date_time=dateutil.parser.parse(date_time),
                    value=float(value)
                )
                measurements.append(measurement)
        success = r.put_measurements_from_list(measurements=measurements)

    return success
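
A hedged usage sketch; the path below is hypothetical, and the row layout
(site_id,param_code,date_time,value) is inferred from the parsing loop above.

# hypothetical file path; each row must parse as site_id,param_code,date_time,value
success = upload_data_from_file('/path/to/measurements.csv', from_csv=False)
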
def put_24hr_observations(session):
    """get yesterdays observations

    Args
        session (Session): database session
    """
    # create a repo and pull all the weather stations from NOAA
    repo = Repository(session)
    stations = repo.get_all_stations(source='NOAA')

    # setup the day to retrieve
    yesterday = dt.datetime.now() - dt.timedelta(hours=24)
    yesterday = dt.datetime(year=yesterday.year, month=yesterday.month, day=yesterday.day)

    # apply the api request to each station
    content = stations.apply(
        lambda station: make_station_observation_request(station, yesterday.isoformat()),
        axis=1
    ).values

    # put them all in the db
    added = 0
    for station_measurements in content:
        repo.put_measurements_from_list(station_measurements)
        added += len(station_measurements)

    return added
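
A hedged usage sketch, constructing a session the same way fill_noaa_gaps
does below; Context and settings come from the riverrunner package.

# assumes the same Context/settings imports used in the other examples
session = Context(settings.DATABASE).Session()
count = put_24hr_observations(session)
print(f'inserted {count} observations')
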
def get_usgs_site_ids():
    """ retrieve USGS site ids from database

    Returns:
        [str]: list of site ids
    """
    r = Repository()
    sites = r.get_all_stations(source="USGS")
    site_ids = list(sites["station_id"])
    return site_ids
Example #4
def compute_station_river_distances():
    """compute the distance from every river to every weather station"""
    repo = Repository()

    runs = repo.get_all_runs()
    stations = repo.get_all_stations()

    # for each run, find the closest USGS, NOAA, and SNOW stations
    for _, run in runs.iterrows():
        distances = stations.apply(
            lambda row: get_distance_between_geo_points(
                run.put_in_latitude, run.put_in_longitude,
                row.latitude, row.longitude,
                run.run_id, row.station_id, row.source),
            axis=1
        ).apply(pd.Series)

        distances.sort_values('distance', inplace=True)

        usgs_ = distances[distances.source == 'USGS'].iloc[0, :]
        noaa_ = distances[distances.source == 'NOAA'].iloc[0, :]
        snow_ = distances[distances.source == 'SNOW'].iloc[0, :]

        usgs = StationRiverDistance(station_id=usgs_.station,
                                    run_id=run.run_id,
                                    distance=round(float(usgs_.distance), 2))

        noaa = StationRiverDistance(station_id=noaa_.station,
                                    run_id=run.run_id,
                                    distance=round(float(noaa_.distance), 2))

        snow = StationRiverDistance(station_id=snow_.station,
                                    run_id=run.run_id,
                                    distance=round(float(snow_.distance), 2))

        repo.put_station_river_distances([usgs, noaa, snow])
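
get_distance_between_geo_points is not shown in these examples. As a hedged
sketch only, a haversine-style helper along the following lines would fit the
call above; the body and the returned keys are assumptions inferred from how
distances.station, distances.source, and distances.distance are used.

import math

def get_distance_between_geo_points(lat1, lon1, lat2, lon2,
                                    run_id, station_id, source):
    # hypothetical sketch: great-circle (haversine) distance in miles, with
    # the identifiers passed through so .apply(pd.Series) can expand the
    # result into columns
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    a = math.sin((lat2 - lat1) / 2) ** 2 + \
        math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
    distance = 3959 * 2 * math.asin(math.sqrt(a))  # earth radius ~3959 mi
    return {'distance': distance, 'run_id': run_id,
            'station': station_id, 'source': source}
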
def get_noaa_predictions(run_id, session):
    """retrieve NOAA predictions for run

    Args
        run_id (int): run
        session (Session): database session

    Returns
        DataFrame: containing predictions
    """
    repo = Repository(session)
    run = repo.get_run(run_id)

    lat = run.put_in_latitude
    lon = run.put_in_longitude

    r = requests.get(f'https://api.weather.gov/points/{lat},{lon}/forecast/hourly')

    if r.status_code == 200 and len(r.content) > 10:
        return pd.DataFrame(r.json()['properties']['periods'])
    else:
        return None
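
A hedged usage sketch; run_id 42 is hypothetical, and the columns named below
are typical fields of api.weather.gov hourly forecast periods.

preds = get_noaa_predictions(run_id=42, session=session)
if preds is not None:
    # startTime, temperature, and shortForecast are typical period fields
    print(preds[['startTime', 'temperature', 'shortForecast']].head())
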
def fill_noaa_gaps(start_date, end_date, db=settings.DATABASE):
    """use as needed to fill gaps in weather measurements

    Args:
        start_date: the start day, included in API calls
        end_date: the end day, inclusive
    """
    context = Context(db)
    session = context.Session()

    repo = Repository(session)
    stations = repo.get_all_stations(source='NOAA')
    total = 0

    # loop through each day retrieving observations
    while start_date <= end_date:
        content = stations.apply(
            lambda station: make_station_observation_request(station, start_date.isoformat()),
            axis=1
        ).values

        # put them all in the db
        added = 0
        for station_measurements in content:
            try:
                repo.put_measurements_from_list(station_measurements)
            except SQLAlchemyError:
                session.rollback()
                continue
            added += len(station_measurements)

            if station_measurements:
                station = station_measurements[0].station
                print(f'added {len(station_measurements)} measurements for '
                      f'station {station} - {start_date.isoformat()}')

        start_date += dt.timedelta(days=1)
        total += added

    return total
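
A hedged usage sketch for backfilling a specific window; the dates are
arbitrary examples.

# backfill the first week of January 2019, inclusive of both endpoints
total = fill_noaa_gaps(dt.datetime(2019, 1, 1), dt.datetime(2019, 1, 7))
print(f'backfilled {total} total measurements')
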
    @classmethod
    def setUpClass(cls):
        """perform at test class initialization

        Note:
            * ensure only a TContext is used, NEVER a Context, or we'll
              lose all our hard-scraped data
            * any existing data in the mock db will be deleted
            * 5 random addresses are generated because nearly all unit tests
              require addresses to exist as a foreign key dependency
        """
        cls.context = TContext()
        cls.session = cls.context.Session()
        cls.connection = psycopg2.connect(**settings.PSYCOPG_DB_TEST)
        cls.repo = Repository(session=cls.session, connection=cls.connection)

        cls.context.clear_dependency_data(cls.session)
        cls.context.generate_addresses(cls.session)
Example #8
def compute_predictions(session):
    """compute and cache predictions for all runs

    Args:
        session (Session): database connection

    Returns:
        True: if predictions were successfully computed and inserted
        False: otherwise
    """
    try:
        arima = Arima(session)
        repo = Repository(session)

        runs = repo.get_all_runs_as_list()
        for run in runs:
            try:
                predictions = arima.arima_model(run.run_id)

                to_add = [
                    Prediction(run_id=run.run_id,
                               timestamp=pd.to_datetime(d),
                               fr_lb=round(float(p), 1),
                               fr=round(float(p), 1),
                               fr_ub=round(float(p), 1)) for p, d in
                    zip(predictions.values, predictions.index.values)
                ]

                repo.clear_predictions(run.run_id)
                repo.put_predictions(to_add)
                log(f'predictions for {run.run_id}-{run.run_name} added to db')

            except SQLAlchemyError as e:
                log(f'{run.run_id}-{run.run_name} failed - '
                    f'{[str(a) for a in e.args]}')
                session.rollback()

            except Exception as e:
                log(f'predictions for {run.run_id}-{run.run_name} failed - '
                    f'{[str(a) for a in e.args]}')

        return True

    except Exception as e:
        log(f'failed to compute daily predictions - {str(e.args)}')
        return False
Example #9
    def __init__(self, session):
        self.repo = Repository(session)
Example #10
class Arima:
    """
    Creates predictions for future flow rate using ARIMA model

    Args:
        session: (Session) db session
    """
    def __init__(self, session):
        self.repo = Repository(session)

    def get_data(self, run_id, metric_ids=None):
        """Retrieves data for selected run from database for past four years
        from current date using Repository.get_measurements function.

        Args:
            run_id (int): id of run for which model will be created
            metric_ids ([str]) - optional: list of metric ids to include

        Returns:
            DataFrame: containing four years of measurements up to current
            date for the given run
        """
        now = datetime.datetime.now()
        end = datetime.datetime(now.year, now.month, now.day)
        start = end - datetime.timedelta(days=4 * 365)
        test_measures = self.repo.get_measurements(run_id=run_id,
                                                   start_date=start,
                                                   end_date=end,
                                                   metric_ids=metric_ids)
        return test_measures

    def daily_avg(self, run_id):
        """Creates dataframe needed for modelling

        Calls Arima.get_data to retrieve measurements for given run and
        creates a dataframe with daily averages for flow rate and exogenous
        predictors.

        Args:
            run_id (int): id of run for which model will be created

        Returns:
            DataFrame: containing daily measurements
        """
        time_series = self.get_data(run_id=run_id,
                                    metric_ids=['00003', '00060', '00001'])
        if len(time_series) == 0:
            return None

        # each metric is sliced out, indexed by timestamp, and resampled to
        # daily frequency; .copy() avoids pandas' chained-assignment warning
        # when date_time is overwritten below
        precip = time_series[time_series.metric_id == '00003'].copy()
        precip['date_time'] = pd.to_datetime(precip['date_time'], utc=True)
        precip.index = precip['date_time']
        precip_daily = precip.resample('D').sum()

        flow = time_series[time_series.metric_id == '00060'].copy()
        flow['date_time'] = pd.to_datetime(flow['date_time'], utc=True)
        flow.index = flow['date_time']
        flow_daily = flow.resample('D').mean()

        temp = time_series[time_series.metric_id == '00001'].copy()
        temp['date_time'] = pd.to_datetime(temp['date_time'], utc=True)
        temp.index = temp['date_time']
        temp_daily = temp.resample('D').mean()

        time_series_daily = temp_daily\
            .merge(flow_daily,
                   how='inner',
                   left_index=True,
                   right_index=True) \
            .merge(precip_daily,
                   how='inner',
                   left_index=True,
                   right_index=True)
        time_series_daily.columns = ['temp', 'flow', 'precip']
        time_series_daily = time_series_daily.dropna()
        return time_series_daily

    def arima_model(self, run_id):
        """Creates flow rate predictions using ARIMA model.

        Calls Arima.daily_avg to retrieve data for given run, then creates
        flow rate predictions by using statsmodels functions
        arma_order_select_ic and ARIMA. Three weeks of past flow rate data
        are also returned for plotting purposes.

        Args:
            run_id (int): id of run for which model will be created

        Returns:
            DataFrame: containing time-series flow rate predictions for next
            7 days and historical flow rate for past 21 days
        """
        # Retrieve data for modelling
        measures = self.daily_avg(run_id)

        # don't try to compute if there aren't any measures
        if measures is None:
            return pd.DataFrame()

        # Take past 7-day average of exogenous predictors to use for
        # future prediction
        exog_future_predictors = pd.concat(
            [measures.iloc[-7:, :].mean(axis=0).to_frame().T] * 7,
            ignore_index=True)

        try:
            # Find optimal order for model
            params = arma_order_select_ic(measures['flow'], ic='aic')
            try:
                # Build and fit model
                mod = ARIMA(measures['flow'],
                            order=(params.aic_min_order[0], 0,
                                   params.aic_min_order[1]),
                            exog=measures[['temp', 'precip']]).fit()

                prediction = pd.DataFrame([
                    mod.forecast(
                        steps=7,
                        exog=exog_future_predictors[['temp', 'precip']],
                        alpha=0.05)[0]
                ]).T
            except Exception:
                # If model doesn't converge, return "prediction"
                # of most recent day
                prediction = pd.concat([measures.iloc[-1, :].to_frame().T] * 7,
                                       ignore_index=True)['flow']
        except ValueError:
            # If order fitting doesn't converge, return "prediction"
            # of most recent day
            prediction = pd.concat([measures.iloc[-1, :].to_frame().T] * 7,
                                   ignore_index=True)['flow']

        # Add dates and return past 21 days for plotting
        prediction_dates = [
            measures.index[-2] + datetime.timedelta(days=x)
            for x in range(0, 7)
        ]
        prediction.index = prediction_dates
        past = measures['flow'][-22:-1]
        prediction = pd.concat([past[:-1], prediction], axis=0)

        return prediction

    def get_min_max(self, run_id):
        """Gets min and max runnable flow rate for river run to use for plots

        Args:
            run_id: id of run for which model will be created

        Returns:
            levels: minimum and maximum runnable flow rate for river
        """
        runs = self.repo.get_all_runs()
        levels = runs[['min_level', 'max_level']][runs['run_id'] == run_id]
        return levels
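
A hedged usage sketch of the class; run_id 42 is hypothetical and an open
SQLAlchemy session is assumed.

arima = Arima(session)
forecast = arima.arima_model(run_id=42)  # 7-day forecast plus recent history
levels = arima.get_min_max(run_id=42)    # runnable min/max flow for plots
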
Example #11
from riverrunner.repository import Repository
from riverrunner import settings

# IP address for running application
HOST_IP = '192.168.80.13'

# enable for application debugging features
DEBUG = False

# mapping from river's predicted status to a color code
COLOR_MAP = dict(unknown='#41434C',
                 optimal='#4254CC',
                 fair='#8F8A18',
                 not_recommended='#A63617')

repo = Repository()
runs = repo.get_all_runs_as_list()
runs = [run for run in runs if run.todays_runability != -2]
options = [r.select_option for r in runs]
options.sort(key=lambda r: r['label'])

# create a new Dash app adding custom fonts and CSS
app = dash.Dash()
font_url = 'https://fonts.googleapis.com/css?family=Montserrat|Permanent+Marker'
app.css.append_css({'external_url': font_url})


def color_scale(x):
    """prediction binning

    method bins river predictions into discrete categories for color coding
    """
Example #12
    test_model: runs stationarity tests and acf/pacf tests and then
    creates ARIMA model for one run and plots results
"""

import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import arma_order_select_ic
from riverrunner.repository import Repository

REPO = Repository()


def daily_avg(time_series):
    """Creates dataframe needed for modelling

    Takes time series with measurements on different timeframes and creates a
    dataframe with daily averages for flow rate and exogenous predictors.

    Args:
        time_series: dataframe with metrics for one run_id, assumes output
        from get_measurements function

    Returns:
        DataFrame: containing daily measurements
    """