Ejemplo n.º 1
0
    def test_create_trips_analysis(self):
        """End-to-end check of analyze.create_trips_analysis: builds the
        trips_analysis_test, taxi_zones_test, and taxi_boroughs_test tables
        and verifies their contents."""
        analyze.create_trips_analysis(self.db_path,
                                      self.start_datetime,
                                      self.end_datetime,
                                      self.shapefile_path,
                                      self.taxizones_path,
                                      title='test',
                                      overwrite=True,
                                      create_zones_tables=True,
                                      verbose=0)

        df_test = query(self.db_path, 'SELECT * FROM trips_analysis_test;')
        df_zones = query(self.db_path, 'SELECT * FROM taxi_zones_test;')
        df_boroughs = query(self.db_path, 'SELECT * FROM taxi_boroughs_test;')

        # separate asserts give a precise failure message per column
        assert list(df_test['trip_id']) == [4, 5, 6, 7, 8, 9]
        assert list(df_test['pickup_location_id']) == \
            [249, 90, 162, 162, 162, 162]
        assert list(df_test['dropoff_location_id']) == \
            [90, 161, 229, 229, 229, 229]
        assert list(df_test['pickup_hour']) == [7, 15, 15, 15, 16, 16]

        assert df_zones.shape == (265, 4)

        assert list(df_boroughs['borough_id']) == [1, 2, 3, 4, 5, 6]
        assert list(df_boroughs['borough_name']) == \
            ['Bronx', 'Brooklyn', 'EWR', 'Manhattan', 'Queens',
             'Staten Island']
Ejemplo n.º 2
0
    def test_create_expected_load(self):
        """Checks ny.create_expected_load: builds expected_load_test from
        the 'load' summary table over a reference range while excluding a
        sub-range."""
        summary_table = 'load'
        start_ref = pd.Timestamp('2012-10-01 00:00:00', tz='America/New_York')
        end_ref = pd.Timestamp('2012-10-20 23:59:59', tz='America/New_York')
        # UTC-naive range endpoints, since sqlite is not timezone-aware
        datetimeUTC_range_ref = (
            start_ref.tz_convert(tz='UTC').tz_localize(None),
            end_ref.tz_convert(tz='UTC').tz_localize(None))
        start = pd.Timestamp('2012-10-01 00:00:00', tz='America/New_York')
        end = pd.Timestamp('2012-10-05 23:59:59', tz='America/New_York')
        datetimeUTC_range = (start.tz_convert(tz='UTC').tz_localize(None),
                             end.tz_convert(tz='UTC').tz_localize(None))
        _ = ny.create_expected_load(self.db_path,
                                    summary_table,
                                    self.zones_path,
                                    datetimeUTC_range_ref,
                                    datetimeUTC_range_excl=datetimeUTC_range,
                                    title='test',
                                    overwrite=True,
                                    verbose=0)

        df_test = query(self.db_path, 'SELECT * FROM expected_load_test;')

        # separate asserts give a precise failure message per check
        assert df_test.loc[2, ['mean_integrated_load']].values[0] == 1385
        assert df_test.loc[2, ['num_rows']].values[0] == 2
        assert df_test.shape == (1848, 7)
Ejemplo n.º 3
0
    def test_import_load(self):
        """Checks ny.import_load: imports raw load files into the 'load'
        table with zone names mapped to zone ids."""
        import_num = ny.import_load(self.dl_dir,
                                    self.db_path,
                                    to_zoneid=True,
                                    zones_path=self.zones_path,
                                    overwrite=True,
                                    verbose=0)
        df_test = query(self.db_path, 'SELECT * FROM load;')

        # separate asserts give a precise failure message per check
        assert import_num == 31
        assert df_test.loc[5, 'rowid'] == 6
        assert df_test.loc[5, 'datetimeUTC'] == '2012-10-01 04:00:00'
        assert df_test.loc[5, 'zone_id'] == 6
        assert df_test.loc[5, 'integrated_load'] == 1015.9
        assert df_test.shape == (8184, 4)
Ejemplo n.º 4
0
 def test_grouping_zone_hour(self):
     """Checks create_summary_route_time grouped by zone and hour."""
     analyze.create_summary_route_time(
         self.db_path,
         byborough=False,
         byday=False,
         title=self.title,
         trips_analysis_table=self.trips_analysis_table,
         taxi_zones_table=self.taxi_zones_table,
         overwrite=True,
         verbose=0)
     sql = 'SELECT * FROM summary_routezone_hour_{title};'.format(
         title=self.title)
     df_test = query(self.db_path, sql, parse_dates=False, verbose=0)
     # separate asserts give a precise failure message per column
     assert list(df_test['pickup_location_id']) == [249, 90, 162, 162]
     assert list(df_test['dropoff_location_id']) == [90, 161, 229, 229]
     assert list(df_test['trip_count']) == [1, 1, 2, 2]
def test_import_trips():
    """End-to-end check of clean.import_trips: imports the raw yellow-taxi
    files under tests/nyctlc/raw/ into a fresh 'trips' table and verifies
    the first imported row, including the derived trip metrics."""
    url_path = None
    dl_dir = 'tests/nyctlc/raw/'
    db_path = 'tests/nyctlc/test.db'
    taxi_type = 'yellow'
    usecols = [
        'vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
        'trip_distance', 'pickup_longitude', 'pickup_latitude',
        'pickup_location_id', 'dropoff_longitude', 'dropoff_latitude',
        'dropoff_location_id'
    ]

    dl_num, import_num = clean.import_trips(url_path,
                                            dl_dir,
                                            db_path,
                                            taxi_type,
                                            nrows=None,
                                            usecols=usecols,
                                            overwrite=True,
                                            verbose=0)

    df_test = query(db_path, 'SELECT * FROM trips;')

    # separate asserts give a precise failure message per field
    assert dl_num == 0
    assert import_num == 2
    row = df_test.loc[0]  # first imported trip
    assert row['trip_id'] == 1
    assert row['taxi_type'] == 2
    assert row['vendor_id'] == 1
    assert row['pickup_datetime'] == '2012-10-01 07:57:00'
    assert row['dropoff_datetime'] == '2012-10-01 07:57:00'
    assert row['passenger_count'] == 1
    assert row['trip_distance'] == 0.2
    assert round(row['pickup_longitude'], 2) == -73.98
    assert round(row['pickup_latitude'], 2) == 40.79
    assert pd.isnull(row['pickup_location_id'])
    assert round(row['dropoff_longitude'], 2) == -73.98
    assert round(row['dropoff_latitude'], 2) == 40.79
    assert pd.isnull(row['dropoff_location_id'])
    assert row['trip_duration'] == 0
    assert row['trip_pace'] == 0
    assert round(row['trip_straightline'], 2) == 0.28
    assert round(row['trip_windingfactor'], 2) == 0.73
Ejemplo n.º 6
0
 def test_grouping_zone_day(self):
     """Checks create_summary_route_time grouped by zone and day."""
     analyze.create_summary_route_time(
         self.db_path,
         byborough=False,
         byday=True,
         title=self.title,
         trips_analysis_table=self.trips_analysis_table,
         taxi_zones_table=self.taxi_zones_table,
         overwrite=True,
         verbose=0)
     sql = 'SELECT * FROM summary_routezone_day_{title};'.format(
         title=self.title)
     df_test = query(self.db_path, sql, parse_dates=False, verbose=0)
     # separate asserts give a precise failure message per column
     assert list(df_test['pickup_location_id']) == [249, 90, 162]
     assert list(df_test['dropoff_location_id']) == [90, 161, 229]
     assert list(df_test['trip_count']) == [1, 1, 4]
     assert [round(pace, 2) for pace in df_test['mean_pace']] == \
         [183.21, 303.37, 334.26]
Ejemplo n.º 7
0
 def test_gouping_borough_hour(self):
     """Checks create_summary_route_time grouped by borough and hour.

     NOTE(review): name has a typo ("gouping"); kept as-is so the test's
     identity in reports/selection stays stable.
     """
     analyze.create_summary_route_time(
         self.db_path,
         byborough=True,
         byday=False,
         title=self.title,
         trips_analysis_table=self.trips_analysis_table,
         taxi_zones_table=self.taxi_zones_table,
         overwrite=True,
         verbose=0)
     sql = 'SELECT * FROM summary_routeborough_hour_{title};'.format(
         title=self.title)
     df_test = query(self.db_path, sql, parse_dates=False, verbose=0)
     # separate asserts give a precise failure message per column
     assert list(df_test['pickup_borough_id']) == [4, 4, 4]
     assert list(df_test['dropoff_borough_id']) == [4, 4, 4]
     assert list(df_test['trip_count']) == [1, 3, 2]
     assert [round(pace, 2) for pace in df_test['mean_pace']] == \
         [183.21, 285.71, 400.00]
Ejemplo n.º 8
0
    def test_create_standard_load(self):
        """Checks ny.create_standard_load: standardizes 'load' data against
        the expected_load_test table into standard_load_test."""
        summary_table = 'load'
        expected_table = 'expected_load_test'
        start = pd.Timestamp('2012-10-01 00:00:00', tz='America/New_York')
        end = pd.Timestamp('2012-10-05 23:59:59', tz='America/New_York')
        # UTC-naive range endpoints, since sqlite is not timezone-aware
        datetimeUTC_range = (start.tz_convert(tz='UTC').tz_localize(None),
                             end.tz_convert(tz='UTC').tz_localize(None))
        _ = ny.create_standard_load(self.db_path,
                                    summary_table,
                                    expected_table,
                                    datetimeUTC_range,
                                    min_num_rows=1,
                                    title='test',
                                    overwrite=True,
                                    verbose=0)

        df_test = query(self.db_path, 'SELECT * FROM standard_load_test;')

        # separate asserts give a precise failure message per check
        assert round(df_test.loc[2, ['z_integrated_load']].values[0], 2) \
            == -0.04
        assert df_test.shape == (1320, 4)
Ejemplo n.º 9
0
def create_standard_load(db_path, summary_table, expected_table,
                         datetimeUTC_range, min_num_rows=5, title=None,
                         overwrite=False, verbose=0):
    """Creates a table and dataframe of standardized data from the
    summary_table table. Standardization is relative to the mean and variance of
    corresponding data from the specified reference datetime range (saved as
    an expected_load_[] table in the database).

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.

    summary_table : str
        Name of the db table containing summary data to calculate
        standardized integrated_load for.

    expected_table : str
        Name of the db table containing expected data (i.e. mean and
        variance) to calculate standardized integrated_load from.

    datetimeUTC_range : tuple
        Specifies the start and end of the time period to calculate
        standardized integrated_load for (inclusive). Specify as a 2-element
        tuple of UTC datetime strings with year-month-day and
        hour:minutes:seconds. E.g. ('2012-10-29 00:00:00', '2012-11-03
        23:59:59') to calculate standardized integrated_load for times between
        10/29/2012 and 11/03/2012.

    min_num_rows : int
        Defines the minimum number of rows needed in the reference set to
        standardize data.

    title : str
        Defines the suffix of the standard_load_[title] table to be created.

    overwrite : bool
        Defines whether or not to overwrite existing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_std : dataframe
        Dataframe written to db table, indexed by (datetimeUTC, zone_id).

    Notes
    -----
    """

    table = 'standard_load_{title}'.format(title=title)
    if verbose >= 1:
        output('Started creating or updating {table} table.'.format(
            table=table))

    # query expected values calculated from at least min_num_rows data points
    sql = """
            SELECT * FROM {expected_table}
            WHERE num_rows >= {min_num_rows};""".format(
        expected_table=expected_table, min_num_rows=min_num_rows)
    df_exp = query(db_path, sql)
    df_exp = df_exp[['dayofweek', 'hour', 'zone_id', 'mean_integrated_load',
                     'var_integrated_load']]

    # query data to standardize
    sql = """
            SELECT datetimeUTC, zone_id, integrated_load
            FROM {summary_table}
            WHERE
                datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}";
            """.format(summary_table=summary_table,
                       start_datetime=datetimeUTC_range[0],
                       end_datetime=datetimeUTC_range[1])
    df = query(db_path, sql)

    # add dayofweek (0 = Monday) and hour (0-23) in local (NY) time;
    # vectorized .dt accessors replace the per-element list comprehensions
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']).dt.tz_localize(
        tz='UTC')
    df['datetime'] = df['datetimeUTC'].dt.tz_convert(tz='America/New_York')
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['hour'] = df['datetime'].dt.hour

    # calculate z-scores relative to the expected values
    df = pd.merge(df, df_exp, how='left',
                  on=['dayofweek', 'hour', 'zone_id'])
    del df_exp
    # .copy() so the column assignment below mutates an independent frame
    # rather than a view of df (avoids SettingWithCopyWarning)
    df_std = df[['datetimeUTC', 'zone_id']].copy()
    # NOTE(review): divides by the variance rather than the standard
    # deviation; existing tests pin this behavior, so it is preserved —
    # confirm whether a true z-score (division by std) was intended.
    df_std['z_integrated_load'] = \
        (df['integrated_load'] - df['mean_integrated_load']) \
        / df['var_integrated_load']
    df_std = df_std.set_index(['datetimeUTC', 'zone_id'])
    del df

    # create table
    sql = """
                CREATE TABLE IF NOT EXISTS {table} (
                    rowid INTEGER PRIMARY KEY,
                    datetimeUTC TEXT,
                    zone_id INTEGER,
                    z_integrated_load FLOAT
                ); """.format(table=table)
    create_table(db_path=db_path, table=table, create_sql=sql, indexes=[],
                 overwrite=overwrite, verbose=verbose)

    # write data to table (datetimes stored as UTC-naive text for sqlite)
    df_write = df_std.reset_index()
    df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize(None)
    df_to_table(db_path, df_write, table=table, overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating {table} table. Dataframe shape '
               'is '.format(table=table) + str(df_std.shape) + '.')

    return df_std
Ejemplo n.º 10
0
def create_forecast_err(db_path, load_table, forecast_table, overwrite=False,
                        verbose=0):
    """Creates a table and dataframe of load forecast error. Error is
    calculated as percent error relative to the actual load.

    I.e. error = (forecast - actual) / actual

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.

    load_table : str
        Name of the db table containing actual load data (i.e.
        based on palIntegrated data).

    forecast_table : str
        Name of the db table containing load forecast data (i.e. based on
        isolf).

    overwrite : bool
        Defines whether or not to overwrite existing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe written to db table, indexed by (datetimeUTC, zone_id).

    Notes
    -----
    """

    if verbose >= 1:
        output('Started creating or updating forecast_error table.')

    # query actual loads
    sql = """
            SELECT datetimeUTC, zone_id, integrated_load
            FROM {load_table}
          ;""".format(load_table=load_table)
    df_load = query(db_path, sql)
    df_load['datetimeUTC'] = pd.to_datetime(df_load['datetimeUTC'])
    df_load = df_load.set_index(['datetimeUTC', 'zone_id'])

    # query forecast loads (p0-p6 = forecast horizons)
    sql = """
            SELECT datetimeUTC, zone_id, load_forecast_p0, load_forecast_p1,
                load_forecast_p2, load_forecast_p3, load_forecast_p4, 
                load_forecast_p5, load_forecast_p6
            FROM {forecast_table}
          ;""".format(forecast_table=forecast_table)
    df_forecast = query(db_path, sql)
    df_forecast['datetimeUTC'] = pd.to_datetime(df_forecast['datetimeUTC'])
    df_forecast = df_forecast.set_index(['datetimeUTC', 'zone_id'])

    # calculate relative forecast errors for each horizon p0-p6; a loop
    # replaces the seven copy-pasted column computations
    df = pd.merge(df_load, df_forecast, how='inner', left_index=True,
                  right_index=True)
    del df_load, df_forecast
    forecast_cols = ['load_forecast_p{i}'.format(i=i) for i in range(7)]
    for i, forecast_col in enumerate(forecast_cols):
        df['forecast_error_p{i}'.format(i=i)] = \
            (df[forecast_col] - df['integrated_load']) / df['integrated_load']
    df = df.drop(forecast_cols, axis=1)

    # create table
    sql = """
            CREATE TABLE IF NOT EXISTS forecast_error (
                rowid INTEGER PRIMARY KEY,
                datetimeUTC TEXT,
                zone_id INTEGER,
                integrated_load REAL,
                forecast_error_p0 REAL,
                forecast_error_p1 REAL,
                forecast_error_p2 REAL,
                forecast_error_p3 REAL,
                forecast_error_p4 REAL,
                forecast_error_p5 REAL,
                forecast_error_p6 REAL
          ); """
    indexes = ['CREATE UNIQUE INDEX IF NOT EXISTS '
               'forecast_error_datetimeUTC_zone_id ON forecast_error '
               '(datetimeUTC, zone_id);'
               ]
    create_table(db_path=db_path, table='forecast_error', create_sql=sql,
                 indexes=indexes,
                 overwrite=overwrite, verbose=verbose)

    # write data to table (datetimes stored as naive text for sqlite)
    df_write = df.reset_index()
    df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize(None)
    df_to_table(db_path, df_write, table='forecast_error', overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating forecast_error table. Dataframe '
               'shape is ' + str(df.shape) + '.')
    return df
Ejemplo n.º 11
0
def create_expected_load(db_path, summary_table, zones_path,
                         datetimeUTC_range_ref, datetimeUTC_range_excl=None,
                         title=None, overwrite=False, verbose=0):
    """Creates a table and dataframe of expected data from the summary_table
    table. Expectation includes mean and variance of integrated_load for the
    specified reference datetime range. Expectation is calculated for every
    possible dayofweek-hour-zone combination, with NaNs for those missing data.

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.

    summary_table : str
        Name of the db summary table containing data to calculate
        expected integrated_load from.

    zones_path : str
        Path to csv containing all zone_id values (maps zone_id to zone_name).

    datetimeUTC_range_ref : tuple
        Specifies the start and end of the reference time period to use when
        calculating expected values (inclusive). Specify as a 2-element
        tuple of UTC datetime strings with year-month-day and
        hour:minutes:seconds.

    datetimeUTC_range_excl : tuple
        Specifies the start and end of time period to exclude from reference
        time period. Specify as a 2-element tuple of UTC datetime strings with
        year-month-day and hour:minutes:seconds.

    title : str
        Defines the suffix of the expected_load_[title] table to be created.

    overwrite : bool
        Defines whether or not to overwrite existing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_exp : dataframe
        Dataframe written to db table.

    Notes
    -----
    datetimeUTC_range_ref items should be UTC, but with naive format (since
    sqlite does not handle time zones). For example, use the following to
    select reference data for Jan. 1 - Dec. 31 2012 (Eastern):
    start = pd.Timestamp('2012-01-01 00:00:00', tz='America/New_York')
    end = pd.Timestamp('2012-12-31 23:59:59', tz='America/New_York')
    datetimeUTC_range_ref = (start.tz_convert(tz='UTC').tz_localize(None),
                            end.tz_convert(tz='UTC').tz_localize(None))
    """

    table = 'expected_load_{title}'.format(title=title)
    if verbose >= 1:
        output('Started creating or updating {table} table.'.format(
            table=table))

    # query range of zone_id values to consider
    df_zones = pd.read_csv(zones_path)
    zones = df_zones['zone_id'].unique()
    del df_zones

    # query reference data
    if datetimeUTC_range_excl:
        sql = """
            SELECT datetimeUTC, zone_id, integrated_load
            FROM {summary_table}
            WHERE
                (datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}")
                AND (datetimeUTC NOT BETWEEN "{start_datetime_excl}" AND 
                    "{end_datetime_excl}")
        ;""".format(summary_table=summary_table,
                    start_datetime=datetimeUTC_range_ref[0],
                    end_datetime=datetimeUTC_range_ref[1],
                    start_datetime_excl=datetimeUTC_range_excl[0],
                    end_datetime_excl=datetimeUTC_range_excl[1])
    else:
        sql = """
            SELECT datetimeUTC, zone_id, integrated_load
            FROM {summary_table}
            WHERE
                (datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}")
        ;""".format(summary_table=summary_table,
                    start_datetime=datetimeUTC_range_ref[0],
                    end_datetime=datetimeUTC_range_ref[1])
    df = query(db_path, sql)

    # add dayofweek (0 = Monday) and hour (0-23) in local (NY) time;
    # vectorized .dt accessors replace the per-element list comprehensions
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']).dt.tz_localize(
        tz='UTC')
    df['datetime'] = df['datetimeUTC'].dt.tz_convert(tz='America/New_York')
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['hour'] = df['datetime'].dt.hour

    # calculate mean and variance for each dayofweek-hour-zone combination;
    # grouping once up front avoids re-filtering the full frame for every
    # combination (previously O(days*hours*zones) passes over df);
    # np.mean/np.var (population variance, ddof=0) preserved exactly
    grouped = df.groupby(['dayofweek', 'hour', 'zone_id'])['integrated_load']
    stats = {}
    for key, values in grouped:
        vals = values.values
        stats[key] = (np.mean(vals), np.var(vals), len(vals))

    # rows are emitted in deterministic dayofweek/hour/zone order, with NaNs
    # for combinations that have no reference data
    expected = []
    for dayofweek in range(7):
        for hour in range(24):
            for zone in zones:
                mean_load, var_load, num_rows = stats.get(
                    (dayofweek, hour, zone), (np.nan, np.nan, np.nan))
                expected.append([dayofweek, hour, zone,
                                 mean_load, var_load, num_rows])
    df_exp = pd.DataFrame(expected,
                          columns=['dayofweek', 'hour', 'zone_id',
                                   'mean_integrated_load',
                                   'var_integrated_load', 'num_rows'])
    # (a previous no-op set_index call — its result was discarded — was
    # removed; df_exp is written with its default integer index)

    # create table
    sql = """
            CREATE TABLE IF NOT EXISTS {table} (
                rowid INTEGER PRIMARY KEY,
                dayofweek INTEGER,
                hour INTEGER,
                zone_id INTEGER,
                mean_integrated_load FLOAT,
                var_integrated_load FLOAT,
                num_rows INTEGER
            ); """.format(table=table)
    create_table(db_path=db_path, table=table, create_sql=sql, indexes=[],
                 overwrite=overwrite, verbose=verbose)

    # write data to table
    df_to_table(db_path, df_exp, table=table, overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating {table} table. Dataframe shape '
               'is '.format(table=table) + str(df_exp.shape) + '.')

    return df_exp
Ejemplo n.º 12
0
def load_nyiso(startdate, enddate, db_path, verbose=0):
    """Query and clean nyiso load forecast error data for the specified date
    range from a sqlite database. Assumes the database contains a
    forecast_error table created using create_forecast_err.

    Parameters
    ----------
    startdate : Timestamp
        Start date to include data from (inclusive), specified as a
        timezone-aware Pandas Timestamp object.
        E.g. startdate = pd.Timestamp('2012-10-28 00:00:00',
        tz='America/New_York')

    enddate : Timestamp
        End date to include data from (exclusive), specified as a
        timezone-aware Pandas Timestamp object.
        e.g. enddate = pd.Timestamp('2012-11-03 00:00:00',
        tz='America/New_York')

    db_path : str
        Path to sqlite database containing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Indexed by (nyiso_zone, datetimeNY); columns err0 and percent-err0.

    Notes
    -----
    Sqlite date queries are inclusive for start and end, forecast_error
    datetimes are UTC.
    """

    if verbose >= 1:
        output('Started query.')

    # convert datetimes; subtract one second so the inclusive sqlite BETWEEN
    # behaves as an exclusive end bound
    startdateUTC = startdate.tz_convert('UTC')
    enddateUTC = enddate.tz_convert('UTC') - pd.Timedelta('1 second')
    startdate_sql = startdateUTC.strftime("%Y-%m-%d %H:%M:%S")
    enddate_sql = enddateUTC.strftime("%Y-%m-%d %H:%M:%S")

    # load nyiso load data
    sql = """
            SELECT datetimeUTC, zone_id AS nyiso_zone,
                forecast_error_p0 AS err0
            FROM forecast_error
            WHERE
                datetimeUTC BETWEEN "{startdate_sql}" AND "{enddate_sql}"
          ;""".format(startdate_sql=startdate_sql, enddate_sql=enddate_sql)
    df = query(db_path, sql)

    # convert datetimes (vectorized .dt accessors replace the per-element
    # list comprehensions, which also shadowed the name 'datetime')
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']).dt.tz_localize(
        tz='UTC')
    df['datetimeNY'] = df['datetimeUTC'].dt.tz_convert('America/New_York')

    # add and drop columns
    df['percent-err0'] = df['err0'] * 100
    df = df.drop(['datetimeUTC'], axis=1)

    # index and sort
    df = df.set_index(['nyiso_zone', 'datetimeNY'])
    df = df.sort_index(level=0)

    if verbose >= 1:
        output('[min, max] forecast error datetimeNY: [' +
               str(min(df.index.get_level_values(level=1))) + ', ' +
               str(max(df.index.get_level_values(level=1))) + '].')
        output('[min, max] forecast error: [' + str(np.nanmin(df['err0'])) +
               ', ' + str(np.nanmax(df['err0'])) + '].')
        output('Finished query.')

    return df
Ejemplo n.º 13
0
def load_nyctlc_zone_hour(startdate,
                          enddate,
                          trip_type,
                          trip_count_filter,
                          db_path,
                          verbose=0):
    """Query and clean nyctlc dropoff or pickup data for the specified date
    range from a sqlite database, grouped by zone and hour. Assumes the
    database contains a standard_zonedropoff_hour_sandy or
    standard_zonepickup_hour_sandy table created using
    create_standard_zone_hour.

    Parameters
    ----------
    startdate : Timestamp
        Start date to include data from (inclusive), specified as a
        timezone-aware Pandas Timestamp object.
        E.g. startdate = pd.Timestamp('2012-10-28 00:00:00',
        tz='America/New_York')

    enddate : Timestamp
        End date to include data from (exclusive), specified as a
        timezone-aware Pandas Timestamp object.
        e.g. enddate = pd.Timestamp('2012-11-03 00:00:00',
        tz='America/New_York')

    trip_type : str
        Trip type: 'dropoff' or 'pickup'.

    trip_count_filter : int
        Minimum number of trips required to load a data point.

    db_path : str
        Path to sqlite database containing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_taxi : dataframe
        Indexed by (location_id, datetimeNY); column names carry a
        '-drop' or '-pick' suffix matching trip_type.

    Raises
    ------
    ValueError
        If trip_type is not 'dropoff' or 'pickup'.

    Notes
    -----
    Sqlite date queries are inclusive for start and end, datetimes in nyctlc
    database are local (i.e. NY timezone).
    """

    if verbose >= 1:
        output('Started query.')

    # validate trip type
    if trip_type not in ['dropoff', 'pickup']:
        raise ValueError(
            'Invalid trip_type argument: {arg}.'.format(arg=trip_type))

    # convert datetimes; subtract one second so the inclusive sqlite BETWEEN
    # behaves as an exclusive end bound
    enddate_exclusive = enddate - pd.Timedelta('1 second')
    startdate_sql = startdate.strftime("%Y-%m-%d %H:%M:%S")
    enddate_sql = enddate_exclusive.strftime("%Y-%m-%d %H:%M:%S")

    # load dropoff/pickup data
    sql = """
            SELECT {trip_type}_datetime AS datetimeNY,
                {trip_type}_location_id AS location_id,
                z_mean_pace AS zpace, z_trip_count AS ztrips
            FROM standard_zone{trip_type}_hour_sandy
            WHERE
                trip_count > {trip_count_filter} AND
                {trip_type}_datetime BETWEEN
                "{startdate_sql}" AND "{enddate_sql}"
          ;""".format(trip_count_filter=trip_count_filter,
                      startdate_sql=startdate_sql,
                      enddate_sql=enddate_sql,
                      trip_type=trip_type)
    df_taxi = query(db_path, sql)

    # add columns
    df_taxi['abs-zpace'] = abs(df_taxi['zpace'])
    df_taxi['abs-ztrips'] = abs(df_taxi['ztrips'])

    # convert datetimes (vectorized .dt accessor replaces the per-element
    # list comprehension)
    df_taxi['datetimeNY'] = pd.to_datetime(
        df_taxi['datetimeNY']).dt.tz_localize(tz='America/New_York')

    # index and sort
    df_taxi = df_taxi.set_index(['location_id', 'datetimeNY'])
    df_taxi = df_taxi.sort_index(level=0)

    if verbose >= 1:
        output('[min, max] taxi datetimeNY (hourly): [' +
               str(min(df_taxi.index.get_level_values(level=1))) + ', ' +
               str(max(df_taxi.index.get_level_values(level=1))) + '].')
        output('[min, max] taxi pace and trips mean z-score (hourly): [' +
               str(np.nanmin(df_taxi['zpace'])) + ', ' +
               str(np.nanmax(df_taxi['zpace'])) + '], [' +
               str(np.nanmin(df_taxi['ztrips'])) + ', ' +
               str(np.nanmax(df_taxi['ztrips'])) + '].')

    # append trip-type suffix to column names (trip_type already validated,
    # so the lookup cannot fail; replaces the if/elif/dead-else chain)
    suffix = {'dropoff': '-drop', 'pickup': '-pick'}[trip_type]
    df_taxi = df_taxi.add_suffix(suffix)

    return df_taxi