def taxi_regex_patterns(taxi_type='all'): """Creates a regex pattern for the specified taxi type. Parameters ---------- taxi_type : str Taxi type to create regex for ('fhv', 'green', 'yellow', or 'all'). Returns ------- pattern : regex Compiled regex pattern for the specified taxi type. Notes ----- """ # define taxi type regex pattern (escape the extension dot; matches e.g. 'yellow_tripdata_2012-01.csv') if taxi_type == 'fhv': pattern = re.compile(r'fhv_tripdata_.+\.csv') elif taxi_type == 'green': pattern = re.compile(r'green_tripdata_.+\.csv') elif taxi_type == 'yellow': pattern = re.compile(r'yellow_tripdata_.+\.csv') elif taxi_type == 'all': pattern = re.compile(r'(fhv|green|yellow)_tripdata_.+\.csv') else: output('Unknown taxi_type.', fn_str='taxi_regex_patterns') return None return pattern
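# A minimal usage sketch for taxi_regex_patterns. The filenames are
# hypothetical but follow the 'yellow_tripdata_YYYY-MM.csv' convention the
# patterns above assume.
def _example_taxi_regex_patterns():
    pattern = taxi_regex_patterns('yellow')
    assert pattern.match('yellow_tripdata_2012-01.csv')
    assert pattern.match('green_tripdata_2012-01.csv') is None
    assert taxi_regex_patterns('all').match('fhv_tripdata_2015-06.csv')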
def load_loaddate(date, load_type, dl_dir, verbose=0): """Loads a nyiso load data file (one day of data) into a dataframe. Assumes the file is zipped with other files for that month. Parameters ---------- date : str Date to load data for. Assumes 'yearmonthday' format (e.g. '20121030'). load_type : str Defines type of load data. Current valid arguments: 'palIntegrated' ( integrated real-time) and 'isolf' (load forecast). dl_dir : str Path to the directory containing downloaded zip files. Assumes each zip file is of the following format: 'yearmonth01{load_type}_csv.zip' (e.g. '20121001palIntegrated_csv.zip'). verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe of one day of load data. Notes ----- """ if verbose >= 2: output('Started loading {load_type} file for {date} from ' '\"{dl_dir}\".'.format(load_type=load_type, date=date, dl_dir=dl_dir)) if load_type not in ['palIntegrated', 'isolf']: raise ValueError('Unknown type argument: {load_type}. See docs for ' 'valid types'.format(load_type=load_type)) elif len(date) != 8: raise ValueError('Incorrect format for date argument: {date}. Must ' 'be yearmonthday with 8 characters.'.format(date=date)) # read file into dataframe zip_path = dl_dir + date[0:6] + '01{load_type}_csv.zip'.format( load_type=load_type) file_path = date + load_type + '.csv' with zipfile.ZipFile(zip_path) as zip_file: with zip_file.open(file_path) as csv_file: df = pd.read_csv(csv_file) if verbose >= 2: output('Finished loading {load_type} file for {date} from ' '\"{dl_dir}\".'.format(load_type=load_type, date=date, dl_dir=dl_dir)) return df
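# A hedged usage sketch for load_loaddate. The directory path is hypothetical;
# it assumes '20121001palIntegrated_csv.zip' exists in dl_dir and contains
# '20121030palIntegrated.csv'.
def _example_load_loaddate():
    df = load_loaddate('20121030', load_type='palIntegrated',
                       dl_dir='data/raw/nyiso/', verbose=0)
    return df.head()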
def clean_payment_type(df, verbose=0): """Cleans the payment_type column. Parameters ---------- df : dataframe Dataframe to clean. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe with cleaned column. Notes ----- """ col_names = list(pd.Series(df.columns.values)) if 'payment_type' in col_names: # replace payment_type values with IDs payment_str = 'payment_type' df[payment_str] = df[payment_str].replace(['Credit', 'CREDIT', 'CRE', 'Cre', 'CRD'], '1') df[payment_str] = df[payment_str].replace(['CASH', 'Cash', 'CAS', 'Cas', 'CSH'], '2') df[payment_str] = df[payment_str].replace(['No', 'No ', 'No Charge', 'NOC'], '3') df[payment_str] = df[payment_str].replace(['Dis', 'DIS', 'Dispute'], '4') df[payment_str] = df[payment_str].replace(['UNK', 'C', 'NA', 'NA '], '5') df[payment_str] = df[payment_str].replace(['Voided trip'], '6') df[payment_str] = df[payment_str].astype('int') if verbose >= 2: output('Finished replacing ' + payment_str + ' with IDs.') # check that values match expected expected_values = [1, 2, 3, 4, 5, 6] match = check_expected_list(df, payment_str, expected_values, verbose=verbose) if not match: raise ValueError('Unexpected ' + payment_str + ' value(s).') elif verbose >= 2: output('Unable to clean payment_type column due to missing column.') return df
def add_trip_columns(df, verbose=0): """Adds calculated trip columns to the dataframe. Assumes the dataframe has already been cleaned. Trips with unreasonable values (zero recorded distances) are treated as missing when calculating trip data. Can only calculate distance-related trip data for records with pickup/dropoff lat/lon data. Parameters ---------- df : dataframe Dataframe to add trip calculation columns to. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe with added columns. Notes ----- """ col_names = list(pd.Series(df.columns.values)) # add trip_duration column (in seconds) if ('dropoff_datetime' in col_names) and ('pickup_datetime' in col_names): df['trip_duration'] = (df['dropoff_datetime'] - df['pickup_datetime']) / np.timedelta64(1, 's') if verbose >= 2: output('Finished adding trip duration column.') elif verbose >= 2: output('Unable to add trip_duration column due to missing columns.') # add calculated trip columns if ('pickup_longitude' in col_names) and ('pickup_latitude' in col_names) and ('dropoff_longitude' in col_names) and ('dropoff_latitude' in col_names): # add trip_pace column (duration per unit distance; zero distances become nan) df['trip_distance'].replace(0, np.nan, inplace=True) df['trip_pace'] = df['trip_duration'] / df['trip_distance'] # add trip_straightline column df['trip_straightline'] = haversine(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude']) # add trip_windingfactor column (zero straight-line distances become nan to avoid division by zero) df['trip_straightline'] = df['trip_straightline'].replace(0, np.nan) df['trip_windingfactor'] = df['trip_distance'] / df['trip_straightline'] if verbose >= 2: output('Finished adding calculated trip columns.') elif verbose >= 2: output('Unable to add calculated trip columns due to missing columns.') return df
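# The haversine helper called above is defined elsewhere in this package; a
# minimal vectorized sketch of what it is assumed to compute (great-circle
# distance, here in miles using an Earth radius of 3959 mi):
def _haversine_sketch(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 3959 * 2 * np.arcsin(np.sqrt(a))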
def clean_store_and_fwd_flag(df, verbose=0): """Cleans the store_and_fwd_flag column. Parameters ---------- df : dataframe Dataframe to clean. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe with cleaned column. Notes ----- """ col_names = list(pd.Series(df.columns.values)) if 'store_and_fwd_flag' in col_names: # replace store_and_fwd_flag values with IDs store_str = 'store_and_fwd_flag' df[store_str] = df[store_str].replace(r'\s+', np.nan, regex=True) df[store_str] = df[store_str].replace(['*', '2', 2], np.nan) df[store_str] = df[store_str].replace(['N', '0'], 0) df[store_str] = df[store_str].replace(['Y', '1'], 1) df[store_str] = df[store_str].astype('float') df[store_str] = df[store_str].round() if verbose >= 2: output('Finished replacing ' + store_str + ' with IDs.') # check that values match expected expected_values = [0, 1, np.nan] match = check_expected_list(df, store_str, expected_values, verbose=verbose) if not match: raise ValueError('Unexpected ' + store_str + ' value(s).') elif verbose >= 2: output('Unable to clean store_and_fwd_flag column due to missing ' 'column.') return df
def clean_vendor_id(df, verbose=0): """Cleans the vendor_id column. Parameters ---------- df : dataframe Dataframe to clean. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe with cleaned column. Notes ----- """ col_names = list(pd.Series(df.columns.values)) if 'vendor_id' in col_names: # replace vendor_id values with IDs vendor_str = 'vendor_id' df[vendor_str] = df[vendor_str].replace('CMT', '1') df[vendor_str] = df[vendor_str].replace('DDS', '3') df[vendor_str] = df[vendor_str].replace('VTS', '4') df[vendor_str] = df[vendor_str].astype('int') if verbose >= 2: output('Finished replacing ' + vendor_str + ' with IDs.') # check that values match expected expected_values = [1, 2, 3, 4] match = check_expected_list(df, vendor_str, expected_values, verbose=verbose) if not match: raise ValueError('Unexpected ' + vendor_str + ' value(s).') elif verbose >= 2: output('Unable to clean vendor_id column due to missing column.') return df
def clean_lat_lon(df, verbose=0): """Cleans the latitude and longitude columns. Parameters ---------- df : dataframe Dataframe to clean. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe with cleaned column. Notes ----- """ col_names = list(pd.Series(df.columns.values)) if ('pickup_longitude' in col_names) and \ ('pickup_latitude' in col_names) and \ ('dropoff_longitude' in col_names) and \ ('dropoff_latitude' in col_names): # replace lat/lon outside of possible ranges with nan df.loc[abs(df['pickup_latitude']) > 90, 'pickup_latitude'] = np.nan df.loc[abs(df['dropoff_latitude']) > 90, 'dropoff_latitude'] = np.nan df.loc[abs(df['pickup_longitude']) > 180, 'pickup_longitude'] = np.nan df.loc[abs(df['dropoff_longitude']) > 180, 'dropoff_longitude'] = np.nan if verbose >= 2: output('Finished replacing lat/lon outside of possible ranges with ' 'nan.') elif verbose >= 1: output('Unable to clean lat/lon columns due to missing columns.') return df
def clean_column_names(df, year, verbose=0): """Cleans the dataframe column names. Column names are loosely based on "data_dictionary_trip_records_yellow.pdf". Parameters ---------- df : dataframe Dataframe to clean. year : int Year data comes from. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe with cleaned column. Notes ----- """ # update column names df = df.rename(index=str, columns=col_names_dict(year)) if verbose >= 2: output('Finished re-naming columns.') # add taxi_type column (2 for yellow) df.insert(0, 'taxi_type', 2) # check that column names match expected expected_names = ['taxi_type', 'vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', 'pickup_longitude', 'pickup_latitude', 'pickup_location_id', 'rate_code_id', 'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude', 'dropoff_location_id', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'improvement_surcharge', 'tip_amount', 'tolls_amount', 'total_amount'] col_names = pd.Series(df.columns.values) col_names_in = col_names.isin(expected_names) if verbose >= 3: output('Column names: ') print(col_names) print('') if not all(col_names_in): col_names_not_in = [not i for i in col_names_in] output('Error : Unexpected column name(s).', 'clean_column_names') print(col_names[col_names_not_in]) raise ValueError('Unexpected column name(s).') return df
def create_forecast_err(db_path, load_table, forecast_table, overwrite=False, verbose=0): """Creates a table and dataframe of load forecast error. Error is calculated as relative (fractional) error with respect to the actual load, i.e. error = (forecast - actual) / actual. Parameters ---------- db_path : str Path to sqlite database to create or connect to. load_table : str Name of the db table containing actual load data (i.e. based on palIntegrated data). forecast_table : str Name of the db table containing load forecast data (i.e. based on isolf). overwrite : bool Defines whether or not to overwrite existing table. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe written to db table. Notes ----- """ if verbose >= 1: output('Started creating or updating forecast_error table.') # query actual loads sql = """ SELECT datetimeUTC, zone_id, integrated_load FROM {load_table} ;""".format(load_table=load_table) df_load = query(db_path, sql) df_load['datetimeUTC'] = pd.to_datetime(df_load['datetimeUTC']) df_load = df_load.set_index(['datetimeUTC', 'zone_id']) # query forecast loads sql = """ SELECT datetimeUTC, zone_id, load_forecast_p0, load_forecast_p1, load_forecast_p2, load_forecast_p3, load_forecast_p4, load_forecast_p5, load_forecast_p6 FROM {forecast_table} ;""".format(forecast_table=forecast_table) df_forecast = query(db_path, sql) df_forecast['datetimeUTC'] = pd.to_datetime(df_forecast['datetimeUTC']) df_forecast = df_forecast.set_index(['datetimeUTC', 'zone_id']) # calculate relative forecast errors for each forecast horizon (p0-p6) df = pd.merge(df_load, df_forecast, how='inner', left_index=True, right_index=True) del df_load, df_forecast forecast_cols = ['load_forecast_p{i}'.format(i=i) for i in range(7)] for i, col in enumerate(forecast_cols): df['forecast_error_p{i}'.format(i=i)] = (df[col] - df['integrated_load']) / df['integrated_load'] df = df.drop(forecast_cols, axis=1) # create table sql = """ CREATE TABLE IF NOT EXISTS forecast_error ( rowid INTEGER PRIMARY KEY, datetimeUTC TEXT, zone_id INTEGER, integrated_load REAL, forecast_error_p0 REAL, forecast_error_p1 REAL, forecast_error_p2 REAL, forecast_error_p3 REAL, forecast_error_p4 REAL, forecast_error_p5 REAL, forecast_error_p6 REAL ); """ indexes = ['CREATE UNIQUE INDEX IF NOT EXISTS ' 'forecast_error_datetimeUTC_zone_id ON forecast_error ' '(datetimeUTC, zone_id);' ] create_table(db_path=db_path, table='forecast_error', create_sql=sql, indexes=indexes, overwrite=overwrite, verbose=verbose) # write data to table df_write = df.reset_index() df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize( None) df_to_table(db_path, df_write, table='forecast_error', overwrite=False, verbose=verbose) if verbose >= 1: output('Finished creating or updating forecast_error table. Dataframe ' 'shape is ' + str(df.shape) + '.') return df
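# A small numeric check of the relative-error definition above: a forecast of
# 105 MW against an actual integrated load of 100 MW gives
# (105 - 100) / 100 = 0.05, i.e. a 5% over-forecast.
def _example_forecast_error():
    forecast, actual = 105.0, 100.0
    return (forecast - actual) / actual  # 0.05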
def create_expected_load(db_path, summary_table, zones_path, datetimeUTC_range_ref, datetimeUTC_range_excl=None, title=None, overwrite=False, verbose=0): """Creates a table and dataframe of expected data from the summary_table table. Expectation includes mean and variance of integrated_load for the specified reference datetime range. Expectation is calculated for every possible dayofweek-hour-zone combination, with NaNs for combinations missing data. Parameters ---------- db_path : str Path to sqlite database to create or connect to. summary_table : str Name of the db summary table containing data to calculate expected integrated_load from. zones_path : str Path to csv containing all zone_id values (maps zone_id to zone_name). datetimeUTC_range_ref : tuple Specifies the start and end of the reference time period to use when calculating expected values (inclusive). Specify as a 2-element tuple of UTC datetime strings with year-month-day and hour:minutes:seconds. datetimeUTC_range_excl : tuple Specifies the start and end of the time period to exclude from the reference time period. Specify as a 2-element tuple of UTC datetime strings with year-month-day and hour:minutes:seconds. title : str Defines the suffix of the expected_load_[title] table to be created. overwrite : bool Defines whether or not to overwrite existing table. verbose : int Defines verbosity for output statements. Returns ------- df_exp : dataframe Dataframe written to db table. Notes ----- datetimeUTC_range_ref items should be UTC, but in naive format (since sqlite does not handle time zones). For example, use the following to select reference data for Jan. 1 - Dec. 31 2012 (Eastern): start = pd.Timestamp('2012-01-01 00:00:00', tz='America/New_York') end = pd.Timestamp('2012-12-31 23:59:59', tz='America/New_York') datetimeUTC_range_ref = (start.tz_convert(tz='UTC').tz_localize(None), end.tz_convert(tz='UTC').tz_localize(None)) """ table = 'expected_load_{title}'.format(title=title) if verbose >= 1: output('Started creating or updating {table} table.'.format( table=table)) # query range of zone_id values to consider df_zones = pd.read_csv(zones_path) zones = df_zones['zone_id'].unique() del df_zones # query reference data if datetimeUTC_range_excl: sql = """ SELECT datetimeUTC, zone_id, integrated_load FROM {summary_table} WHERE (datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}") AND (datetimeUTC NOT BETWEEN "{start_datetime_excl}" AND "{end_datetime_excl}") ;""".format(summary_table=summary_table, start_datetime=datetimeUTC_range_ref[0], end_datetime=datetimeUTC_range_ref[1], start_datetime_excl=datetimeUTC_range_excl[0], end_datetime_excl=datetimeUTC_range_excl[1]) else: sql = """ SELECT datetimeUTC, zone_id, integrated_load FROM {summary_table} WHERE (datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}") ;""".format(summary_table=summary_table, start_datetime=datetimeUTC_range_ref[0], end_datetime=datetimeUTC_range_ref[1]) df = query(db_path, sql) # add dayofweek (0 = Monday) and hour (0-23) df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']) df['datetimeUTC'] = [dtUTC.tz_localize(tz='UTC') for dtUTC in df['datetimeUTC']] df['datetime'] = [dtUTC.tz_convert(tz='America/New_York') for dtUTC in df['datetimeUTC']] df['dayofweek'] = df['datetime'].dt.dayofweek df['hour'] = df['datetime'].dt.hour # calculate mean and variance for each dayofweek-hour-zone combination expected = [] for dayofweek in range(7): for hour in range(24): for zone in zones: # filter to current dayofweek, hour, and zone df_filter = df[(df['dayofweek'] == dayofweek) & (df['hour'] == hour) & (df['zone_id'] == zone)] # calculate mean and variance if not df_filter.empty: mean_integrated_load = np.mean( df_filter['integrated_load'].values) var_integrated_load = np.var( df_filter['integrated_load'].values) num_rows = df_filter.shape[0] expected.append([dayofweek, hour, zone, mean_integrated_load, var_integrated_load, num_rows]) else: expected.append([dayofweek, hour, zone, np.nan, np.nan, np.nan]) df_exp = pd.DataFrame(expected, columns=['dayofweek', 'hour', 'zone_id', 'mean_integrated_load', 'var_integrated_load', 'num_rows']) df_exp = df_exp.set_index(['dayofweek', 'hour', 'zone_id']) # create table sql = """ CREATE TABLE IF NOT EXISTS {table} ( rowid INTEGER PRIMARY KEY, dayofweek INTEGER, hour INTEGER, zone_id INTEGER, mean_integrated_load FLOAT, var_integrated_load FLOAT, num_rows INTEGER ); """.format(table=table) create_table(db_path=db_path, table=table, create_sql=sql, indexes=[], overwrite=overwrite, verbose=verbose) # write data to table df_to_table(db_path, df_exp.reset_index(), table=table, overwrite=False, verbose=verbose) if verbose >= 1: output('Finished creating or updating {table} table. Dataframe shape ' 'is '.format(table=table) + str(df_exp.shape) + '.') return df_exp
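# A hedged usage sketch for create_expected_load, reusing the reference-range
# construction from the Notes above (the database path, table name, zones csv,
# and title are hypothetical).
def _example_create_expected_load():
    start = pd.Timestamp('2012-01-01 00:00:00', tz='America/New_York')
    end = pd.Timestamp('2012-12-31 23:59:59', tz='America/New_York')
    ref_range = (start.tz_convert(tz='UTC').tz_localize(None),
                 end.tz_convert(tz='UTC').tz_localize(None))
    return create_expected_load(db_path='data/nyiso.db', summary_table='load',
                                zones_path='data/nyiso_zones.csv',
                                datetimeUTC_range_ref=ref_range,
                                title='ref2012', verbose=1)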
def clean_isolf(df, to_zoneid=False, zones_path=None, verbose=0): """Cleans a dataframe of nyiso load forecast data. Cleaning involves: renaming columns, converting datetimes, setting indexes, removing columns, converting to zone_id, and reshaping. Parameters ---------- df : dataframe Dataframe to clean. to_zoneid : bool If True, converts zone names to zone ids, based on zones_path csv (zones_path must be defined if True). If False, leaves zones_name column. zones_path : str or None Path to csv mapping zone_id to zone_name. Required if to_zoneid is True. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Cleaned dataframe. Notes ----- """ if verbose >= 2: output('Started cleaning dataframe.') # clean column names df = df.rename(columns={'Time Stamp': 'datetimeNY', 'Capitl': 'CAPITL', 'Centrl': 'CENTRL', 'Dunwod': 'DUNWOD', 'Genese': 'GENESE', 'Hud Vl': 'HUD VL', 'Longil': 'LONGIL', 'Mhk Vl': 'MHK VL', 'Millwd': 'MILLWD', 'N.Y.C.': 'N.Y.C.', 'North': 'NORTH', 'West': 'WEST'}) # clean datetime df['datetimeNY'] = pd.to_datetime(df['datetimeNY'], format='%m/%d/%Y %H:%M') if any(df.duplicated('datetimeNY')): # deal with ambiguous time zone due to end of DST (two 01:00 entries) transition_idx = next( i for i, val in enumerate(df.duplicated('datetimeNY')) if val) datetimes = [] for i, val in enumerate(df['datetimeNY']): if i < transition_idx: datetimes.append(val.tz_localize(tz='America/New_York', ambiguous=True)) else: datetimes.append(val.tz_localize(tz='America/New_York', ambiguous=False)) df['datetimeNY'] = datetimes else: df['datetimeNY'] = [datetime.tz_localize(tz='America/New_York') for datetime in df['datetimeNY']] # set index df = df.set_index('datetimeNY') # remove columns df = df.drop(['NYISO'], axis=1) # clean zone_id if to_zoneid: zone_col = 'zone_id' if zones_path: df_zones = pd.read_csv(zones_path) zones = dict(zip(df_zones['name'], df_zones['zone_id'])) df = df.rename(columns=zones) else: raise ValueError('Must provide zones_path argument if to_zoneid is ' 'True.') else: zone_col = 'zone_name' # reshape dataframe (one load_forecast_p[x] column per forecast date) s = df.stack() s.index.names = ['datetimeNY', zone_col] df = pd.DataFrame(s.rename('load_forecast')) df = df.sort_index(level=0) dates = df.index.get_level_values(0).date for i, date in enumerate(pd.unique(dates)): col_name = 'load_forecast_p' + str(i) df[col_name] = np.nan s = df[dates == date]['load_forecast'].copy() s = s.rename(col_name) df.update(s) df = df.drop('load_forecast', axis=1) # add utc column datetimeUTC = [datetime.tz_convert('UTC') for datetime in df.index.get_level_values(0)] df.insert(0, 'datetimeUTC', datetimeUTC) if verbose >= 2: output('Finished cleaning dataframe.') return df
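# A small illustration of the DST-ambiguity handling above: when clocks fall
# back (e.g. 2012-11-04 in New York), a naive 01:00 timestamp occurs twice,
# and tz_localize needs the `ambiguous` flag to pick the first (EDT) or
# second (EST) occurrence.
def _example_dst_ambiguous():
    ts = pd.Timestamp('2012-11-04 01:00:00')
    first = ts.tz_localize(tz='America/New_York', ambiguous=True)    # EDT, UTC-4
    second = ts.tz_localize(tz='America/New_York', ambiguous=False)  # EST, UTC-5
    return first, second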
def clean_palint(df, to_zoneid=False, zones_path=None, verbose=0): """Cleans a dataframe of nyiso integrated real-time actual load data. Cleaning involves: renaming columns, converting datetimes (assumes ny timezone), converting to zone_id, removing columns, and setting indexes. Parameters ---------- df : dataframe Dataframe to clean. to_zoneid : bool If True, converts zone names to zone ids, based on zones_path csv (zones_path must be defined if True). If False, leaves zones_name column. zones_path : str or None Path to csv mapping zone_id to zone_name. Required if to_zoneid is True. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Cleaned dataframe. Notes ----- """ if verbose >= 2: output('Started cleaning dataframe.') # clean column names df = df.rename(columns={'Time Stamp': 'datetime', 'Time Zone': 'timezone', 'Name': 'zone_name', 'Integrated Load': 'integrated_load'}) # clean datetime (convert local EDT/EST timestamps to UTC via fixed offsets) df['datetime'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M:%S') offset = df['timezone'].replace({'EDT': pd.Timedelta('4 hours'), 'EST': pd.Timedelta('5 hours')}) df['datetimeUTC'] = df['datetime'] + offset df['datetimeUTC'] = [dtUTC.tz_localize(tz='UTC') for dtUTC in df['datetimeUTC']] # clean zone_id if to_zoneid: zone_col = 'zone_id' if zones_path: df_zones = pd.read_csv(zones_path) zones = dict(zip(df_zones['name'], df_zones['zone_id'])) df['zone_id'] = df['zone_name'].replace(zones) else: raise ValueError('Must provide zones_path argument if to_zoneid is ' 'True.') else: zone_col = 'zone_name' # remove columns df = df[['datetimeUTC', zone_col, 'integrated_load']] # set index df = df.set_index(['datetimeUTC', zone_col]) df = df.sort_index(level=0) if verbose >= 2: output('Finished cleaning dataframe.') return df
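# A quick check of the fixed-offset conversion above: NYISO stamps rows with
# 'EDT' (UTC-4) or 'EST' (UTC-5), so adding the offset to the local timestamp
# yields UTC, e.g. 2012-11-04 01:00 EST -> 2012-11-04 06:00 UTC.
def _example_palint_offset():
    local = pd.Timestamp('2012-11-04 01:00:00')
    return local + pd.Timedelta('5 hours')  # Timestamp('2012-11-04 06:00:00')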
def max_cross_corr(df, col1, col2, zone_col, shifts, min_overlap, verbose=0): """Creates a dataframe containing the time shift that maximizes cross-correlation between two time series, the max cross-correlation value, and the number of overlapping data points in those series. Parameters ---------- df : Dataframe Dataframe containing time series data (e.g. from create_timeseries). Assumes dataframe is multi-indexed by zone_col and timedelta (in hours). col1 : str Name of column containing first time series. col2 : str Name of column containing second time series. This is the shifted time series, where col2_shifted = col2 + shift. zone_col : str Name of spatial zone index. shifts : list List of time shifts to apply to 2nd time series (in hours). min_overlap : int Minimum number of overlapping data points (after the 2nd series is time shifted) needed to calculate cross-correlation. verbose : int Defines verbosity for output statements. Returns ------- df_max_rho : dataframe Dataframe of max cross-correlations and associated shifts and counts. df_rho : dataframe Dataframe of cross-correlations and associated shifts and counts for all shifts. Notes ----- """ df_rho = pd.DataFrame(columns=['shift', zone_col, 'rho']) df_count = pd.DataFrame(columns=['shift', zone_col, 'count']) skipped = [] zones = pd.unique(df.index.get_level_values(zone_col)) for shift in shifts: for zone in zones: s_y1 = df[col1].xs(zone, level=0).dropna() s_y2 = df[col2].xs(zone, level=0).dropna() s_y1.index = pd.to_timedelta(s_y1.index.values, unit='h') s_y2.index = pd.to_timedelta(s_y2.index.values, unit='h') # shift 2nd time series s_y2_shift = s_y2.shift(1, freq=pd.Timedelta(shift, unit='h')) # skip zone if not enough overlapping data points (after shift) df_zone = pd.concat([s_y1, s_y2_shift], axis=1).dropna() num_overlap = df_zone.shape[0] if num_overlap < min_overlap: df_rho = df_rho.append( { 'shift': shift, zone_col: zone, 'rho': np.nan }, ignore_index=True) skipped.append((shift, zone)) continue # normalized cross-correlation rho = cross_corr(df_zone[col1].values, df_zone[col2].values, True) df_rho = df_rho.append({ 'shift': shift, zone_col: zone, 'rho': rho }, ignore_index=True) df_count = df_count.append( { 'shift': shift, zone_col: zone, 'count': num_overlap }, ignore_index=True) # reshape and get max rhos and associated shifts and counts df_rho = df_rho.set_index(['shift', zone_col]) df_rho_reshape = df_rho.reset_index() df_rho_reshape = df_rho_reshape.pivot(index='shift', columns=zone_col, values='rho') s_max_shifts = df_rho_reshape.idxmax(axis=0) s_max_shifts.name = 'max-shift' s_max_rhos = df_rho_reshape.max(axis=0) s_max_rhos.name = 'max-rho' df_count = df_count.set_index(['shift', zone_col]) max_counts = [] for zone in zones: max_shift = s_max_shifts.loc[zone] if np.isnan(max_shift): max_counts.append(np.nan) else: max_counts.append(df_count.loc[max_shift, zone].item()) s_max_counts = pd.Series(max_counts, index=zones) s_max_counts.name = 'max-count' df_max_rho = pd.concat([s_max_rhos, s_max_shifts, s_max_counts], axis=1) if verbose >= 2: output( 'Skipped {num_skipped} (shift, {zone}) combos: {skipped}'.format( num_skipped=len(skipped), zone=zone_col, skipped=skipped)) return df_max_rho, df_rho
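# The cross_corr helper used above is defined elsewhere in this package; a
# minimal sketch of what a normalized (Pearson-style) cross-correlation
# computes, under that assumption (x and y are equal-length numpy arrays):
def _cross_corr_sketch(x, y):
    x = (x - np.mean(x)) / (np.std(x) * len(x))
    y = (y - np.mean(y)) / np.std(y)
    return np.sum(x * y)  # equivalent to np.corrcoef(x, y)[0, 1]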
def load_nyiso(startdate, enddate, db_path, verbose=0): """Query and clean nyiso load forecast error data for the specified date range from a sqlite database. Assumes the database contains a forecast_error table created using create_forecast_err. Parameters ---------- startdate : Timestamp Start date to include data from (inclusive), specified as a timezone-aware Pandas Timestamp object. E.g. startdate = pd.Timestamp('2012-10-28 00:00:00', tz='America/New_York') enddate : Timestamp End date to include data from (exclusive), specified as a timezone-aware Pandas Timestamp object. e.g. enddate = pd.Timestamp('2012-11-03 00:00:00', tz='America/New_York') db_path : str Path to sqlite database containing table. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe of load forecast error data, indexed by zone and datetime. Notes ----- Sqlite date queries are inclusive for start and end, forecast_error datetimes are UTC. """ if verbose >= 1: output('Started query.') # convert datetimes startdateUTC = startdate.tz_convert('UTC') enddateUTC = enddate.tz_convert('UTC') - pd.Timedelta('1 second') startdate_sql = startdateUTC.strftime("%Y-%m-%d %H:%M:%S") enddate_sql = enddateUTC.strftime("%Y-%m-%d %H:%M:%S") # load nyiso load data sql = """ SELECT datetimeUTC, zone_id AS nyiso_zone, forecast_error_p0 AS err0 FROM forecast_error WHERE datetimeUTC BETWEEN "{startdate_sql}" AND "{enddate_sql}" ;""".format(startdate_sql=startdate_sql, enddate_sql=enddate_sql) df = query(db_path, sql) # convert datetimes df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']) df['datetimeUTC'] = [ datetime.tz_localize(tz='UTC') for datetime in df['datetimeUTC'] ] df['datetimeNY'] = [ datetime.tz_convert('America/New_York') for datetime in df['datetimeUTC'] ] # add and drop columns df['percent-err0'] = df['err0'] * 100 df = df.drop(['datetimeUTC'], axis=1) # index and sort df = df.set_index(['nyiso_zone', 'datetimeNY']) df = df.sort_index(level=0) if verbose >= 1: output('[min, max] forecast error datetimeNY: [' + str(min(df.index.get_level_values(level=1))) + ', ' + str(max(df.index.get_level_values(level=1))) + '].') output('[min, max] forecast error: [' + str(np.nanmin(df['err0'])) + ', ' + str(np.nanmax(df['err0'])) + '].') output('Finished query.') return df
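# A hedged usage sketch for load_nyiso (the database path is hypothetical):
def _example_load_nyiso():
    startdate = pd.Timestamp('2012-10-28 00:00:00', tz='America/New_York')
    enddate = pd.Timestamp('2012-11-03 00:00:00', tz='America/New_York')
    return load_nyiso(startdate, enddate, db_path='data/nyiso.db', verbose=1)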
def dl_urls(url_path, dl_dir, taxi_type='all', verbose=0): """Downloads NYC TLC taxi record files for the specified taxi type into the specified directory, based on a text file containing urls. Parameters ---------- url_path : str or None Path to text file containing NYC TLC taxi record file urls to download from. Does nothing if None. dl_dir : str Path of directory to download files to. taxi_type : str Taxi type to download files for ('fhv', 'green', 'yellow', or 'all'). verbose : int Defines verbosity for output statements. Returns ------- dl_num : int Number of files downloaded. Notes ----- url_path = '/Users/httran/Documents/projects/twitterinfrastructure/data /raw/nyctlc-triprecorddata/raw_data_urls.txt' dl_dir = '/Users/httran/Documents/projects/twitterinfrastructure/data/raw /nyctlc-triprecorddata/data/' """ if not url_path: return 0 if verbose >= 1: output('Started downloading taxi record files from ' + url_path + ' to ' + dl_dir) # get existing files in directory files = get_regex_files(dl_dir, pattern=taxi_regex_patterns(taxi_type='all')) # get urls df_urls = pd.read_table(url_path, header=None, names=['url']) urls = df_urls.values # download files for specified taxi type (skip already existing ones) dl_num = 0 pattern = taxi_regex_patterns(taxi_type) for url in urls: parts = url[0].split('/') fname = parts[-1] if pattern.match(fname) and (fname not in files): urlretrieve(url[0], dl_dir + fname) output('downloaded: ' + fname) dl_num += 1 if verbose >= 1: output('Downloaded ' + str(dl_num) + ' taxi record files from ' + url_path + ' to ' + dl_dir) return dl_num
def load_nyctlc_zone_date(startdate, enddate, trip_type, trip_count_filter, db_path, verbose=0): """Query and clean nyctlc dropoff or pickup data for the specified date range from a sqlite database, grouped by zone and date. Assumes the database contains a standard_zonedropoff_hour_sandy or standard_zonepickup_hour_sandy table created using create_standard_zone_hour. Parameters ---------- startdate : Timestamp Start date to include data from (inclusive), specified as a timezone-aware Pandas Timestamp object. E.g. startdate = pd.Timestamp('2012-10-28 00:00:00', tz='America/New_York') enddate : Timestamp End date to include data from (exclusive), specified as a timezone-aware Pandas Timestamp object. e.g. enddate = pd.Timestamp('2012-11-03 00:00:00', tz='America/New_York') trip_type : str Trip type: 'dropoff' or 'pickup'. trip_count_filter : int Minimum number of trips required to load a data point. db_path : str Path to sqlite database containing table. verbose : int Defines verbosity for output statements. Returns ------- df_taxi : dataframe Dataframe of daily mean z-scores, grouped by zone and date. Notes ----- Sqlite date queries are inclusive for start and end, datetimes in nyctlc database are local (i.e. NY timezone). """ df_taxi = load_nyctlc_zone_hour(startdate, enddate, trip_type, trip_count_filter, db_path, verbose=verbose) # remove index, adjust datetime to date, and group by zone and date df_taxi = df_taxi.reset_index() df_taxi['datetimeNY'] = pd.to_datetime(df_taxi['datetimeNY']).dt.date df_taxi = df_taxi.groupby(['location_id', 'datetimeNY']).mean() if verbose >= 1: if trip_type == 'dropoff': output('[min, max] taxi pace and trips mean z-score: [' + str(np.nanmin(df_taxi['zpace-drop'])) + ', ' + str(np.nanmax(df_taxi['zpace-drop'])) + '], [' + str(np.nanmin(df_taxi['ztrips-drop'])) + ', ' + str(np.nanmax(df_taxi['ztrips-drop'])) + '].') elif trip_type == 'pickup': output('[min, max] taxi pace and trips mean z-score: [' + str(np.nanmin(df_taxi['zpace-pick'])) + ', ' + str(np.nanmax(df_taxi['zpace-pick'])) + '], [' + str(np.nanmin(df_taxi['ztrips-pick'])) + ', ' + str(np.nanmax(df_taxi['ztrips-pick'])) + '].') return df_taxi
def clean_yellow(df, year, month, verbose=0): """Cleans a dataframe of NYC TLC yellow taxi record data. Assumes all data is from the same year. Cleaning involves: - updating column names and adding taxi_type column - replacing vendor_id values with IDs - replacing store_and_fwd_flag values with IDs - replacing payment_type values with IDs - replacing lat/lon values outside of possible ranges with nans Parameters ---------- df : dataframe Dataframe to clean. year : int Year data comes from. month : int Month data comes from. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Cleaned dataframe. Notes ----- vendor_id = {1: ['CMT', 'Creative Mobile Technologies, LLC'], 2: 'VeriFone Inc.', 3: 'DDS', 4: 'VTS'} """ if verbose >= 1: output('Started cleaning dataframe for ' + str(year) + '-' + str(month) + '. ') nrows_removed = 0 # clean column names df = clean_column_names(df, year, verbose) # clean datetime columns df, nrows_removed_datetime = clean_datetime(df, year, month, verbose) nrows_removed += nrows_removed_datetime # clean vendor_id column df = clean_vendor_id(df, verbose) # clean store_and_fwd_flag column df = clean_store_and_fwd_flag(df, verbose) # clean payment_type column df = clean_payment_type(df, verbose) # clean lat/lon columns and add calculated trip columns df = clean_lat_lon(df, verbose) df = add_trip_columns(df, verbose) if verbose >= 1: output('Cleaned dataframe for ' + str(year) + '-' + str(month) + '. ' + str(nrows_removed) + ' rows removed due to errors during clean.') return df
def import_load_forecast(dl_dir, db_path, zones_path=None, overwrite=False, verbose=0): """Loads, cleans, and imports nyiso load forecast data into a sqlite database. load_forecast_px column represents the forecast for the current row (i.e. datetime and zone) x days prior. E.g. the _p2 column for a row with datetime of 10/5/2012 01:00:00 contains the forecast for 10/5/2012 01:00:00 from two days before (i.e. 10/3/2012). Parameters ---------- dl_dir : str Path to the directory containing downloaded zip files. Imports all files in directory. Assumes each zip file is of the following format: 'yearmonth01isolf_csv.zip' (e.g. '20121001isolf_csv.zip'). db_path : str Path to sqlite database. zones_path : str or None Path to csv mapping zone_id to zone_name. Required, since forecast zone names are converted to zone ids on import. overwrite : bool Defines whether or not to overwrite existing database tables. verbose : int Defines verbosity for output statements. Returns ------- import_num : int Number of files imported into database. Notes ----- """ # get files pattern = re.compile(r'\d{8}isolf_csv\.zip') files = get_regex_files(dl_dir, pattern=pattern, verbose=verbose) # create load table (if needed) create_sql = """ CREATE TABLE IF NOT EXISTS load_forecast ( rowid INTEGER PRIMARY KEY, datetimeNY TEXT, datetimeUTC TEXT, zone_id INTEGER, load_forecast_p0 REAL, load_forecast_p1 REAL, load_forecast_p2 REAL, load_forecast_p3 REAL, load_forecast_p4 REAL, load_forecast_p5 REAL, load_forecast_p6 REAL ); """ indexes = ['CREATE INDEX IF NOT EXISTS load_forecast_datetimeNY_zone_id ' 'ON load_forecast (datetimeNY, zone_id);', 'CREATE UNIQUE INDEX IF NOT EXISTS ' 'load_forecast_datetimeUTC_zone_id ON load_forecast ' '(datetimeUTC, zone_id);' ] create_table(db_path, 'load_forecast', create_sql, indexes=indexes, overwrite=overwrite, verbose=verbose) # load, clean, and import load data into table import_num = 0 for file in files: if verbose >= 1: output('Started importing \"' + file + '\".') date = pd.Timestamp(file[0:8]).date() last_day = calendar.monthrange(date.year, date.month)[1] start_date = pd.Timestamp(year=date.year, month=date.month, day=1) end_date = pd.Timestamp(year=date.year, month=date.month, day=last_day) dates = pd.date_range(start_date, end_date) for date in dates: date_str = date.strftime('%Y%m%d') # load and clean data for current date df = load_loaddate(date_str, load_type='isolf', dl_dir=dl_dir, verbose=verbose) df = clean_isolf(df, to_zoneid=True, zones_path=zones_path, verbose=verbose) # write to database (upsert one forecast-horizon column per row) conn = connect_db(db_path) c = conn.cursor() df_write = df.reset_index() df_write['datetimeNY'] = df_write['datetimeNY'].dt.tz_localize(None) df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize( None) for index, row in df_write.iterrows(): dtNY = row['datetimeNY'] dtUTC = row['datetimeUTC'] zone = row['zone_id'] val = row.drop( ['datetimeNY', 'zone_id', 'datetimeUTC']).dropna() col_name = val.index.values[0] sql = """ INSERT INTO load_forecast (datetimeNY, datetimeUTC, zone_id, {col_name}) VALUES ("{dtNY}", "{dtUTC}", {zone}, {val}) ON CONFLICT(datetimeUTC, zone_id) DO UPDATE SET {col_name} = excluded.{col_name} ;""".format(col_name=col_name, val=val[0], dtNY=dtNY, dtUTC=dtUTC, zone=zone) c.execute(sql) conn.commit() conn.close() import_num += 1 if verbose >= 1: output('Finished importing \"' + file + '\".') output('Finished importing ' + str(import_num) + ' files from \"{dl_dir}\".'.format(dl_dir=dl_dir)) return import_num
def load_yellow(path, nrows=None, usecols=None, verbose=0): """Loads an NYC TLC yellow taxi record file (one month of data) into a dataframe. Parameters ---------- path : str Path to NYC TLC taxi record file to load. nrows : int or None Number of rows to read. Set to None to read all rows. usecols : list List of column names to include. Specify columns names as strings. Column names can be entered based on names found in original tables for the year specified or names found in the trips table. Set to None to read all columns. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe of one month of cleaned yellow taxi data. year : int Year data is from. month : int Month data is from. Notes ----- path = '/Users/httran/Documents/projects/twitterinfrastructure/data/raw /nyctlc-triprecorddata/data/yellow_tripdata_2012-01.csv' """ if verbose >= 1: output('Started loading to dataframe: ' + path + '.') parts = re.split('[/_-]', path) year = int(parts[-2]) parts2 = parts[-1].split('.') month = int(parts2[0]) # adjusts usecols to correctly map to column names for the year data is from if usecols: col_dict = col_names_dict(year) usecols_year = [] for col in usecols: if col in col_dict: usecols_year.append(col) else: col_name_year = [key for key, val in col_dict.items() if val == col] if col_name_year: usecols_year.append(col_name_year[0]) elif verbose > 1: output('No matching usecols column name "' + col + '" for ' 'year ' + str(year) + '.', 'load_yellow') else: pass else: usecols_year = usecols # read file into dataframe df = pd.read_csv(path, nrows=nrows, usecols=usecols_year, error_bad_lines=False, warn_bad_lines=False) if verbose >= 1: output('Finished loading to dataframe: ' + path + '.') return df, year, month
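# A small illustration of the filename parsing in load_yellow above, using the
# 'yellow_tripdata_YYYY-MM.csv' naming convention (assumes this module's `re`
# import):
def _example_parse_yellow_path():
    path = 'data/yellow_tripdata_2012-01.csv'
    parts = re.split('[/_-]', path)
    year = int(parts[-2])                  # 2012
    month = int(parts[-1].split('.')[0])   # 1
    return year, month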
def import_trips(url_path, dl_dir, db_path, taxi_type, nrows=None, usecols=None, overwrite=False, verbose=0): """Downloads, cleans, and imports nyc tlc taxi record files for the specified taxi type into a sqlite database. Parameters ---------- url_path : str or None Path to text file containing nyc tlc taxi record file urls to download from. Set to None to skip download. dl_dir : str Path of directory to download files to or load files from. db_path : str Path to sqlite database. taxi_type : str Taxi type to create regex for ('fhv', 'green', 'yellow', or 'all'). nrows : int or None Number of rows to read. Set to None to read all rows. usecols : list List of column names to include. Specify columns names as strings. Column names can be entered based on names found in original tables for the year specified or names found in the trips table. Set to None to read all columns. overwrite : bool Defines whether or not to overwrite existing database tables. verbose : int Defines verbosity for output statements. Returns ------- dl_num : int Number of files downloaded (0 if url_path is None). import_num : int Number of files imported into database. Notes ----- """ # download taxi record files if url_path: dl_num = dl_urls(url_path, dl_dir, taxi_type, verbose=verbose) else: dl_num = 0 # get taxi record files files = get_regex_files(dl_dir, taxi_regex_patterns(taxi_type), verbose=verbose) # create trips table (if needed) create_sql = """ CREATE TABLE IF NOT EXISTS trips ( trip_id INTEGER PRIMARY KEY, taxi_type INTEGER, vendor_id INTEGER, pickup_datetime TEXT, dropoff_datetime TEXT, passenger_count INTEGER, trip_distance REAL, pickup_longitude REAL, pickup_latitude REAL, pickup_location_id INTEGER, dropoff_longitude REAL, dropoff_latitude REAL, dropoff_location_id INTEGER, trip_duration REAL, trip_pace REAL, trip_straightline REAL, trip_windingfactor REAL ); """ indexes = ['CREATE INDEX IF NOT EXISTS trips_pickup_datetime ON trips ' '(pickup_datetime);'] create_table(db_path, 'trips', create_sql, indexes=indexes, overwrite=overwrite, verbose=verbose) # load, clean, and import taxi files into table import_num = 0 for file in files: if verbose >= 1: output('Started importing ' + file + '.') if taxi_type == 'fhv': # fhv cleaning not yet implemented; import an empty placeholder df = pd.DataFrame({'taxi_type': []}) elif taxi_type == 'green': # green cleaning not yet implemented; import an empty placeholder df = pd.DataFrame({'taxi_type': []}) elif taxi_type == 'yellow': df, year, month = load_yellow(dl_dir + file, nrows=nrows, usecols=usecols, verbose=verbose) df = clean_yellow(df, year, month, verbose=verbose) import_num += 1 else: output('Unknown taxi_type.', fn_str='import_trips') df = pd.DataFrame({'taxi_type': []}) df_to_table(db_path, df, table='trips', overwrite=False, verbose=verbose) if verbose >= 1: output('Imported ' + file + '.') output('Finished importing ' + str(import_num) + ' files.') return dl_num, import_num
def create_standard_load(db_path, summary_table, expected_table, datetimeUTC_range, min_num_rows=5, title=None, overwrite=False, verbose=0): """Creates a table and dataframe of standardized data from the summary_table table. Standardization is relative to the mean and variance of corresponding data from the specified reference datetime range (saved as an expected_load_[] table in the database). Parameters ---------- db_path : str Path to sqlite database to create or connect to. summary_table : str Name of the db table containing summary data to calculate standardized integrated_load for. expected_table : str Name of the db table containing expected data (i.e. mean and variance) to calculate standardized integrated_load from. datetimeUTC_range : tuple Specifies the start and end of the time period to calculate standardized integrated_load for (inclusive). Specify as a 2-element tuple of UTC datetime strings with year-month-day and hour:minutes:seconds. E.g. ('2012-10-29 00:00:00', '2012-11-03 23:59:59') to calculate standardized integrated_load for times between 10/29/2012 and 11/03/2012. min_num_rows : int Defines the minimum number of rows needed in the reference set to standardize data. title : str Defines the suffix of the standard_load_[title] table to be created. overwrite : bool Defines whether or not to overwrite existing table. verbose : int Defines verbosity for output statements. Returns ------- df_std : dataframe Dataframe written to db table. Notes ----- """ table = 'standard_load_{title}'.format(title=title) if verbose >= 1: output('Started creating or updating {table} table.'.format( table=table)) # query expected values calculated from at least min_num_rows data points sql = """ SELECT * FROM {expected_table} WHERE num_rows >= {min_num_rows};""".format( expected_table=expected_table, min_num_rows=min_num_rows) df_exp = query(db_path, sql) df_exp = df_exp[['dayofweek', 'hour', 'zone_id', 'mean_integrated_load', 'var_integrated_load']] # query data to standardize sql = """ SELECT datetimeUTC, zone_id, integrated_load FROM {summary_table} WHERE datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}"; """.format(summary_table=summary_table, start_datetime=datetimeUTC_range[0], end_datetime=datetimeUTC_range[1]) df = query(db_path, sql) # add dayofweek (0 = Monday) and hour (0-23) df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']) df['datetimeUTC'] = [dtUTC.tz_localize(tz='UTC') for dtUTC in df['datetimeUTC']] df['datetime'] = [dtUTC.tz_convert(tz='America/New_York') for dtUTC in df['datetimeUTC']] df['dayofweek'] = df['datetime'].dt.dayofweek df['hour'] = df['datetime'].dt.hour # standardize relative to reference mean and variance df = pd.merge(df, df_exp, how='left', on=['dayofweek', 'hour', 'zone_id']) del df_exp df_std = df[['datetimeUTC', 'zone_id']].copy() df_std['z_integrated_load'] = (df['integrated_load'] - df['mean_integrated_load']) / df['var_integrated_load'] df_std = df_std.set_index(['datetimeUTC', 'zone_id']) del df # create table sql = """ CREATE TABLE IF NOT EXISTS {table} ( rowid INTEGER PRIMARY KEY, datetimeUTC TEXT, zone_id INTEGER, z_integrated_load FLOAT ); """.format(table=table) create_table(db_path=db_path, table=table, create_sql=sql, indexes=[], overwrite=overwrite, verbose=verbose) # write data to table df_write = df_std.reset_index() df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize( None) df_to_table(db_path, df_write, table=table, overwrite=False, verbose=verbose) if verbose >= 1: output('Finished creating or updating {table} table. Dataframe shape ' 'is '.format(table=table) + str(df_std.shape) + '.') return df_std
def load_nyctlc_zone_hour(startdate, enddate, trip_type, trip_count_filter, db_path, verbose=0): """Query and clean nyctlc dropoff or pickup data for the specified date range from a sqlite database, grouped by zone and hour. Assumes the database contains a standard_zonedropoff_hour_sandy or standard_zonepickup_hour_sandy table created using create_standard_zone_hour. Parameters ---------- startdate : Timestamp Start date to include data from (inclusive), specified as a timezone-aware Pandas Timestamp object. E.g. startdate = pd.Timestamp('2012-10-28 00:00:00', tz='America/New_York') enddate : Timestamp End date to include data from (exclusive), specified as a timezone-aware Pandas Timestamp object. e.g. enddate = pd.Timestamp('2012-11-03 00:00:00', tz='America/New_York') trip_type : str Trip type: 'dropoff' or 'pickup'. trip_count_filter : int Minimum number of trips required to load a data point. db_path : str Path to sqlite database containing table. verbose : int Defines verbosity for output statements. Returns ------- df_taxi : dataframe Dataframe of hourly z-scores, indexed by zone and datetime. Notes ----- Sqlite date queries are inclusive for start and end, datetimes in nyctlc database are local (i.e. NY timezone). """ if verbose >= 1: output('Started query.') # define trip type if trip_type not in ['dropoff', 'pickup']: raise ValueError( 'Invalid trip_type argument: {arg}.'.format(arg=trip_type)) # convert datetimes enddate_exclusive = enddate - pd.Timedelta('1 second') startdate_sql = startdate.strftime("%Y-%m-%d %H:%M:%S") enddate_sql = enddate_exclusive.strftime("%Y-%m-%d %H:%M:%S") # load dropoff/pickup data sql = """ SELECT {trip_type}_datetime AS datetimeNY, {trip_type}_location_id AS location_id, z_mean_pace AS zpace, z_trip_count AS ztrips FROM standard_zone{trip_type}_hour_sandy WHERE trip_count > {trip_count_filter} AND {trip_type}_datetime BETWEEN "{startdate_sql}" AND "{enddate_sql}" ;""".format(trip_count_filter=trip_count_filter, startdate_sql=startdate_sql, enddate_sql=enddate_sql, trip_type=trip_type) df_taxi = query(db_path, sql) # add columns df_taxi['abs-zpace'] = abs(df_taxi['zpace']) df_taxi['abs-ztrips'] = abs(df_taxi['ztrips']) # convert datetimes df_taxi['datetimeNY'] = pd.to_datetime(df_taxi['datetimeNY']) df_taxi['datetimeNY'] = [ dt.tz_localize(tz='America/New_York') for dt in df_taxi['datetimeNY'] ] # index and sort df_taxi = df_taxi.set_index(['location_id', 'datetimeNY']) df_taxi = df_taxi.sort_index(level=0) if verbose >= 1: output('[min, max] taxi datetimeNY (hourly): [' + str(min(df_taxi.index.get_level_values(level=1))) + ', ' + str(max(df_taxi.index.get_level_values(level=1))) + '].') output('[min, max] taxi pace and trips mean z-score (hourly): [' + str(np.nanmin(df_taxi['zpace'])) + ', ' + str(np.nanmax(df_taxi['zpace'])) + '], [' + str(np.nanmin(df_taxi['ztrips'])) + ', ' + str(np.nanmax(df_taxi['ztrips'])) + '].') # add drop or pick to column names if trip_type == 'dropoff': val = '-drop' elif trip_type == 'pickup': val = '-pick' col_dict = {} for col in df_taxi.columns.values: col_dict[col] = col + val df_taxi = df_taxi.rename(col_dict, axis='columns') return df_taxi
def import_load(dl_dir, db_path, to_zoneid=False, zones_path=None, overwrite=False, verbose=0): """Loads, cleans, and imports nyiso load data into a sqlite database. Currently only imports palIntegrated files (i.e. integrated real-time load data). Parameters ---------- dl_dir : str Path to the directory containing downloaded zip files. Imports all files in directory. Assumes each zip file is of the following format: 'yearmonth01palIntegrated_csv.zip' (e.g. '20121001palIntegrated_csv.zip'). db_path : str Path to sqlite database. to_zoneid : bool If True, converts zone names to zone ids, based on zones_path csv (zones_path must be defined if True). If False, leaves zones_name column. zones_path : str or None Path to csv mapping zone_id to zone_name. Required if to_zoneid is True. overwrite : bool Defines whether or not to overwrite existing database tables. verbose : int Defines verbosity for output statements. Returns ------- import_num : int Number of files imported into database. Notes ----- """ if to_zoneid: zone_str = 'zone_id' zone_field = 'zone_id INTEGER' else: zone_str = 'zone_name' zone_field = 'zone_name TEXT' # get files pattern = re.compile(r'\d{8}palIntegrated_csv\.zip') files = get_regex_files(dl_dir, pattern=pattern, verbose=verbose) # create load table (if needed; zone_field already includes the column type) create_sql = """ CREATE TABLE IF NOT EXISTS load ( rowid INTEGER PRIMARY KEY, datetimeUTC TEXT, {zone_field}, integrated_load REAL ); """.format(zone_field=zone_field) indexes = ['CREATE INDEX IF NOT EXISTS datetimeUTC_{zone_str} ' 'ON load (datetimeUTC, {zone_str});'.format(zone_str=zone_str)] create_table(db_path, 'load', create_sql, indexes=indexes, overwrite=overwrite, verbose=verbose) # load, clean, and import load data into table import_num = 0 for file in files: if verbose >= 1: output('Started importing \"' + file + '\".') date = pd.Timestamp(file[0:8]).date() last_day = calendar.monthrange(date.year, date.month)[1] start_date = pd.Timestamp(year=date.year, month=date.month, day=1) end_date = pd.Timestamp(year=date.year, month=date.month, day=last_day) dates = pd.date_range(start_date, end_date) for date in dates: date_str = date.strftime('%Y%m%d') # load and clean data for current date df = load_loaddate(date_str, load_type='palIntegrated', dl_dir=dl_dir, verbose=verbose) df = clean_palint(df, to_zoneid=to_zoneid, zones_path=zones_path, verbose=verbose) # write to database df_write = df.reset_index() df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize( None) df_to_table(db_path, df_write, table='load', overwrite=False) del df_write import_num += 1 if verbose >= 1: output('Finished importing \"' + file + '\".') output('Finished importing ' + str(import_num) + ' files from \"{dl_dir}\".'.format(dl_dir=dl_dir)) return import_num
def col_names_dict(year): """Returns a dictionary mapping column names for specified year to expected column names (i.e. those used in trips table). Parameters ---------- year : int Year to define keys in column names dictionary. Returns ------- col_dict : dict Dictionary mapping column names for specified year to expected. Notes ----- """ if year == 2009: col_dict = { 'vendor_name': 'vendor_id', 'Trip_Pickup_DateTime': 'pickup_datetime', 'Trip_Dropoff_DateTime': 'dropoff_datetime', 'Passenger_Count': 'passenger_count', 'Trip_Distance': 'trip_distance', 'Start_Lon': 'pickup_longitude', 'Start_Lat': 'pickup_latitude', 'Rate_Code': 'rate_code_id', 'store_and_forward': 'store_and_fwd_flag', 'End_Lon': 'dropoff_longitude', 'End_Lat': 'dropoff_latitude', 'Payment_Type': 'payment_type', 'Fare_Amt': 'fare_amount', 'surcharge': 'extra', 'mta_tax': 'mta_tax', 'Tip_Amt': 'tip_amount', 'Tolls_Amt': 'tolls_amount', 'Total_Amt': 'total_amount' } elif 2010 <= year <= 2013: col_dict = { 'vendor_id': 'vendor_id', 'pickup_datetime': 'pickup_datetime', 'dropoff_datetime': 'dropoff_datetime', 'passenger_count': 'passenger_count', 'trip_distance': 'trip_distance', 'pickup_longitude': 'pickup_longitude', 'pickup_latitude': 'pickup_latitude', 'rate_code': 'rate_code_id', 'store_and_fwd_flag': 'store_and_fwd_flag', 'dropoff_longitude': 'dropoff_longitude', 'dropoff_latitude': 'dropoff_latitude', 'payment_type': 'payment_type', 'fare_amount': 'fare_amount', 'surcharge': 'extra', 'mta_tax': 'mta_tax', 'tip_amount': 'tip_amount', 'tolls_amount': 'tolls_amount', 'total_amount': 'total_amount' } elif year == 2014: col_dict = { 'vendor_id': 'vendor_id', ' pickup_datetime': 'pickup_datetime', ' dropoff_datetime': 'dropoff_datetime', ' passenger_count': 'passenger_count', ' trip_distance': 'trip_distance', ' pickup_longitude': 'pickup_longitude', ' pickup_latitude': 'pickup_latitude', ' rate_code': 'rate_code_id', ' store_and_fwd_flag': 'store_and_fwd_flag', ' dropoff_longitude': 'dropoff_longitude', ' dropoff_latitude': 'dropoff_latitude', ' payment_type': 'payment_type', ' fare_amount': 'fare_amount', ' surcharge': 'extra', ' mta_tax': 'mta_tax', ' tip_amount': 'tip_amount', ' tolls_amount': 'tolls_amount', ' total_amount': 'total_amount' } elif 2015 <= year <= 2016: col_dict = { 'VendorID': 'vendor_id', 'tpep_pickup_datetime': 'pickup_datetime', 'tpep_dropoff_datetime': 'dropoff_datetime', 'passenger_count': 'passenger_count', 'trip_distance': 'trip_distance', 'pickup_longitude': 'pickup_longitude', 'pickup_latitude': 'pickup_latitude', 'RatecodeID': 'rate_code_id', 'store_and_fwd_flag': 'store_and_fwd_flag', 'dropoff_longitude': 'dropoff_longitude', 'dropoff_latitude': 'dropoff_latitude', 'payment_type': 'payment_type', 'fare_amount': 'fare_amount', 'extra': 'extra', 'mta_tax': 'mta_tax', 'tip_amount': 'tip_amount', 'tolls_amount': 'tolls_amount', 'improvement_surcharge': 'improvement_surcharge', 'total_amount': 'total_amount' } elif year == 2017: col_dict = { 'VendorID': 'vendor_id', 'tpep_pickup_datetime': 'pickup_datetime', 'tpep_dropoff_datetime': 'dropoff_datetime', 'passenger_count': 'passenger_count', 'trip_distance': 'trip_distance', 'RatecodeID': 'rate_code_id', 'store_and_fwd_flag': 'store_and_fwd_flag', 'PULocationID': 'pickup_location_id', 'DOLocationID': 'dropoff_location_id', 'payment_type': 'payment_type', 'fare_amount': 'fare_amount', 'extra': 'extra', 'mta_tax': 'mta_tax', 'tip_amount': 'tip_amount', 'tolls_amount': 'tolls_amount', 'improvement_surcharge': 'improvement_surcharge', 'total_amount': 'total_amount' } else: output('Error : Unexpected year (' + str(year) + ').', 'col_names_dict') raise ValueError('Unexpected year.') return col_dict
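# A minimal usage sketch for col_names_dict: renaming a 2009-style header to
# the trips-table convention.
def _example_col_names_dict():
    df = pd.DataFrame(columns=['vendor_name', 'Trip_Pickup_DateTime'])
    # -> ['vendor_id', 'pickup_datetime']
    return df.rename(columns=col_names_dict(2009)).columns.tolist()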
def create_timeseries(df, zone_col, min_count, write_path=None, verbose=0): """Creates a time series dataframe where each column of df is independently linearly interpolated over the total range of timedeltas of each zone. Only time series with at least min_count data points are included. Assumes the dataframe is indexed by a zone column (zone_col) and a timedelta column (e.g. using index_timedelta). Parameters ---------- df : Dataframe Dataframe to calculate time series from. zone_col : str Name of zone column: 'zone_id' (nyiso zone), 'location_id' (taxi zone), or 'borough' (taxi borough). min_count : int Minimum number of data points needed to convert to a time series. write_path : str or None If str, then write a csv of the time series dataframe to the specified path. Else, do not write. verbose : int Defines verbosity for output statements. Returns ------- df_ts : dataframe Notes ----- """ # loop through zones df_ts = pd.DataFrame() skipped = [] zones = pd.unique(df.index.get_level_values(zone_col)) for zone in zones: df_zone = df.xs(zone, level=0) # loop through columns (i.e. data to convert to time series) y_interps = [] cols = df_zone.columns.values for col in cols: s = df_zone[col].dropna() if s.count() < min_count: skipped.append((zone, col)) else: timedeltas = range( int(s.index.astype('timedelta64[h]').min()), int(s.index.astype('timedelta64[h]').max()) + 1) y_interp = pd.Series(data=np.interp( timedeltas, s.index.astype('timedelta64[h]'), s.values), index=timedeltas, name=col) y_interps.append(y_interp) # add interpolated data to dataframe if y_interps: df_temp = pd.concat(objs=y_interps, axis=1, join='outer') df_temp = df_temp.set_index( pd.to_timedelta(df_temp.index.values, unit='h')) df_temp[zone_col] = zone df_temp.set_index(zone_col, append=True, inplace=True) df_temp.index.names = ['timedelta', zone_col] df_temp = df_temp.reorder_levels([1, 0]) df_ts = df_ts.append(df_temp, sort=False) # save to csv if write_path: df_csv = df_ts.reset_index() df_csv['timedelta'] = df_csv['timedelta'].astype('timedelta64[h]') df_csv.to_csv(write_path, index=False) if verbose >= 1: output('Skipped (zone, column) combos with fewer than {min_count} ' 'data points in the original column data: {skipped}'.format( min_count=min_count, skipped=skipped)) return df_ts
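# A small illustration of the per-column linear interpolation above: values
# known at hours 0 and 2 are filled in at hour 1 by np.interp.
def _example_interp():
    timedeltas = range(0, 3)
    return np.interp(timedeltas, [0, 2], [10.0, 30.0])  # [10., 20., 30.]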
def clean_datetime(df, year, month, verbose=0): """Cleans the datetime columns. Cleaning involves adjusting data type to datetime and removing records outside of expected year and month. Parameters ---------- df : dataframe Dataframe to clean. year : int Year data comes from. month : int Month data comes from. verbose : int Defines verbosity for output statements. Returns ------- df : dataframe Dataframe with cleaned column. nrows_removed : int Number of removed rows. Notes ----- """ col_names = list(pd.Series(df.columns.values)) nrows_removed = 0 if ('pickup_datetime' in col_names) and ('dropoff_datetime' in col_names): # change datetime columns to datetime data type and sort df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime']) df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime']) df.sort_values(['pickup_datetime', 'dropoff_datetime'], inplace=True) if verbose >= 2: output('Finished converting datetime columns to datetime dtype ' 'and sorting by pickup_datetime and dropoff_datetime.') # remove rows outside of expected year-month (based on pickup_datetime) start_datetime = pd.Timestamp(year=year, month=month, day=1) if month < 12: end_datetime = pd.Timestamp(year=year, month=month + 1, day=1) else: end_datetime = pd.Timestamp(year=year + 1, month=1, day=1) correct_month = (df['pickup_datetime'] >= start_datetime) & (df['pickup_datetime'] < end_datetime) if not all(correct_month): nrows = df.shape[0] df = df[correct_month] nrows_removed = nrows - df.shape[0] if verbose >= 2: output('Finished removing records with pickup_datetime outside of ' 'expected year-month date range (' + str(nrows_removed) + ' rows removed).') elif verbose >= 1: output('Unable to clean datetime columns due to missing columns.') return df, nrows_removed