def add_holiday(x):
    """Attach a binary holiday flag ``h0`` to the rows of ``x``.

    An hourly (site, timestamp) lookup table is built for sites 0-15 over
    2015-12-31 .. 2019-01-01 and left-joined onto ``x`` on
    ``['site', 'timestamp']``.  Rows of ``x`` that have no match in the
    lookup table end up with ``NaN`` in ``h0`` (left join).

    Args:
        x: DataFrame with ``site`` and ``timestamp`` columns.

    Returns:
        A new DataFrame: ``x`` with the extra ``h0`` column (1 = holiday).

    NOTE(review): ``locate`` is defined outside this block; it is indexed as
    ``locate[site]['country']`` and must yield one of 'UK', 'US', 'IRL', 'CAN'.
    """
    time_range = pd.date_range(start='2015-12-31', end='2019-01-01', freq='h')
    country_holidays = {
        'UK': holidays.UK(),
        'US': holidays.US(),
        'IRL': holidays.Ireland(),
        'CAN': holidays.Canada(),
    }

    # Collect the per-site tables and concatenate once instead of growing a
    # DataFrame inside the loop (which re-copies all previous rows each time).
    per_site = []
    for site in range(16):
        calendar = country_holidays[locate[site]['country']]
        site_mapping = pd.DataFrame({'site': site, 'timestamp': time_range})
        site_mapping['h0'] = site_mapping['timestamp'].apply(
            lambda ts, calendar=calendar: ts in calendar).astype(int)
        per_site.append(site_mapping)
    holiday_mapping = pd.concat(per_site, axis=0)

    # pd.merge takes the two frames as separate positional arguments; the
    # previous list form raised before any join happened.
    return pd.merge(x, holiday_mapping, on=['site', 'timestamp'], how='left')
def add_holiday(df_weather):
    """Add a 0/1 ``IsHoliday`` column (uint8) to ``df_weather`` in place.

    Each row's ``timestamp`` is checked against the public-holiday calendar
    of the country its ``site_id`` is assigned to below.  Rows whose
    ``site_id`` is not listed keep ``IsHoliday == 0``.

    Args:
        df_weather: DataFrame with ``site_id`` and ``timestamp`` columns.

    Returns:
        The same DataFrame object, with ``IsHoliday`` added or overwritten.
    """
    # (calendar, site ids using that calendar)
    calendars = (
        (holidays.England(), (1, 5)),
        (holidays.Ireland(), (12,)),
        (holidays.Canada(), (7, 11)),
        (holidays.UnitedStates(), (0, 2, 3, 4, 6, 8, 9, 10, 13, 14, 15)),
    )

    # Build the flags as booleans and cast once at the end.  The previous
    # version wrote holiday *names* (strings) into an integer column before
    # collapsing them to 1, which pandas now treats as an incompatible-dtype
    # assignment.
    holiday_flags = np.zeros(len(df_weather), dtype=bool)
    for calendar, site_ids in calendars:
        site_rows = df_weather['site_id'].isin(site_ids).values
        if site_rows.any():
            holiday_flags[site_rows] = [
                ts in calendar
                for ts in df_weather.loc[site_rows, 'timestamp']
            ]

    df_weather['IsHoliday'] = holiday_flags.astype(np.uint8)
    return df_weather
def fit(self, X, y=None):
    """Create the per-country holiday calendars and return ``self``.

    ``X`` and ``y`` are accepted but not read; the method only stores four
    calendar objects on the instance (``CAh``, ``IEh``, ``UKh``, ``USh``).
    """
    self.CAh = holidays.Canada()
    self.IEh = holidays.Ireland()
    self.UKh = holidays.UnitedKingdom()
    self.USh = holidays.UnitedStates()
    return self
weather_df = weather_df.reset_index() weather_df = weather_df.drop(['datetime', 'day', 'week', 'month'], axis=1) return weather_df weather['timestamp'] = weather['timestamp'].astype(str) weather = fill_weather_dataset(weather) weather['timestamp'] = pd.to_datetime(weather['timestamp']) # holiday information import holidays en_holidays = holidays.England() ir_holidays = holidays.Ireland() ca_holidays = holidays.Canada() us_holidays = holidays.UnitedStates() en_idx = weather.query('site_id == 1 or site_id == 5').index ir_idx = weather.query('site_id == 12').index ca_idx = weather.query('site_id == 7 or site_id == 11').index us_idx = weather.query( 'site_id == 0 or site_id == 2 or site_id == 3 or site_id == 4 or site_id == 6 or site_id == 8 or site_id == 9 or site_id == 10 or site_id == 13 or site_id == 14 or site_id == 15' ).index weather['IsHoliday'] = 0 weather.loc[en_idx, 'IsHoliday'] = weather.loc[en_idx, 'timestamp'].apply( lambda x: en_holidays.get(x, default=0)) weather.loc[ir_idx, 'IsHoliday'] = weather.loc[ir_idx, 'timestamp'].apply( lambda x: ir_holidays.get(x, default=0))
import pandas as pd, numpy as np, holidays, seaborn as sns, matplotlib.pyplot as plt, time, xgboost as xgb, json, pickle, sys, os from sqlalchemy import create_engine from datetime import datetime from sklearn.model_selection import train_test_split from sklearn.feature_selection import SelectKBest, f_regression from sklearn import metrics from config import config """This script automates the modelling for the scheduled and actual duration of each bus line. For more detailed information, please refer to pipeline_automation.ipynb and pipeline_testing.ipynb.""" #initialise Irish Holidays holidays_IE = holidays.Ireland() def daystamp_converter(time): """extracts and returns tuple of (weekday,month,hour,holiday) from datetime object.""" global holidays_ie date = datetime.fromtimestamp(time) if date in holidays_IE: holiday = 1 else: holiday = 0 return (date.weekday(), date.month, date.hour, holiday) if __name__ == '__main__': #check provided arguments and matches them with project member # argvs=sys.argv # if(len(argvs)<=1 or argvs[1].lower() not in ["yuqian","callum","jakob"]):
# BANK_HOLIDAYS: Irish holiday calendar pre-populated for 2017 through the current year.
# PLUS_1_DAY_SCANS / PLUS_2_DAYS_SCANS / SAME_DAY_SCANS: lists of scan-code strings.
# _get_week_day parses start_date with the format '%B %d, %Y', adds plus_days, and pushes
# end_date forward a day whenever it falls on a Saturday/Sunday or a bank holiday.
# NOTE(review): the function body continues beyond this excerpt; its return value is not visible here.
from datetime import datetime, timedelta import holidays BANK_HOLIDAYS = holidays.Ireland(years=[i for i in range( 2017, datetime.today().year + 1)]) PLUS_1_DAY_SCANS = [ "U01", "U02", "U03", "U04", "U05", "U06", "U07", "U08", "U09", "U10", "U11", "U12", "U13", "U14", "U15", "U16", "U17", "U18", "U19", "U20", "U21", "U22", "U23", "U24", "U25", "U26", "U27", "U28", "U29", "U30", "U31", "U32", "U33", "U34", "U35", "U36", "U37", "U38", "U39", "U40", "U41", "U42", "U43", "U44", "U45", "U46", "U47", "U48", "DS1", "R10"] PLUS_2_DAYS_SCANS = ["R02", "RTD"] SAME_DAY_SCANS = ["HDN", "ONB", "NEI", "YES"] def _get_week_day(start_date, plus_days): start_date = datetime.strptime(start_date, '%B %d, %Y') end_date = start_date + timedelta(days=plus_days) delivery_date = start_date while delivery_date < end_date: if ( end_date in BANK_HOLIDAYS or datetime.weekday(end_date) in set([5, 6]) ): end_date += timedelta(days=1) delivery_date += timedelta(days=1)
def weather_feature_engineering(df):
    """Add derived weather features to ``df`` in place and return it.

    Which features are produced is driven by ``us.get_feature_settings()``:

    * ``do_humidity``: relative humidity (percent) from ``air_temperature``
      and ``dew_temperature``.
    * ``weather_lag_vars`` / ``weather_lag_values``: per-site shifted copies
      named ``<feature>_lag_<n>``.
    * ``weather_average_vars`` / ``weather_average_window``: per-site rolling
      mean and standard deviation named ``<feature>_mean_window_<w>`` and
      ``<feature>_std_window_<w>``.
    * ``do_holidays``: 0/1 ``is_holiday`` column (uint8) based on the
      country each ``site_id`` is listed under in ``c.SITE_COUNTRIES``.

    Args:
        df: DataFrame with at least ``site_id`` and ``timestamp`` plus the
            weather columns referenced by the feature settings.

    Returns:
        The same DataFrame object with the new columns added.

    NOTE(review): ``us`` and ``c`` are modules defined outside this block;
    the settings keys above are the ones read here.
    """
    fs = us.get_feature_settings()

    # Humidity
    if fs['do_humidity']:
        saturated_vapor_pressure = 6.11 * (
            10.0 ** (7.5 * df['air_temperature'] / (237.3 + df['air_temperature'])))
        actual_vapor_pressure = 6.11 * (
            10.0 ** (7.5 * df['dew_temperature'] / (237.3 + df['dew_temperature'])))
        humidity = (actual_vapor_pressure / saturated_vapor_pressure) * 100
        # np.float was only an alias of the builtin float (float64) and no
        # longer exists in current NumPy; use the explicit dtype.
        df['humidity'] = humidity.astype(np.float64)

    # lags
    feature_cols = fs['weather_lag_vars']
    lag_values = fs['weather_lag_values']
    for site_id in range(c.SITE_ID_RANGE):
        mask = df['site_id'] == site_id
        for feature in feature_cols:
            for lag in lag_values:
                df.loc[mask, f'{feature}_lag_{lag}'] = df.loc[mask, feature].shift(lag)

    # window_average
    feature_cols = fs['weather_average_vars']
    window = fs['weather_average_window']
    df_site = df.groupby('site_id')
    df_rolled = df_site[feature_cols].rolling(window=window, min_periods=0)
    # NOTE(review): reset_index() renumbers rows in group order, and the
    # assignments below align on index labels -- this presumably relies on df
    # having a default integer index already ordered by site_id; confirm.
    df_mean = df_rolled.mean().reset_index().astype(np.float16)
    df_std = df_rolled.std().reset_index().astype(np.float16)
    for feature in feature_cols:
        df[f'{feature}_mean_window_{window}'] = df_mean[feature]
        df[f'{feature}_std_window_{window}'] = df_std[feature]

    # holidays
    if fs['do_holidays']:
        # (key in c.SITE_COUNTRIES, calendar); applied in this order.
        country_calendars = (
            ('England', holidays.England()),
            ('Ireland', holidays.Ireland()),
            ('Canada', holidays.Canada()),
            ('United_States', holidays.UnitedStates()),
        )
        # Work with booleans and cast once: the earlier approach stored
        # holiday names (strings) in an integer column before mapping to 1.
        holiday_flags = np.zeros(len(df), dtype=bool)
        for country, calendar in country_calendars:
            sites = c.SITE_COUNTRIES.get(country)
            site_rows = df['site_id'].isin(sites).values
            if site_rows.any():
                holiday_flags[site_rows] = [
                    ts in calendar for ts in df.loc[site_rows, 'timestamp']
                ]
        df['is_holiday'] = holiday_flags.astype(np.uint8)

    return df
def setUp(self):
    """Store a newly constructed ``holidays.Ireland()`` calendar on ``self.holidays``."""
    irish_calendar = holidays.Ireland()
    self.holidays = irish_calendar