Esempio n. 1
0
def add_holiday(x):
    """Left-join an hourly public-holiday flag ``h0`` onto ``x``.

    For each of the 16 sites an hourly grid from 2015-12-31 to 2019-01-01 is
    built, and every timestamp is flagged 1 when it is contained in the
    holiday calendar of the site's country, else 0. The country is read from
    the ``locate[site]['country']`` mapping defined outside this function and
    must be one of ``'UK'``, ``'US'``, ``'IRL'`` or ``'CAN'``.

    Args:
        x: DataFrame with ``site`` and ``timestamp`` columns.

    Returns:
        ``x`` merged (``how='left'``) with the ``h0`` column on
        ``['site', 'timestamp']``. Rows whose timestamp lies outside the
        hourly grid get NaN in ``h0``.
    """
    time_range = pd.date_range(start='2015-12-31', end='2019-01-01', freq='h')
    country_holidays = {
        'UK': holidays.UK(),
        'US': holidays.US(),
        'IRL': holidays.Ireland(),
        'CAN': holidays.Canada(),
    }

    # Collect the per-site frames and concatenate once instead of growing a
    # DataFrame inside the loop.
    site_frames = []
    for site in range(16):
        calendar = country_holidays[locate[site]['country']]
        site_frame = pd.DataFrame({'site': site, 'timestamp': time_range})
        site_frame['h0'] = site_frame['timestamp'].apply(
            lambda ts: ts in calendar).astype(int)
        site_frames.append(site_frame)
    holiday_mapping = pd.concat(site_frames, axis=0)

    # pd.merge takes the two frames as separate positional arguments.
    return pd.merge(x, holiday_mapping, on=['site', 'timestamp'], how='left')
Esempio n. 2
0
def add_holiday(df_weather):
    """Add a 0/1 ``IsHoliday`` column to ``df_weather`` (modified in place).

    Each row's ``timestamp`` is tested against the public-holiday calendar of
    the country assigned to its ``site_id``:

    * England: sites 1, 5
    * Ireland: site 12
    * Canada: sites 7, 11
    * United States: sites 0, 2, 3, 4, 6, 8, 9, 10, 13, 14, 15

    Rows with any other ``site_id`` keep the value 0.

    Args:
        df_weather: DataFrame with ``site_id`` and ``timestamp`` columns.

    Returns:
        The same DataFrame, with ``IsHoliday`` stored as ``np.uint8``.
    """
    site_calendars = (
        (holidays.England(), [1, 5]),
        (holidays.Ireland(), [12]),
        (holidays.Canada(), [7, 11]),
        (holidays.UnitedStates(), [0, 2, 3, 4, 6, 8, 9, 10, 13, 14, 15]),
    )

    df_weather['IsHoliday'] = 0
    for calendar, site_ids in site_calendars:
        rows = df_weather.query('site_id in @site_ids').index
        # Write 0/1 directly. Storing the holiday *name* in this integer
        # column first (and normalising afterwards) is an incompatible-dtype
        # assignment that newer pandas versions reject.
        df_weather.loc[rows, 'IsHoliday'] = df_weather.loc[rows, 'timestamp'].apply(
            lambda ts: int(ts in calendar))

    df_weather['IsHoliday'] = df_weather['IsHoliday'].astype(np.uint8)

    return df_weather
Esempio n. 3
0
	def fit(self, X, y=None):
		"""Create the holiday calendars and store them on the instance.

		``X`` and ``y`` are not used. Sets ``USh``, ``CAh``, ``UKh`` and
		``IEh`` to freshly constructed calendars and returns ``self``.
		"""
		calendar_factories = (
			('USh', holidays.UnitedStates),
			('CAh', holidays.Canada),
			('UKh', holidays.UnitedKingdom),
			('IEh', holidays.Ireland),
		)
		for attribute, make_calendar in calendar_factories:
			setattr(self, attribute, make_calendar())
		return self
Esempio n. 4
0
    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime', 'day', 'week', 'month'], axis=1)

    return weather_df


# fill_weather_dataset is handed the timestamps as strings; they are parsed
# back to datetimes once it returns.
weather['timestamp'] = weather['timestamp'].astype(str)
weather = fill_weather_dataset(weather)
weather['timestamp'] = pd.to_datetime(weather['timestamp'])

# holiday information

import holidays

# One public-holiday calendar per country.
en_holidays = holidays.England()
ir_holidays = holidays.Ireland()
ca_holidays = holidays.Canada()
us_holidays = holidays.UnitedStates()

# Row labels of the weather rows belonging to each country's sites.
en_idx = weather.query('site_id == 1 or site_id == 5').index
ir_idx = weather.query('site_id == 12').index
ca_idx = weather.query('site_id == 7 or site_id == 11').index
us_idx = weather.query(
    'site_id == 0 or site_id == 2 or site_id == 3 or site_id == 4 or site_id == 6 or site_id == 8 or site_id == 9 or site_id == 10 or site_id == 13 or site_id == 14 or site_id == 15'
).index

# Start from 0 everywhere, then store what each calendar's .get() returns for
# the row's timestamp (0 when the timestamp is not in the calendar).
# NOTE(review): only the England and Ireland rows are filled in this excerpt;
# ca_idx / us_idx are presumably used further down -- confirm.
weather['IsHoliday'] = 0
weather.loc[en_idx, 'IsHoliday'] = weather.loc[en_idx, 'timestamp'].apply(
    lambda x: en_holidays.get(x, default=0))
weather.loc[ir_idx, 'IsHoliday'] = weather.loc[ir_idx, 'timestamp'].apply(
    lambda x: ir_holidays.get(x, default=0))
Esempio n. 5
0
import pandas as pd, numpy as np, holidays, seaborn as sns, matplotlib.pyplot as plt, time, xgboost as xgb, json, pickle, sys, os
from sqlalchemy import create_engine
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import metrics

from config import config
"""This script automates the modelling for the scheduled and actual duration of each bus line.
For more detailed information, please refer to pipeline_automation.ipynb and pipeline_testing.ipynb."""

# Irish holiday calendar, created once at import time; daystamp_converter
# checks dates against it.
holidays_IE = holidays.Ireland()


def daystamp_converter(time):
    """Return ``(weekday, month, hour, holiday)`` for a POSIX timestamp.

    Args:
        time: Seconds since the epoch, as accepted by
            ``datetime.fromtimestamp`` (converted using the local timezone).

    Returns:
        A tuple ``(weekday, month, hour, holiday)`` where ``weekday`` is
        0 (Monday) to 6 (Sunday) and ``holiday`` is 1 when the date is in the
        module-level ``holidays_IE`` calendar, otherwise 0.
    """
    # No ``global`` statement is needed: holidays_IE is only read here.
    date = datetime.fromtimestamp(time)
    holiday = int(date in holidays_IE)
    return (date.weekday(), date.month, date.hour, holiday)


if __name__ == '__main__':
    #check provided arguments and matches them with project member
    # argvs=sys.argv
    # if(len(argvs)<=1 or argvs[1].lower() not in ["yuqian","callum","jakob"]):
Esempio n. 6
0
from datetime import datetime, timedelta

import holidays

# Irish holiday calendar covering every year from 2017 through the current
# year (evaluated once, when the module is imported).
BANK_HOLIDAYS = holidays.Ireland(
    years=list(range(2017, datetime.today().year + 1)))

# Scan codes grouped by delivery offset (presumably the number of days added
# to the scan date -- confirm against the callers).

# "U01" .. "U48" followed by the two non-numbered codes.
PLUS_1_DAY_SCANS = (
    ["U{:02d}".format(number) for number in range(1, 49)]
    + ["DS1", "R10"]
)

PLUS_2_DAYS_SCANS = [
    "R02",
    "RTD",
]

SAME_DAY_SCANS = [
    "HDN",
    "ONB",
    "NEI",
    "YES",
]


def _get_week_day(start_date, plus_days):
    start_date = datetime.strptime(start_date, '%B %d, %Y')
    end_date = start_date + timedelta(days=plus_days)
    delivery_date = start_date
    while delivery_date < end_date:
        if (
            end_date in BANK_HOLIDAYS or
            datetime.weekday(end_date) in set([5, 6])
        ):
            end_date += timedelta(days=1)
        delivery_date += timedelta(days=1)
Esempio n. 7
0
def weather_feature_engineering(df):
    """Add humidity, lag, rolling-window and holiday features to ``df``.

    The features are driven by the dict returned by
    ``us.get_feature_settings()``:

    * ``do_humidity``: add ``humidity``, the ratio (in percent) of the vapor
      pressure computed from ``dew_temperature`` to the one computed from
      ``air_temperature``.
    * ``weather_lag_vars`` / ``weather_lag_values``: for every site id in
      ``range(c.SITE_ID_RANGE)`` add ``<var>_lag_<n>`` columns holding the
      variable shifted by ``n`` rows within that site.
    * ``weather_average_vars`` / ``weather_average_window``: add per-site
      rolling ``<var>_mean_window_<w>`` and ``<var>_std_window_<w>`` columns
      (stored as float16).
    * ``do_holidays``: add a 0/1 ``is_holiday`` column (uint8) using the
      holiday calendar of the country each site is assigned to in
      ``c.SITE_COUNTRIES``.

    Args:
        df: Weather DataFrame with ``site_id`` and ``timestamp`` columns plus
            the weather variables named in the settings. Row labels are
            expected to be unique, because results are aligned back on the
            index.

    Returns:
        The same DataFrame, modified in place.
    """
    fs = us.get_feature_settings()

    # Humidity

    if fs['do_humidity']:
        air_temperature = df['air_temperature']
        dew_temperature = df['dew_temperature']

        saturated_vapor_pressure = 6.11 * (
            10.0 ** (7.5 * air_temperature / (237.3 + air_temperature)))
        actual_vapor_pressure = 6.11 * (
            10.0 ** (7.5 * dew_temperature / (237.3 + dew_temperature)))
        df['humidity'] = (actual_vapor_pressure /
                          saturated_vapor_pressure) * 100
        # ``np.float`` was only an alias of the builtin ``float`` and has been
        # removed from NumPy; use the builtin directly.
        df['humidity'] = df['humidity'].astype(float)

    # lags

    feature_cols = fs['weather_lag_vars']
    lag_values = fs['weather_lag_values']

    for site_id in range(c.SITE_ID_RANGE):
        mask = df['site_id'] == site_id

        for feature in feature_cols:
            for lag in lag_values:
                # Shift within the site only, so values never leak across
                # site boundaries.
                df.loc[mask, f'{feature}_lag_{lag}'] = (
                    df.loc[mask, feature].shift(lag))

    # window_average

    feature_cols = fs['weather_average_vars']
    window = fs['weather_average_window']

    df_rolled = df.groupby('site_id')[feature_cols].rolling(
        window=window, min_periods=0)
    # Drop only the group level so the statistics keep the original row
    # labels; the column assignments below align on the index, so resetting
    # to a positional index would attach values to the wrong rows whenever
    # ``df`` is not a 0..n-1 frame already sorted by site.
    df_mean = df_rolled.mean().reset_index(level=0, drop=True).astype(np.float16)
    df_std = df_rolled.std().reset_index(level=0, drop=True).astype(np.float16)

    for feature in feature_cols:
        df[f'{feature}_mean_window_{window}'] = df_mean[feature]
        df[f'{feature}_std_window_{window}'] = df_std[feature]

    # holidays

    if fs['do_holidays']:
        site_calendars = (
            (holidays.England(), c.SITE_COUNTRIES.get('England')),
            (holidays.Ireland(), c.SITE_COUNTRIES.get('Ireland')),
            (holidays.Canada(), c.SITE_COUNTRIES.get('Canada')),
            (holidays.UnitedStates(), c.SITE_COUNTRIES.get('United_States')),
        )

        df['is_holiday'] = 0
        for calendar, sites in site_calendars:
            rows = df.query('site_id in @sites').index
            # Write 0/1 directly rather than the holiday name, which would be
            # an incompatible-dtype assignment into this integer column.
            df.loc[rows, 'is_holiday'] = df.loc[rows, 'timestamp'].apply(
                lambda ts: int(ts in calendar))

        df['is_holiday'] = df['is_holiday'].astype(np.uint8)

    return df
Esempio n. 8
0
 def setUp(self):
     """Store a new ``holidays.Ireland()`` calendar on ``self.holidays``."""
     self.holidays = holidays.Ireland()