Ejemplo n.º 1
def add_time_cols(df: pd.DataFrame):
    """Add ``time_of_day`` and ``weekend_or_holiday`` columns to *df* in place.

    ``time_of_day`` encodes the index time as ``HHMM`` (09:30 -> 930).
    ``weekend_or_holiday`` is True for Saturdays, Sundays and every date
    returned by ``calendar().holidays`` (``calendar`` is presumably
    ``USFederalHolidayCalendar`` -- confirm against the file's imports).

    Args:
        df: Frame indexed by a ``DatetimeIndex``; it is mutated.

    Returns:
        The same ``df`` object with the two columns added.
    """
    idx = df.index
    df['time_of_day'] = (idx.hour.values * 100) + idx.minute.values

    # Holidays are midnight timestamps, so compare whole calendar days.
    # A ``datetime.date in DatetimeIndex`` test does not match on current
    # pandas, hence the vectorised ``isin`` on normalized timestamps.
    days = (idx.tz_localize(None) if idx.tz is not None else idx).normalize()
    holidays = (calendar().holidays(start=days.min(), end=days.max())
                if len(days) else [])
    df['weekend_or_holiday'] = (idx.dayofweek >= 5) | days.isin(holidays)
    return df
def create_features(df):
    """Create calendar features from the ``date`` column of *df*.

    Works on a copy, so the caller's frame is not modified. Adds the columns
    ``Date`` (midnight of each day), ``year``, ``month``, ``day`` (day of the
    year), ``hour``, ``weekday`` (English day name), ``season`` (result of
    ``season_calc`` applied to the month) and ``holiday`` (1 when ``Date`` is
    returned by ``calendar().holidays``, else 0).

    Args:
        df: Frame with a datetime-typed ``date`` column.

    Returns:
        A new frame with the feature columns appended.
    """
    df = df.copy()
    weekdays = {
        0: 'Monday',
        1: 'Tuesday',
        2: 'Wednesday',
        3: 'Thursday',
        4: 'Friday',
        5: 'Saturday',
        6: 'Sunday'
    }
    mapped = {True: 1, False: 0}
    df['Date'] = pd.to_datetime(df.date.dt.date)
    df['year'] = df.date.dt.year
    df['month'] = df.date.dt.month
    df['day'] = df.date.dt.dayofyear
    df['hour'] = df.date.dt.hour
    df['weekday'] = df.date.dt.weekday.map(weekdays)
    df['season'] = df.date.dt.month.apply(season_calc)
    cal = calendar()
    holidays = cal.holidays(start=df['Date'].min(), end=df['Date'].max())
    df['holiday'] = df['Date'].isin(holidays)
    df.holiday = df.holiday.map(mapped)

    return df
Ejemplo n.º 3
def set_holiday(data):
    """Add weekend / holiday indicator columns based on ``pickup_datetime``.

    Columns written to *data* (in place), all as 0/1 integers:
      * ``isWeekend``   -- pickup falls on Saturday or Sunday
      * ``isUSHoliday`` -- pickup date is returned by ``calendar().holidays``
      * ``isHoliday``   -- either of the above

    Args:
        data: Frame with a datetime-typed ``pickup_datetime`` column.

    Returns:
        The same ``data`` object, for convenience.
    """
    pickup = data['pickup_datetime']
    # Holidays are midnight timestamps; drop the time of day before matching.
    pickup_day = pickup.dt.normalize()
    holidays = calendar().holidays(start=pickup_day.min(),
                                   end=pickup_day.max())
    # dayofweek: Monday=0 .. Sunday=6, so the weekend is 5 and 6.
    data['isWeekend'] = (pickup.dt.dayofweek >= 5).astype(int)
    data['isUSHoliday'] = pickup_day.isin(holidays).astype(int)
    data['isHoliday'] = data['isWeekend'] | data['isUSHoliday']
    return data
Ejemplo n.º 4
def getHolidays(df):
    """Return a boolean Series marking rows picked up on a calendar holiday.

    Args:
        df: Frame with a datetime-typed ``tpep_pickup_datetime`` column.

    Returns:
        Boolean Series aligned with ``df``; True where the pickup *date*
        (time of day ignored) is returned by ``calendar().holidays``.
    """
    # Holidays are midnight timestamps, so compare normalized days rather
    # than the raw pickup instants.
    pickup_day = df['tpep_pickup_datetime'].dt.normalize()
    holidays = calendar().holidays(start=pickup_day.min(),
                                   end=pickup_day.max())
    return pickup_day.isin(holidays)
Ejemplo n.º 5
# NOTE(review): this excerpt has lost lines -- the docstring delimiters and the
# start of the call that ends on the last line are missing -- so the function
# does not parse as shown. Comments below describe only the visible statements.
def trips_data(filepath):
        Reads trips data and create: date, year, month, day, hour, week, holiday, weekend, duration_m 


    # Columns at positions 1 and 2 are parsed as datetimes.
    trips = pd.read_csv(filepath, parse_dates=[1,2])

    # standardize column names
    trips.columns = [col.replace(' ', '_', -1).lower() for col in trips.columns.values]

    # feature engineer date variables
    cal = calendar()
    holidays = cal.holidays(min(trips['start_date']), max(trips['start_date']))
    trips['date'] = trips['start_date'].dt.date
    trips['year'] = trips['start_date'].dt.year.astype(int)
    trips['month'] = trips['start_date'].dt.month.astype(int)
    trips['day'] = trips['start_date'].dt.dayofweek.astype(int)
    trips['hour'] = trips['start_date'].dt.hour.astype(int)
    # NOTE(review): `Series.dt.week` is no longer available in pandas 2.x;
    # `dt.isocalendar().week` is the documented replacement -- confirm target version.
    trips['week'] = trips['start_date'].dt.week.astype(int)
    # NOTE(review): a unit-less 'datetime64' cast is rejected by recent pandas;
    # 'datetime64[ns]' is presumably what is meant -- confirm.
    trips['holiday'] = trips['date'].astype('datetime64').isin(holidays)
    # NOTE(review): 'day' holds dt.dayofweek (Monday=0 .. Sunday=6), so
    # [6, 7] matches Sunday only -- confirm whether [5, 6] was intended.
    trips['weekend'] = trips['day'].isin([6, 7])
    # 'duration' is divided by 60; presumably seconds -> minutes (see the _m suffix).
    trips['duration_m'] = trips['duration']/60
    # NOTE(review): the opening of this call (method name and mapping) is missing.
        }, axis=1, inplace=True)

Ejemplo n.º 6
    def localize_df(self, df, device):
        """
        Data from the VOLTTRON historian will be in UTC timezone.
        Regressions typically are meaningful for localtime as TCC
        agents utilize local time for predictions and control.

        :param df: frame whose index is reset into columns; a tz-aware
            ``Date`` column is expected afterwards
        :param device: label used only in the debug CSV file name
        :return: the localized frame, restricted to business days when
            ``self.exclude_weekends_holidays`` is set
        """
        df = df.reset_index()
        try:
            # Convert UTC time to local time in configuration file.
            df['Date'] = df['Date'].dt.tz_convert(self.local_tz)
        except Exception as e:
            # Best effort: keep going with the unconverted timestamps.
            _log.error('Failed to convert Date column to localtime - {}'.format(e))
        if self.debug:
            filename = '{}/{}-{} - {}.csv'.format(WORKING_DIR, self.start, self.end, device)
            try:
                with open(filename, 'w+') as outfile:
                    df.to_csv(outfile, mode='a', index=True)
                    _log.debug('*** Finished outputting data ***')
            except Exception as e:
                _log.error('File output failed, check whether the dataframe is empty - {}'.format(e))

        # Weekends and holidays will only be present if
        # one_shot is true.  For scheduled regression those
        # days are excluded from query to historian.
        if self.exclude_weekends_holidays:
            business_day = CustomBusinessDay(calendar=calendar())
            # `onOffset` was renamed `is_on_offset` in newer pandas; prefer
            # the new name and fall back for older installations.
            on_business_day = getattr(business_day, 'is_on_offset', None)
            if on_business_day is None:
                on_business_day = business_day.onOffset
            # True on business days, so weekends and holidays are dropped.
            match = df["Date"].map(on_business_day)
            df = df[match]
        return df
Ejemplo n.º 7
def expand_date(timeseries):
    """
    Expand a pandas datetime series returning a dataframe with these columns:
    - hour : 0 - 23
    - year:
    - month: 1 - 12
    - day : day of the month
    - weekday : 0 Monday - 6 Sunday
    - holiday : 0 - 1 holiday
    - workingday : 0 weekend or holiday - 1 workingday

    The result shares the index of *timeseries*. An ``AssertionError`` is
    raised when the input is not a tz-naive datetime64 ``pd.Series``.
    """
    from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

    assert isinstance(timeseries, pd.Series), 'input must be pandas series'
    # Accept any datetime64 resolution, not only nanoseconds.
    assert pd.api.types.is_datetime64_dtype(
        timeseries.dtype), 'input must be pandas datetime'

    # Midnight of each timestamp: this is what holiday dates compare against.
    day = timeseries.dt.normalize()

    df = pd.DataFrame(index=timeseries.index)
    df['hour'] = timeseries.dt.hour
    df['year'] = day.dt.year
    df['month'] = day.dt.month
    df['day'] = day.dt.day
    df['weekday'] = day.dt.weekday

    holidays = calendar().holidays(start=day.min(), end=day.max())
    df['holiday'] = day.isin(holidays).astype(int)
    df['workingday'] = ((df['weekday'] < 5) & (df['holiday'] == 0)).astype(int)

    return df
Ejemplo n.º 8
def build_features(time, temp, conditions):
    """Build a 60-row feature frame, one row per minute following *time*.

    need ['TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday', 'day_of_year',
    'total_days', 'Holiday']

    Args:
        time: ``pd.Timestamp`` the forecast starts from.
        temp: Temperature; stored multiplied by 10 in ``TMAX``.
        conditions: Free-text weather description searched for
            precipitation keywords.

    Returns:
        DataFrame with the eight columns above. ``hour_minute`` runs over
        the 60 minutes of the day after *time*; every other column is
        constant across rows.
    """
    feature_df = pd.DataFrame(columns=[
        'TMAX', 'some_ppt', 'ppt', 'hour_minute', 'weekday', 'day_of_year',
        'total_days', 'Holiday'
    ])
    minute_of_day = time.hour * 60 + time.minute
    feature_df['hour_minute'] = list(
        range(minute_of_day + 1, minute_of_day + 61))
    feature_df['TMAX'] = temp * 10
    # `Timestamp.weekday` is a method; `dayofweek` is the attribute
    # (Monday=0). 1 marks Monday-Friday.
    feature_df['weekday'] = int(time.dayofweek < 5)
    feature_df['day_of_year'] = time.dayofyear
    feature_df['total_days'] = (time - pd.to_datetime('July 1, 2013')).days

    # `re.search` rather than `re.match`: the keywords may appear anywhere
    # in the description ("light rain"); with `match` the light/chance
    # refinement below could never be reached.
    if re.search(r'thunderstorm|blizzard', conditions):
        some_ppt, ppt = 0, 1
    elif re.search(r'drizzle|showers', conditions):
        some_ppt, ppt = 1, 0
    elif re.search(r'rain|snow|hail', conditions):
        if re.search(r'light|chance', conditions):
            some_ppt, ppt = 1, 0
        else:
            some_ppt, ppt = 0, 1
    else:
        some_ppt, ppt = 0, 0
    feature_df['some_ppt'] = some_ppt
    feature_df['ppt'] = ppt

    # Holidays are midnight timestamps, so look up the calendar day of
    # *time*, not the instant itself.
    day = time.normalize()
    holidays = calendar().holidays(start=day, end=day)
    feature_df['Holiday'] = len(holidays) > 0
    return feature_df
Ejemplo n.º 9
def create_time_feature(df):
    """Derive calendar columns from ``created_at_datetime`` (in place).

    Adds created_at_year, created_at_month, created_at_day, created_at_date,
    created_at_dayOfWeek, created_at_time, created_at_hour,
    created_at_minute, created_at_second, created_at_isWeekend and
    created_at_isHoliday.

    Args:
        df: Frame with a datetime-typed ``created_at_datetime`` column.

    Returns:
        The same ``df`` object with the columns added.
    """
    stamp = df['created_at_datetime']
    df['created_at_year'] = stamp.dt.year
    df['created_at_month'] = stamp.dt.month
    df['created_at_day'] = stamp.dt.day
    df['created_at_date'] = stamp.dt.date
    df['created_at_dayOfWeek'] = stamp.dt.dayofweek
    df['created_at_time'] = stamp.dt.time
    df['created_at_hour'] = stamp.dt.hour
    df['created_at_minute'] = stamp.dt.minute
    df['created_at_second'] = stamp.dt.second

    # Two masked writes (rather than one boolean cast) leave rows with a
    # missing timestamp as NaN instead of forcing them to 0.
    df.loc[df['created_at_dayOfWeek'].isin([5, 6]), 'created_at_isWeekend'] = 1
    df.loc[df['created_at_dayOfWeek'].isin([0, 1, 2, 3, 4]),
           'created_at_isWeekend'] = 0

    created_day = stamp.dt.normalize()
    holidays = calendar().holidays(start=created_day.min(),
                                   end=created_day.max())
    df['created_at_isHoliday'] = np.where(created_day.isin(holidays), 1, 0)
    return df
Ejemplo n.º 10
# NOTE(review): several lines of this function are missing from the excerpt
# (the openings of the rename mappings, the statements around `Prophet(...)`,
# the `predict` arguments and what looks like an `else:` before the second
# prediction), so it does not parse as shown. Comments describe only what is
# visible.
def prophetModelandPrediction(train, demandForcastingData_train,
                              demandForcastingData_test, holiday_df):
    cal = calendar()
    #train_holidays = cal.holidays(start=demandForcastingData_train.index.min(),end=demandForcastingData_train.index.max())
    #test_holidays = cal.holidays(start=demandForcastingData_test.index.min(),end=demandForcastingData_test.index.max())
    # Prophet is given `holiday_df` as its holidays table; its 'ds' column is
    # coerced to datetimes first.
    holiday_df['ds'] = pd.to_datetime(holiday_df['ds'])
        'ArrivalDate': 'ds',
        'Count': 'y'
    model = Prophet(holidays=holiday_df)
        'ArrivalDate': 'ds',
        'Count': 'y'
    #demandForcastingData_test_fcst = model.predict(df=demandForcastingData_train.reset_index().rename(columns={'ArrivalDate':'ds'}))
    #demandForcastingData_test['Count_Prediction4'] = demandForcastingData_test_fcst.yhat.values
    # When `train` is falsy the test frame is returned with the forecast in
    # 'Count_Prediction4'.
    if not train:
        demandForcastingData_test_fcst = model.predict(
                columns={'ArrivalDate': 'ds'}))
            'Count_Prediction4'] = demandForcastingData_test_fcst.yhat.values
        return demandForcastingData_test
        # NOTE(review): presumably the `else:` branch (forecast on the
        # training frame) starts here -- confirm against the original file.
        demandForcastingData_test_fcst = model.predict(
                columns={'ArrivalDate': 'ds'}))
            'Count_Prediction4'] = demandForcastingData_test_fcst.yhat.values
        return demandForcastingData_train
Ejemplo n.º 11
def generate_date_features(date_index):
    """Build 0/1 indicator columns describing each date in *date_index*.

    Columns, in order: weekday (``mon`` .. ``sun``), month (``jan`` ..
    ``dec``), quarter (``Q1`` .. ``Q4``), year (``y14`` .. ``y18`` for
    2014-2018), ISO week (``w1`` .. ``w53``) and ``holiday`` (date returned
    by ``calendar().holidays``).

    Args:
        date_index: Datetime-like index; it becomes the index of the result.

    Returns:
        DataFrame of integer indicators indexed by *date_index*.
    """
    dr = pd.DatetimeIndex(pd.to_datetime(date_index))
    columns = {}

    days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
    for i, name in enumerate(days):
        columns[name] = (dr.weekday == i).astype(int)

    months = [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
        'nov', 'dec'
    ]
    for i, name in enumerate(months):
        columns[name] = (dr.month == i + 1).astype(int)

    quarter = ['Q1', 'Q2', 'Q3', 'Q4']
    for i, name in enumerate(quarter):
        # Quarters are numbered 1-4, so the offset is needed just like for
        # months; without it Q1 is never set and Q4 flags the third quarter.
        columns[name] = (dr.quarter == i + 1).astype(int)

    years = ['y14', 'y15', 'y16', 'y17', 'y18']
    for i, name in enumerate(years):
        columns[name] = (dr.year == i + 2014).astype(int)

    iso_week = dr.isocalendar().week.astype(int).to_numpy()
    for week_no in range(1, 54):
        columns['w{}'.format(week_no)] = (iso_week == week_no).astype(int)

    # TODO: fix this -- helper for a Christmas/New Year window indicator;
    # it is not wired into the output yet.
    def is_xmas_new_year(row):
        ret = ((dt.datetime(row.year, 12, 25) < row
                and row < dt.datetime(row.year + 1, 1, 5))
               or (dt.datetime(row.year - 1, 12, 25) < row
                   and row < dt.datetime(row.year, 1, 5)))
        return int(ret)

    # Holidays are midnight timestamps; compare whole days.
    day = dr.normalize()
    holidays = calendar().holidays(start=day.min(), end=day.max())
    columns["holiday"] = day.isin(holidays).astype(int)

    return pd.DataFrame(columns, index=date_index)
Ejemplo n.º 12
# NOTE(review): the arguments of the `pd.DataFrame(` call are missing from this
# excerpt, so the function does not parse as shown and the column layout of
# the result cannot be stated from here.
def get_prediction_dataframe(series):
    # Calendar components read from the series' datetime index.
    hour_of_day = series.index.hour
    month_of_year = series.index.month
    day_of_week = series.index.dayofweek
    year_idx = series.index.year
    target = series.values
    cal = calendar()
    holidays = cal.holidays(start=series.index.min(), end=series.index.max())
    df = pd.DataFrame(
    # The first four columns of the frame are cast to the "category" dtype.
    convert_type = {x: "category" for x in df.columns.values[:4]}
    df = df.astype(convert_type)
    return df
Ejemplo n.º 14
 def temporal_features(self, df):
     """Add calendar features for ``start_date`` / ``end_date`` (in place).

     Adds year, month, weekday and hour for both timestamps, plus for the
     start timestamp: ``is_start_holiday``, ``is_start_working_day``,
     ``is_start_weekend`` (all 0/1) and ``start_year_part`` (result of
     ``self.year_part(start_year, start_month)``).

     :param df: frame with datetime-typed ``start_date`` and ``end_date``
     :return: the same ``df`` object with the columns added
     """
     df['start_year'] = df['start_date'].dt.year
     df['start_month'] = df['start_date'].dt.month
     df['start_weekday'] = df['start_date'].dt.weekday
     df['start_hour'] = df['start_date'].dt.hour
     df['end_year'] = df['end_date'].dt.year
     df['end_month'] = df['end_date'].dt.month
     df['end_weekday'] = df['end_date'].dt.weekday
     df['end_hour'] = df['end_date'].dt.hour
     cal = calendar()
     # Holidays are midnight timestamps; drop the time of day before matching.
     start_day = df['start_date'].dt.normalize()
     holidays = cal.holidays(start=start_day.min(),
                             end=start_day.max())
     df['is_start_holiday'] = np.where(start_day.isin(holidays), 1,
                                       0)
     df['is_start_working_day'] = np.where(
         (df['start_date'].dt.weekday != 5) &
         (df['start_date'].dt.weekday != 6) & (df['is_start_holiday'] == 0),
         1, 0)
     df['is_start_weekend'] = np.where((df['start_date'].dt.weekday == 5) |
                                       (df['start_date'].dt.weekday == 6),
                                       1, 0)
     df['start_year_part'] = df.apply(
         lambda x: self.year_part(x['start_year'], x['start_month']),
         axis=1)
     return df
Ejemplo n.º 15
# NOTE(review): the excerpt ends in the middle of the final assignment, so the
# expression stored in df['holiday'] and the return value are not visible.
def holiday_indicator(df):
    """Appends holiday (US Federal) indicator column"""
    # Holiday dates are requested for the span covered by the `ts` column.
    min_date = df.ts.min()
    max_date = df.ts.max()
    cal = calendar()
    holidays = cal.holidays(start=min_date, end=max_date)
    df['holiday'] = (
Ejemplo n.º 16
# NOTE(review): the trailing arguments of the `pd.to_datetime(` and
# `cal.holidays(` calls are missing from this excerpt, so the function does not
# parse as shown.
def cross_holidays(joint_df):
    # 'FlightDate' is converted to datetimes and overwritten in place.
    joint_df['FlightDate'] = pd.to_datetime(joint_df['FlightDate'],
    cal = calendar()
    holidays = cal.holidays(start=joint_df['FlightDate'].min(),
    # Boolean column: True where 'FlightDate' equals one of the holiday timestamps.
    joint_df["is_holiday"] = joint_df["FlightDate"].isin(holidays)
    return joint_df
 def process_date(X):
     """Return a 2-D array of date features built from *X*.

     *X* must provide ``Year``, ``Month``, ``Day`` and ``Hour`` columns. Two
     columns are written back onto *X*: ``weekday`` (Monday=0) and
     ``IsHoliday`` (1 when the date is returned by ``calendar().holidays``).
     The result stacks Year, Month, Day, Hour, weekday and IsHoliday
     column-wise.
     """
     stamps = pd.to_datetime(X[['Year', 'Month', 'Day']])
     federal_days = calendar().holidays(start=stamps.min(), end=stamps.max())

     X['weekday'] = stamps.dt.weekday
     X['IsHoliday'] = stamps.apply(
         lambda stamp: 1 if stamp in federal_days else 0)

     feature_names = ('Year', 'Month', 'Day', 'Hour', 'weekday', 'IsHoliday')
     return np.c_[tuple(X[name] for name in feature_names)]
Ejemplo n.º 18
# NOTE(review): the column list of the "Rearrange columns" statement is missing
# from this excerpt, so the function does not parse as shown.
def prepare_data(data_path):
    """Returns dataframe with features."""

    # Get data
    df = pd.read_csv(data_path)

    # Remove NaNs
    df = df.dropna()

    # Convert date to datetime
    df['date'] = pd.to_datetime(df.date)

    # Create an age variable (the row label cast to int)
    df['age'] = df.index.astype('int')

    # Create a day of week field
    df['day'] = df.date.dt.dayofweek

    # Create a month of year field
    df['month'] = df.date.dt.month

    # Create a boolean for US federal holidays
    # (stored as 0/1; only rows whose 'date' equals a holiday timestamp match)
    holidays = calendar().holidays(start=df.date.min(), end=df.date.max())
    df['holiday'] = df['date'].isin(holidays).apply(int)

    # Rearrange columns
    df = df[

    # Create monthly dummies
    tmp = pd.get_dummies(df.month)
    tmp.columns = ['month' + str(value) for value in tmp.columns]
    df = pd.concat([df, tmp], axis=1)

    # Create daily dummies
    tmp = pd.get_dummies(df.day)
    tmp.columns = ['day' + str(value) for value in tmp.columns]
    df = pd.concat([df, tmp], axis=1)

    # Reset index
    df = df.reset_index(drop=True)

    # Log transform count data
    df['count'] = np.log1p(df['count'])

    # Drop unnecessary columns
    # (the raw month/day fields are replaced by their dummies; 'age' is dropped too)
    df = df.drop(['month', 'day', 'age'], axis=1)
    df = df.dropna()

    return df
Ejemplo n.º 19
    def is_holiday(self):
        """Return 1 when ``self.usage_date`` is a calendar holiday, else 0.

        Only holidays between 2015-01-01 and 2020-12-31 are considered.
        """
        known_holidays = calendar().holidays(
            start=dt.date(2015, 1, 1),
            end=dt.date(2020, 12, 31),
        )
        return 1 if np.datetime64(self.usage_date) in known_holidays else 0
Ejemplo n.º 20
def is_holiday(date):
    """Return True when *date* falls on a holiday of ``calendar``.

    Args:
        date: ``datetime.date`` or ``datetime.datetime``; any time-of-day
            component is ignored.

    Returns:
        bool
    """
    # Query exactly the calendar day in question. Holidays are midnight
    # timestamps, so a membership test with a plain ``date`` or a datetime
    # carrying a time of day would not match.
    day = datetime.datetime(date.year, date.month, date.day)
    return len(calendar().holidays(start=day, end=day)) > 0
Ejemplo n.º 21
def add_holidays(train_df, test_df):
    """Add a boolean ``Holiday`` column to both frames (in place).

    The holiday range spans the earliest to the latest ``pickup_datetime``
    across the two frames; a row is a holiday when its pickup *date* is
    returned by ``calendar().holidays``.

    Args:
        train_df: Frame with a datetime-typed ``pickup_datetime`` column.
        test_df: Frame with a datetime-typed ``pickup_datetime`` column.

    Returns:
        ``(train_df, test_df)`` -- the same objects that were passed in.
    """
    # Holidays are midnight timestamps; drop the time of day before matching.
    train_day = train_df['pickup_datetime'].dt.normalize()
    test_day = test_df['pickup_datetime'].dt.normalize()

    # The concatenation is a Series, so min/max are taken on it directly.
    all_days = pd.concat([train_day, test_day])
    holidays = calendar().holidays(start=all_days.min(), end=all_days.max())

    train_df['Holiday'] = train_day.isin(holidays)
    test_df['Holiday'] = test_day.isin(holidays)
    return train_df, test_df
Ejemplo n.º 22
def clean_data(df_data, features=None):
    """
    Clean weather data and create features.

    The input is resampled to daily means, calendar and holiday columns are
    added, 7-day rolling means and 3/7-day look-ahead shifts are computed,
    and missing values are filled with 0.

    INPUT: dataframe, list
        df_data  -- frame with a ``time`` column (epoch seconds) and the
                    numeric weather columns referenced below
        features -- optional list of column names to keep; when None, only
                    the precipitation-intensity columns are dropped
    OUTPUT: dataframe
    """
    df = df_data.copy()
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df.set_index('time', inplace=True)
    df = df.resample('1D').mean()

    # feature creation
    df['dayofweek'] = df.index.weekday
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar().week.astype(int)

    # mark holidays
    cal = calendar()
    holidays = cal.holidays(start=df.index.min(), end=df.index.max())
    df['holiday'] = 0
    df.loc[df.index.isin(holidays), 'holiday'] = 1

    # rolling means
    c = [
        'apparenttemperaturemax', 'apparenttemperaturemin', 'temperaturemax',
        'temperaturemin'
    ]
    d = ['7']

    for col in c:
        for day in d:
            df[col + day] = df[col].rolling(int(day)).mean()

    # create lag features (a negative shift pulls values from later days)
    c = ['apparenttemperaturemax', 'apparenttemperaturemin', 'windspeed']
    d = ['-3', '-7']

    for col in c:
        for day in d:
            df[col + day] = df[col].shift(int(day))

    # impute null values
    df.fillna(0, inplace=True)

    if features is None:
        # drop unneeded columns
        df.drop(['precipintensity', 'precipintensitymax'],
                axis=1, inplace=True)
    else:
        # use only specified features
        df = df[features]

    return df
Ejemplo n.º 23
 def dt_range(self):
     """Return the business days among the last ``self.trail_days`` days.

     The window ends today (midnight, local clock) and reaches back
     ``trail_days - 1`` days. Saturdays, Sundays and dates returned by
     ``calendar().holidays`` are removed; the result is sorted ascending.
     """
     one_day = datetime.timedelta(days=1)
     current = datetime.datetime.now().date()
     midnight = datetime.datetime(current.year, current.month, current.day)

     window_start = midnight - one_day * self.trail_days
     closed_days = [
         stamp.to_pydatetime()
         for stamp in calendar().holidays(window_start, midnight)
     ]

     candidates = sorted(
         midnight - one_day * back for back in range(self.trail_days))
     return [
         day for day in candidates
         if not (day.weekday() >= 5 or day in closed_days)
     ]
Ejemplo n.º 24
def holiday_checker(release_date):
    """Return 1 when a calendar holiday falls near *release_date*, else 0.

    The window runs from 7 days before to 5 days after the release date,
    both ends included.

    Args:
        release_date: Date string in ``%m/%d/%Y`` format.

    Returns:
        int: 1 if at least one holiday lies in the window, otherwise 0.

    Raises:
        ValueError: if *release_date* does not match ``%m/%d/%Y``.
    """
    released = datetime.strptime(release_date, '%m/%d/%Y')
    window = calendar().holidays(start=released - timedelta(days=7),
                                 end=released + timedelta(days=5))
    # Count the matches explicitly: the truth value of an array holding two
    # or more holidays is ambiguous and raises.
    return 1 if len(window) > 0 else 0
Ejemplo n.º 25
    def is_holiday(cls):
        """Class-level counterpart of the holiday check.

        Returns whatever ``cls.usage_date.in_(holidays)`` produces for the
        holidays between 2015-01-01 and 2020-12-31 -- presumably a
        SQLAlchemy filter expression; confirm against the enclosing class.

        Reference - https://stackoverflow.com/questions/64276059
        """
        cal = calendar()
        holidays = cal.holidays(start=dt.date(2015, 1, 1),
                                end=dt.date(2020, 12, 31))

        return cls.usage_date.in_(holidays)
Ejemplo n.º 26
def get_features_dataframe(
    series: pd.Series,
    time_features: List[TimeFeature],
    lag_indices: List[int],
    past_data: Optional[pd.Series] = None,
) -> pd.DataFrame:
    """Constructs a DataFrame of features for a given Series.

    Features include some date-time features (like hour-of-day, day-of-week, ...) and
    lagged values from the series itself. Lag indices are specified by `lags`, while
    previous data can be specified by `past_data`: the latter allows to get lags also
    for the initial values of the series.

    Parameters
    ----------
    series
        Series on which features should be computed.
    time_features
        List of time features to be included in the data frame.
    lag_indices
        List of indices of lagged observations to be included as features.
    past_data
        Prior data, to be used to compute lagged observations.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the features. This has the same index as `series`.
    """
    # TODO check if anything can be optimized here

    assert past_data is None or series.index.freq == past_data.index.freq
    assert past_data is None or series.index[0] > past_data.index[-1]

    # One column per time feature, named after the feature's class.
    time_feature_columns = {
        feature.__class__.__name__: feature(series.index)
        for feature in time_features
    }

    # Prepend the history so that lags of the first observations are defined.
    all_data = (
        series
        if past_data is None
        else pd.concat([past_data, series]).asfreq(series.index.freq)
    )
    lag_columns = {
        f"lag_{idx}": all_data.shift(idx)[series.index].values
        for idx in lag_indices
    }

    columns = {**time_feature_columns, **lag_columns, "target": series.values}

    return pd.DataFrame(columns, index=series.index)
Ejemplo n.º 27
 # NOTE(review): the arguments of the 'DST time change' rule and the closing
 # bracket of `new_rules` are missing from this excerpt, so the class does not
 # parse as shown.
 class custom_calendar(AbstractHolidayCalendar):
     # Extra fixed-date observances appended to the stock calendar's rules.
     new_rules = [
         Holiday('Halloween', month=10, day=31),
         Holiday('Christmas Eve', month=12, day=24),
         Holiday('New Years Eve', month=12, day=31),
         Holiday('DST time change',
     # Rules of the base `calendar` followed by the additions above.
     rules = calendar().rules + new_rules
# NOTE(review): the second `cal.holidays(` call is cut off (its `end` and any
# further arguments are missing), so the function does not parse as shown.
def get_holidays(x):
    cal = calendar()
    # return_name=True yields the holiday names keyed by date.
    holidays = cal.holidays(start=x.min(), end=x.max(), return_name=True)
    # NOTE(review): this assignment replaces the range derived from `x` with
    # one starting at 2015-01-01 -- confirm which of the two is intended.
    holidays = cal.holidays(start='2015-01-01',
    # Drop three holidays by name, then pick out the MLK entries separately.
    holidays = holidays[~holidays.isin(['Presidents Day',
                                        'Columbus Day',
                                        'Veterans Day'])]
    # NOTE(review): the name string must match the calendar's rule name
    # exactly; verify it against the installed pandas version.
    mlk_days = holidays[holidays == 'Dr. Martin Luther King Jr.']
    return holidays, mlk_days
Ejemplo n.º 29
def clean_data(df_data, features=None):
    """
    Clean weather data and create features.

    The input is resampled to daily means, calendar and holiday columns are
    added, 7-day rolling means and 3/7-day look-ahead shifts are computed,
    and missing values are filled with 0.

    INPUT: dataframe, list
        df_data  -- frame with a ``time`` column (epoch seconds) and the
                    numeric weather columns referenced below
        features -- optional list of column names to keep; when None, only
                    the precipitation-intensity columns are dropped
    OUTPUT: dataframe
    """
    df = df_data.copy()
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df.set_index('time', inplace=True)
    df = df.resample('1D').mean()

    # feature creation
    df['dayofweek'] = df.index.weekday
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar().week.astype(int)

    # mark holidays
    cal = calendar()
    holidays = cal.holidays(start=df.index.min(), end=df.index.max())
    df['holiday'] = 0
    df.loc[df.index.isin(holidays), 'holiday'] = 1

    # rolling means
    c = ['apparenttemperaturemax', 'apparenttemperaturemin',
         'temperaturemax', 'temperaturemin']
    d = ['7']

    for col in c:
        for day in d:
            df[col+day] = df[col].rolling(int(day)).mean()

    # create lag features (a negative shift pulls values from later days)
    c = ['apparenttemperaturemax', 'apparenttemperaturemin', 'windspeed']
    d = ['-3', '-7']

    for col in c:
        for day in d:
            df[col+day] = df[col].shift(int(day))

    # impute null values
    df.fillna(0, inplace=True)

    if features is None:
        # drop unneeded columns
        df.drop(['precipintensity', 'precipintensitymax'],
                axis=1, inplace=True)
    else:
        # use only specified features
        df = df[features]

    return df
Ejemplo n.º 30
 def transform(self, X, y=None):
     """Flag each row of *X* whose index date is a calendar holiday.

     :param X: frame indexed by a ``DatetimeIndex``
     :param y: ignored
     :return: single-column boolean DataFrame named ``date`` with a fresh
         0..n-1 index, True where the index date is returned by
         ``calendar().holidays`` (2000-01-01 .. 2050-01-01)
     """
     cal = calendar()
     holidays = cal.holidays(start='2000-01-01', end='2050-01-01')
     # Reduce the index to calendar dates, then back to midnight timestamps
     # so they are comparable with the holiday index. The cast names an
     # explicit unit; a unit-less 'datetime64' is rejected by recent pandas.
     index_days = pd.Series(X.index.date, name='date').astype('datetime64[ns]')
     return index_days.isin(holidays).to_frame()
# NOTE(review): lines are missing from this excerpt (the body of the
# `os.path.exists` check and, presumably, the `else:` branches), and the
# `print` statements use Python 2 syntax, so it does not parse as shown.
def download_pulse_range(start, end, pulsedir):
    cal = calendar()

    # Presumably creates `pulsedir` when it is absent -- the statement is lost.
    if not os.path.exists(pulsedir):

    for day in _daterange(start, end):
        if day.weekday() < 5: # Monday...Friday == 0..4
            # NOTE(review): `cal.holidays()` is re-evaluated for every day and
            # is called without a start/end range; hoisting it out of the loop
            # looks safe -- confirm.
            if day in cal.holidays():
                print u'(Holiday)...',
            _download_pulse(day, pulsedir)
            print u'%s (Weekend)' % str(day)
def _remove_WE_holidays_NaN(data):
    """Keep only the rows of *data* that are usable working days.

    A row is dropped when its index date is a Saturday or Sunday, when it is
    returned by ``calendar().holidays``, or when every value in the row is
    NaN. *data* must be indexed by a ``DatetimeIndex``.
    """
    stamps = data.index

    # Weekday test, written as "neither Saturday nor Sunday".
    is_weekday = (stamps.weekday != 5) & (stamps.weekday != 6)

    # Holiday range bounded by the first and last day present in the data.
    first_day = stamps.min().strftime("%Y-%m-%d")
    last_day = stamps.max().strftime("%Y-%m-%d")
    national_holidays = calendar().holidays(start=first_day, end=last_day)
    is_regular_day = ~stamps.isin(national_holidays)

    # A row survives as long as at least one column holds a value.
    has_values = data.notna().any(axis=1)

    return data[is_weekday & is_regular_day & has_values]
Ejemplo n.º 33
# NOTE(review): this excerpt has lost lines -- what look like `else:` branches
# (before the second `encoder.transform` and before the final `raise`) and
# possibly the encoder fitting step -- so it does not parse as shown.
def encode_dataset(train,test,meta,target_model='xgb'):
    # Split the target off the training frame; train and test must then have
    # identical columns in identical order.
    y_train = train[meta['target']]
    train = train.drop([meta['target']],axis=1)
    assert train.shape[1] == test.shape[1]
    for i in range(train.shape[1]):
        assert train.columns[i] == test.columns[i]
    train_obs = len(train)
    # Encode train and test together so both share one encoding per column.
    all_data = pd.concat([train,test],axis=0)
    # meta['cols'] maps column name -> kind ('CAT', 'NUM', 'DATE', 'REM', 'LEN').
    for i,f in enumerate(meta['cols'].keys()):
        if meta['cols'][f] == 'CAT':
            all_data[f] = all_data[f].fillna('missing')
            encoder = LabelEncoder()
            # NOTE(review): no `fit` call is visible before `transform`;
            # presumably it was dropped from the excerpt -- confirm.
            if target_model == 'xgb':
                all_data[f] = encoder.transform(all_data[f])
                all_data[f] = encoder.transform(all_data[f]).astype(int)
        elif meta['cols'][f] == 'NUM':
            all_data[f] = all_data[f].fillna(-1)
        elif meta['cols'][f] == 'DATE':
            # Date columns are replaced by their weekday number.
            tmp = pd.to_datetime(all_data[f])
            all_data[f] = tmp.dt.weekday
            cal = calendar()
            #holidays = cal.holidays(start=tmp.min(), end=tmp.max())
            #$all_data[f+'_is_holiday'] = 1*tmp.isin(holidays)
        elif meta['cols'][f] == 'REM':
            all_data = all_data.drop(f,axis=1)
        elif meta['cols'][f] == 'LEN':
            # Replace the column by a length feature computed by count_desc_len.
            all_data[f+'_len'] = all_data[f].apply(count_desc_len)
            all_data = all_data.drop(f,axis=1)
            raise Exception(str(meta['cols'][f])+":unknown mapping")
    assert train_obs == len(y_train)
    # The combined (train + test) frame is returned together with the target.
    return all_data , y_train
# Presumably, crime rates will differ between working days on the
# one hand and weekends and holidays on the other hand.
# So we'll introduce a column WorkingDay

# Data on holidays and which businesses actually observe them are sketchy
# at best, so we'll only count the most important ones as holidays:
#  - New Year
#  - Memorial Day
#  - Independence Day
#  - Labor Day
#  - Thanksgiving
#  - Black Friday
#  - Christmas

# NOTE(review): the `Holiday('BlackFriday', ...)` call below is cut off (its
# offset/closing arguments are missing), so this script does not parse as shown.
cal = calendar()
# need to remove rules in descending order!
# NOTE(review): the positions assume a specific ordering of the calendar's
# rule list; pandas releases that include Juneteenth shift these indices --
# verify against `cal.rules` (removing by rule name would be safer).
# NOTE(review): `rules` appears to be a class-level list in pandas, so these
# pops may also affect every later `calendar()` instance -- confirm.
cal.rules.pop(7) # remove Veterans Day
cal.rules.pop(6) # remove Columbus Day
cal.rules.pop(2) # remove President's Day
cal.rules.pop(1) # remove Martin Luther King Day
# create new rule for Black Friday
USBlackFriday = Holiday('BlackFriday', month=11, day=1,

# create own holiday calendar based on the above rules
ownCal = HolidayCalendarFactory('OwnCalendar', cal, USBlackFriday)
cal = ownCal()
#holidays = cal.holidays(start='2003-01-01', end='2015-05-13', return_name=True) # also returns name of holiday
# Holiday dates for the fixed study period 2003-01-01 .. 2015-05-13.
holidays = cal.holidays(start='2003-01-01', end='2015-05-13')