Exemple #1
0
def pre_process_data(data, selected_columns,
                     dump_path='/home/bolaka/Bike Sharing/train-arnov.csv'):
    '''
        Engineers time, holiday and rush-hour features on [data] and only
        keeps the columns present in [selected_columns].

        The new columns are added to [data] in place, so the caller's frame
        is modified. Before the column selection the fully engineered frame
        is written as CSV to [dump_path]; pass None to skip that dump.

        Parameters:
            data: DataFrame holding at least the 'datetime', 'workingday',
                'season' and 'weather' columns read below.
            selected_columns: labels of the columns to keep in the result.
            dump_path: CSV destination for the engineered frame, or None.

        Returns a numpy array with one column per entry of [selected_columns].
    '''

    # Parse the raw timestamps into pandas datetimes
    data['datetime'] = pd.to_datetime(data['datetime'])
    timestamps = data['datetime']

    # Since the hour of day is cyclical, e.g. 01:00 is equally far from
    # midnight as 23:00, we need to represent this in a meaningful way. We use
    # both sin and cos to make sure that 12:00 != 00:00 (which we cannot
    # prevent if we only use sin)
    data['hour_of_day'] = timestamps.apply(lambda i: i.hour)
    hours = data['hour_of_day']
    data['hour_of_day_sin'] = hours.apply(lambda hour: math.sin(2*math.pi*hour/24))
    data['hour_of_day_cos'] = hours.apply(lambda hour: math.cos(2*math.pi*hour/24))

    # Since it seems the service got more popular over time, we might need
    # some way of telling how far we are from the beginning. The reference
    # day is fixed (not derived from the data) so that separately processed
    # frames share the same scale.
    first_day = datetime.strptime('2011-01-01', "%Y-%m-%d").date()
    data['day_since_begin'] = timestamps.apply(lambda i: (i.date()-first_day).days)

    # For some reason the dataset didn't indicate New Year's Day and Christmas
    # Day as holidays. Therefore we also use this external library to check
    # if a day is a holiday.
    cal = Maryland()

    # Load the holidays of every year present in the data plus one year on
    # each side: the yesterday/tomorrow check below looks across the year
    # boundary (e.g. 31 December needs next year's New Year's Day).
    years_seen = timestamps.dt.year.dropna()
    if years_seen.empty:
        holiday_years = ()
    else:
        holiday_years = range(int(years_seen.min()) - 1,
                              int(years_seen.max()) + 2)

    # Build a fresh set rather than extending the list returned by
    # cal.holidays() in place, so the calendar object's own data is untouched.
    holidays = {day for year in holiday_years
                for (day, _name) in cal.holidays(year)}
    data['holiday_external'] = timestamps.apply(lambda i: int(i.date() in holidays))

    # Is it a holiday tomorrow or yesterday?
    one_day = timedelta(days=1)
    data['almost_holiday'] = timestamps.apply(
        lambda i: int(i.date() - one_day in holidays or
                      i.date() + one_day in holidays)
    )

    # Some simple model of rush hour: on regular days the distance (in hours)
    # to the nearest of 08:00 / 18:00, on non-working days and external
    # holidays the distance to 14:00.
    commute_distance = hours.apply(
        lambda hour: min(math.fabs(8 - hour), math.fabs(18 - hour))
    )
    leisure_distance = hours.apply(lambda hour: math.fabs(14 - hour))
    is_day_off = (data['workingday'] == 0) | (data['holiday_external'] == 1)
    # Series.mask replaces the removed DataFrame.ix based assignment
    data['rush_hour'] = commute_distance.mask(is_day_off, leisure_distance)

    # Add the day of the week
    data['weekday'] = timestamps.apply(lambda i: i.weekday())

    # Some variables have no numerical value, they are categorical. E.g. the
    # weather variable has numerical values, but they cannot be interpreted as
    # such. In other words value 2 is not two times as small as value 4.
    # A method to deal with this is one-hot encoding, which splits the
    # existing variable in n variables, where n equals the number of possible
    # values.
    for column in ['season', 'weather', 'weekday']:
        dummies = pd.get_dummies(data[column])
        # New column name = original column name + category value
        for value in dummies.columns:
            data[column + str(value)] = dummies[value]

    # Optional dump of the fully engineered frame for inspection
    if dump_path is not None:
        data.to_csv(dump_path, index=False)

    data = data[selected_columns]

    return data.values
## feature engineering
#combined[ 'weekend' ] = 0
#combined.loc[ (combined['holiday'] == 0) & (combined['workingday'] == 0) ,'weekend'] = 1
#combined.loc[ (combined['weekend'] == 1), 'holiday'] = 1
#combined['weekday_holiday'] = combined.holiday * (combined.weekday+1)
#combined['atemp_cat'] = pd.cut(combined.atemp.values, 6, labels=[1, 2, 3, 4, 5, 6])
#combined['temp_cat'] = pd.cut(combined.atemp.values, 6, labels=[1, 2, 3, 4, 5, 6])
#combined['hum_cat'] = pd.cut(combined.humidity.values, 4, labels=[1, 2, 3, 4 ])
#combined['windspeed_cat'] = pd.cut(combined.windspeed.values, 4, labels=[ 1, 2, 3, 4 ])
#dummies = pd.get_dummies(combined['windspeed_cat'], prefix='wind')
#combined = pd.concat([combined, dummies], axis=1)

# For some reason the dataset didn't indicate New Year's Day and Christmas
# Day as holidays. Therefore we also use this external library to check if
# a day is a holiday.
cal = Maryland()

# Collect the dates of both years into a fresh set instead of extending the
# list returned by cal.holidays(2011) in place.
# NOTE(review): that list may be owned (cached) by the calendar object, in
# which case `+=` would leak 2012 entries into later 2011 lookups - confirm.
holidays = {dt for year in (2011, 2012) for (dt, name) in cal.holidays(year)}

# Flag holidays: `combined` is checked through its 'Date' column, while
# `validation` is checked through the date part of its (date, hour) index.
# NOTE(review): assumes combined['Date'] holds plain dates comparable with
# the entries of `holidays` - confirm against where `combined` is built.
combined['holiday'] = combined['Date'].apply(lambda i: int(i in holidays))
validation['holiday'] = [ int(date.date() in holidays) for (date, hour) in validation.index ]
#print(validation['holiday'].sum())

# Was it a holiday yesterday?
combined['holiday_lag'] = combined['Date'].apply(
    lambda i: int(i - timedelta(days=1) in holidays)
    )

# Is it a holiday tomorrow?
combined['holiday_lead'] = combined['Date'].apply(