# Prepare the energy dataset: reorder its columns, register it with a
# LearningData container, lag the predictors and write the result to HDF5.
import pandas as pd
from fastsr.containers.learning_data import LearningData

raw_frame = pd.read_csv('data/energydata_complete.csv')
all_columns = list(raw_frame.columns)

# Keep everything from position 3 onward and move column 1 to the very end.
# The names handed to LearningData below exclude that last column, so it is
# presumably the response -- confirm against LearningData.from_data.
ordered_columns = [*all_columns[3:], all_columns[1]]
ordered_frame = raw_frame[ordered_columns]

# We need to remove '_' as they don't play nice with Range Terminals
predictor_names = [name.replace('_', '') for name in ordered_columns[:-1]]

learning_data = LearningData()
learning_data.from_data(ordered_frame, predictor_names, 'energy_data')
learning_data.lag_predictors(6, column_names=predictor_names)
learning_data.to_hdf('data/energy_hour_lagged.hdf5')
# Prepare the hourly bike data ('ucibike'): attach a datetime index, one-hot
# encode the categorical columns, select/reorder the columns, register the
# frame with a LearningData container, lag a subset of predictors and write
# the result to HDF5.
import pandas as pd
from fastsr.containers.learning_data import LearningData

dat = pd.read_csv('data/hour.csv')

# One timestamp per row: the date held in column 1 plus the hour held in
# column 5. Columns are picked with .iloc because integer keys on a
# string-labelled Series are a deprecated positional fallback, and the
# vectorized form avoids a Python-level loop over every row.
dt_index = pd.DatetimeIndex(
    pd.to_datetime(dat.iloc[:, 1]) + pd.to_timedelta(dat.iloc[:, 5], unit='h')
)

# set_index returns a new frame unless inplace=True, so the result has to be
# assigned for the index to take effect. Passing an Index object (rather than
# a column label) leaves the columns, and therefore the positional selections
# below, untouched.
dat = dat.set_index(dt_index)

dat_onehot = pd.get_dummies(
    dat,
    prefix_sep='',
    columns=['season', 'mnth', 'hr', 'weekday', 'weathersit'],
)

# Positions 2-8 and 11-62 of the one-hot encoded frame.
indices = [*range(2, 9), *range(11, 63)]
dat_train = dat_onehot.iloc[:, indices]

cols = dat_train.columns.tolist()
# Move the column at position 7 to the end; the names handed to LearningData
# below exclude that last column.
# NOTE(review): cols[8:58] leaves out the final selected column (position 58)
# -- confirm that dropping it is intended.
cols_ordered = cols[0:7] + cols[8:58] + [cols[7]]
dat_train_ordered = dat_train[cols_ordered]

learning_data = LearningData()
learning_data.from_data(dat_train_ordered, cols_ordered[:-1], 'ucibike')

# Only these predictors are lagged (by 6).
lag_variables = ['holiday', 'workingday', 'temp', 'atemp', 'hum', 'windspeed']
learning_data.lag_predictors(6, column_names=lag_variables)
learning_data.to_hdf('data/hour_lagged.hdf5')