def main():
    """Split the daily dataset into train and test sets on whole-week
    (Sunday-Saturday) boundaries and pickle the results.

    Reads ``data/data_daily.pkl`` (assumes a date-sortable index — e.g. a
    DatetimeIndex — TODO confirm against the producer script) and writes
    ``data/data_train.pkl`` and ``data/data_test.pkl``.
    """
    # LOAD DATA
    data_dir = 'data'  # renamed from `dir`, which shadowed the builtin
    data = load_from_pickle(data_dir, 'data_daily.pkl')

    # SPLIT DATA — using standard weeks, Sunday - Saturday
    day1_train = '2006-12-17'     # first Sunday in dataset
    day1_test = '2010-1-3'        # first Sunday in 2010
    day_last_test = '2010-11-20'  # last Saturday in dataset

    # train: [day1_train, day1_test); test: [day1_test, day_last_test]
    data_train = data[data.index >= day1_train]
    data_train = data_train[data_train.index < day1_test]
    data_test = data[data.index >= day1_test]
    data_test = data_test[data_test.index <= day_last_test]

    # METRICS — durations in days and (possibly fractional) weeks
    ndays_train = data_train.shape[0]
    ndays_test = data_test.shape[0]
    print(f'\ntraining set duration: {ndays_train} days, {ndays_train / 7} weeks')
    print(f'test set duration: {ndays_test} days, {ndays_test / 7} weeks\n')

    save_to_pickle(data_train, data_dir, 'data_train.pkl')
    save_to_pickle(data_test, data_dir, 'data_test.pkl')
import datetime  # added: datetime.timedelta is used below but was never imported
import random

import numpy as np  # added: np.array is used below but numpy was never imported
import matplotlib.pyplot as plt

from DataTools.pickle import save_to_pickle, load_from_pickle


def n_random_integers(n, low=0, high=10):
    """Return an np.array of n random integers drawn with random.randint.

    Each value is sampled uniformly from the inclusive range [low, high].
    """
    return np.array([random.randint(low, high) for _ in range(n)])


if __name__ == '__main__':
    df_30 = load_from_pickle('data', 'data_30min.pkl')

    # single timeseries of global power
    kw = df_30.Global_active_power  # power in kW

    # clip to whole days (first and last full days in the dataset)
    firstday = '2006-12-17 00:00:00'
    lastday = '2010-11-25 23:30:00'
    kw = kw[kw.index >= firstday]
    kw = kw[kw.index <= lastday]

    # array of single-day timeseries
    delta_t = kw.index[1] - kw.index[0]               # size of timestep
    n_ts = int(datetime.timedelta(days=1) / delta_t)  # number of timesteps per day
    n_rows = int(len(kw) / n_ts)                      # number of rows (whole days)
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from DataTools.pickle import save_to_pickle, load_from_pickle


if __name__ == '__main__':
    series = load_from_pickle('data', 'data_369.pkl')
    vals = series.values

    # remove zeros before ts data — argmax returns the first True index
    idx = np.argmax(vals > 0)
    vals = vals[idx:]

    # smooth with moving average filter
    # FIX: pd.rolling_mean was removed in pandas 0.18; the Series.rolling
    # API is the drop-in replacement (same window, same NaN-padded head)
    mva = pd.Series(vals).rolling(500).mean().to_numpy()

    # choose raw or pre-smoothed data
    data = vals

    # split dataset
    split_idx = 80000
    window = 150
    train = data[:split_idx]
    test = data[split_idx:]
    test_window = test[:window]

    # statsmodels autoregression
    run_ar = False
# NOTE(review): this chunk starts mid-function (the model-builder's `def`
# line is outside this view) and its final statement is truncated at the
# chunk boundary; code reproduced verbatim, comments only added.

    # Build a 1D CNN regressor: Conv1D -> MaxPool -> Flatten -> Dense head,
    # compiled with MSE loss and the Adam optimizer.
    model = Sequential()
    model.add(
        Conv1D(filters=16, kernel_size=3, activation='relu',
               input_shape=(n_timesteps, n_features)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    # linear output layer: n_outputs forecast values per sample
    model.add(Dense(n_outputs))
    model.compile(loss='mse', optimizer='adam')
    return model


if __name__ == '__main__':
    # load the pickled train/test DataFrames produced by the split script
    train_df = load_from_pickle('data', 'data_train.pkl')
    test_df = load_from_pickle('data', 'data_test.pkl')

    # TRANSFORM to np array
    # columns: ['Global_active_power', 'Global_reactive_power', 'Voltage',
    #           'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
    #           'Sub_metering_3', 'Sub_metering_4']
    train_data = train_df.values
    test_data = test_df.values

    # TRANSFORM to CNN input shape
    # 1D CNN input shape: [n_samples, n_timesteps_per_sample, n_features]
    # e.g. [159 (weeks), 7 (days), 1 (feature)] (or 8 features)
    # assumes row count is divisible by 7 — TODO confirm (split script clips
    # to whole Sunday-Saturday weeks)
    train_data = train_data.reshape(int(train_data.shape[0] / 7), 7,
                                    train_data.shape[1])
    # NOTE(review): statement truncated here; it continues outside this view
    test_data = test_data.reshape(int(test_data.shape[0] / 7), 7,
# indices that contain independent sets of values ii = np.linspace(n, len(data), int(len(data) / 7)) ii = np.insert(ii, 0, 0) ii = np.delete(ii, -1) ii = ii.astype(int) yy = [] for i in ii: yy.append(list(data[i])) # append lists flat = [item for sublist in yy for item in sublist] return flat if __name__ == '__main__': # load output and test set (true, pred, errors) = load_from_pickle('output', 'output_1.pkl') test_df = load_from_pickle('data', 'data_test.pkl') test = test_df.values # get single timseries for true and pred (pred is first day predition) n_days = 7 yy_true = timeseries_from_staggered_timeseries_sets(true, n_days) yy_pred = timeseries_from_staggered_timeseries_sets(pred, n_days) # plot true vs predicted fname = 'output_1_predictions' plt.plot(yy_true, 'b', label='true') plt.plot(yy_pred, 'orange', label='predicted', linewidth=2) plt.ylabel('power usage [kW]') plt.xlabel('test period [days]') plt.legend()
import matplotlib.pyplot as plt from sklearn.mixture import GaussianMixture from collections import Counter import matplotlib.pyplot as plt from DataTools.pickle import save_to_pickle, load_from_pickle def n_random_integers(n, low=0, high=10): ''' generate random numbers with random.randint''' ii = [] for i in range(n): ii.append(random.randint(low, high)) return np.array(ii) if __name__=='__main__': data = load_from_pickle('data','daily_array_all.pkl') # shape: (1440, 48) # cluster as gaussian mixture X = data n = 10 gmm = GaussianMixture(n_components=n) gmm.fit(X) y = gmm.predict(X) probs = gmm.predict_proba(X) # sort results into clusters based on labels def clusters_from_lables(X,y): labels = np.unique(y) clusters = [] for label in labels:
# NOTE(review): this chunk starts mid-function — the enclosing `def`
# (presumably calc_rmse_error(true, pred), per the call below — TODO
# confirm) begins outside this view; code reproduced verbatim, comments
# only added. Loop nesting of the first two lines is inferred, since
# `mse` is computed per-iteration above the chunk boundary.
        rmse = np.sqrt(mse)   # root-mean-squared error for this step
        errors.append(rmse)
    return np.array(errors)


if __name__ == '__main__':
    # load model architecture and compile
    with open('models/model_1.json', 'r') as f:
        model_json = json.load(f)
    model = model_from_json(model_json)
    model.compile(loss='mse', optimizer='adam')

    # load model weights
    model.load_weights('models/model_1.h5')
    print('model and weights loaded')

    # load data: train set in inputs/outputs, test set
    X_train = load_from_pickle('data_Xy', 'X_train.pkl')
    y_train = load_from_pickle('data_Xy', 'y_train.pkl')
    test_df = load_from_pickle('data', 'data_test.pkl')
    test = test_df.values

    # evaluate on test set: univariate — keep only the first feature column
    feat_col = 0
    test = test[:, feat_col]
    true, pred = walk_forward_validation(test, model, n_input=7)

    # score predictions, save to file
    errors = calc_rmse_error(true, pred)
    save_to_pickle((true, pred, errors), 'output', 'output_1.pkl')