# evaluate a directory of prediction files against the ground truth with several performance measures
def ev_dir(pred_dir,
           true,
           measures={
               'SMAE': smae,
               'SRMSE': srmse,
               'SMAPE': smape,
               'MASE': partial(mase, shift=7 * 48)
           }):
    if isinstance(true, str):  # ground truth given as a directory of per-file test sets
        files = [
            os.path.splitext(file)[0] for file in os.listdir(pred_dir)
            if os.path.isfile(true + os.path.splitext(file)[0] + '/test.csv')
        ]  # files present in both directories
        result = pd.concat(
            [
                ev(pred=dp.load(path=pred_dir + file + '.csv', idx='date', dates=True),
                   true=dp.load(path=true + file + '/test.csv', idx='date', dates=True),
                   label=file,
                   parse_label=False,
                   measures=measures) for file in files
            ],
            axis=0,
            join='outer')  # merge results
    else:  # ground truth given directly as a dataframe
        result = pd.concat(
            [
                ev(pred=dp.load(path=pred_dir + name, idx='date', dates=True),
                   true=true,
                   label=re.sub(r',?[^,]*\.csv', '', name),  # strip the trailing ".csv" token to form the label
                   parse_label=True,
                   measures=measures) for name in os.listdir(pred_dir)
            ],
            axis=0,
            join='outer')  # merge results
    result = result.fillna(value=False)  # replace NaNs with False
    return result
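# Example usage (sketch, not from the original source; the directory paths are
# assumptions): evaluate every forecast file in pred_dir against the matching
# test sets under true.
if __name__ == '__main__':
    results = ev_dir(pred_dir='C:/Users/SABA/Google Drive/mtsg/data/sln/',
                     true='C:/Users/SABA/Google Drive/mtsg/data/train_test/')
    print(results)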
# build a single-hidden-layer feed-forward network
# (the name and leading arguments of this def were truncated in the source;
# they are reconstructed here from the names used in the body)
def create_model(n_in, n_out, n_hidden, activation='relu',
                 loss='mean_squared_error', optimizer='adam'):
    from keras.models import Sequential
    from keras.layers.core import Dense
    model = Sequential()  # FFN
    model.add(Dense(n_hidden, input_dim=n_in, activation=activation))  # input & hidden layers
    # model.add(Dropout({{uniform(0, 1)}}))  # randomly set a number of inputs to 0 to prevent overfitting
    model.add(Dense(n_out))  # output layer
    model.compile(loss=loss, optimizer=optimizer)  # assemble network
    return model


np.random.seed(0)  # fix seed for reproducibility
path = 'C:/Users/SABA/Google Drive/mtsg/data/household_power_consumption.csv'  # data path
load = dp.load(path)  # load data
load_with_nans = load.apply(
    axis=1,
    func=lambda x: np.nan if x.isnull().sum() > 0 else x.mean()
).unstack()  # custom row mean where any NaN among the arguments gives NaN as the result

# set grid search parameters and ranges
grid_space = {
    'n_hidden': [10, 20, 30],
    'nb_epoch': [500, 1000, 1500, 2000],
    'batch_size': [1, 5, 10, 20]
}

for i in range(1, 6):  # optimize for number of time steps
    X, Y = dp.split_X_Y(
        dp.shift(load_with_nans, n_shifts=i, shift=1).dropna()
    )  # create patterns & targets in the correct format
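    # Sketch of the search itself (assumed continuation, not from the source):
    # KerasRegressor exposes the builder above to sklearn's GridSearchCV so that
    # grid_space (n_hidden for the builder, nb_epoch & batch_size for fit) is tuned.
    from keras.wrappers.scikit_learn import KerasRegressor
    from sklearn.model_selection import GridSearchCV

    def build_fn(n_hidden=10):  # hypothetical wrapper fixing the data-dependent dimensions
        return create_model(n_in=X.shape[1], n_out=Y.shape[1], n_hidden=n_hidden)

    grid = GridSearchCV(estimator=KerasRegressor(build_fn=build_fn, verbose=0),
                        param_grid=grid_space, scoring='neg_mean_squared_error')
    grid.fit(X.values, Y.values)
    print(i, grid.best_params_)  # best settings for this number of time steps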
        if interval > 0 and i % interval == 0:
            print('Epoch: {} | Batch: {}/{} ({:.0f}%) | G Loss: {:.6f} | C Loss: {:.6f}'.format(
                epoch, batch_size * i, len(train_loader.dataset),
                100. * (batch_size * i) / len(train_loader.dataset),
                g_loss.item(), c_loss.item()))
    g_train_loss /= g_batches  # average generator loss over the batches in which it was updated
    c_train_loss /= len(train_loader)  # average critic loss over all batches
    print('* (Train) Epoch: {} | G Loss: {:.4f} | C Loss: {:.4f}'.format(
        epoch, g_train_loss, c_train_loss))
    return g_train_loss, c_train_loss


train_loader, vocab = load(batch_size, seq_len)
autoencoder = Autoencoder(enc_hidden_dim, dec_hidden_dim, embedding_dim,
                          latent_dim, vocab.size(), dropout, seq_len)
autoencoder.load_state_dict(
    torch.load('autoencoder.th', map_location=lambda storage, loc: storage))  # load pretrained weights onto the CPU
generator = Generator(n_layers, block_dim)
critic = Critic(n_layers, block_dim)
g_optimizer = optim.Adam(generator.parameters(), lr=lr)
c_optimizer = optim.Adam(critic.parameters(), lr=lr)
if cuda:
    autoencoder = autoencoder.cuda()
    generator = generator.cuda()
    critic = critic.cuda()
best_loss = np.inf
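# Sketch of a driver loop (assumed, not from the source; `train` is the function
# whose tail appears above and `epochs` is a hypothetical hyperparameter):
for epoch in range(1, epochs + 1):
    g_train_loss, c_train_loss = train(epoch)
    if g_train_loss < best_loss:  # checkpoint the generator with the lowest training loss
        best_loss = g_train_loss
        torch.save(generator.state_dict(), 'generator.th')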
        prep=prep, postp=postp)  # evaluate network
    pred = pd.concat([pred, new_pred], axis=0)  # add new predictions
    train_loss = pd.concat([train_loss, tl], axis=0, ignore_index=True)  # append to old loss
    val_loss = pd.concat([val_loss, l], axis=0, ignore_index=True)  # append to old loss
    return pred, train_loss, val_loss


# SLNs
np.random.seed(0)  # fix seed for reproducibility
data_dir = 'C:/Users/SABA/Google Drive/mtsg/data/train_test/'  # directory containing data
exp_dir = 'C:/Users/SABA/Google Drive/mtsg/data/sln/'  # directory containing results of experiments
true = dp.load(path=data_dir + 'test.csv', idx='date', dates=True)  # observations to forecast
measures = {
    'SRMSE': pf.srmse,
    'MASE': partial(pf.mase, shift=48 * 7),
    'SMAPE': pf.smape,
    'SMAE': pf.smae,
}  # performance measures to consider
train = dp.load(path=data_dir + 'train.csv', idx='date', dates=True)  # load train set
test = dp.load(path=data_dir + 'test.csv', idx='date', dates=True)  # load test set
weather_train = {
    name: dp.load(path=data_dir + name + '_train.csv', idx='date', dates=True)
    for name in ['temp', 'hum', 'wind']
}  # load weather characteristics for train set
weather_test = {
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import patsy
import sklearn
from sklearn import multioutput
from sklearn.metrics import r2_score
import dataprep as dp

np.random.seed(0)  # fix seed for reproducibility
path = 'C:/Users/SABA/Google Drive/mtsg/data/household_power_consumption.csv'  # data path
load_raw = dp.load(path)  # load data
targets = load_raw.apply(
    axis=1,
    func=lambda x: np.nan if x.isnull().sum() > 0 else x.mean()
).unstack()  # custom row mean where any NaN among the arguments gives NaN as the result

# moving average
for i in range(1, 50):
    pred = targets.rolling(window=i).mean().shift(1)  # forecast = mean of the previous i values
    load = pd.concat({'pred': pred, 'targets': targets}, axis=1)
    load.dropna(inplace=True)
    print(r2_score(y_pred=load['pred'], y_true=load['targets'],
                   multioutput='uniform_average'))
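# Sketch (not in the original): the same loop, but collecting the scores so the
# best moving-average window can be picked instead of read off the console.
scores = {}
for i in range(1, 50):
    pred = targets.rolling(window=i).mean().shift(1)
    load = pd.concat({'pred': pred, 'targets': targets}, axis=1).dropna()
    scores[i] = r2_score(y_pred=load['pred'], y_true=load['targets'],
                         multioutput='uniform_average')
best_window = max(scores, key=scores.get)  # window size with the highest R^2
print('best window: {} (R^2 = {:.4f})'.format(best_window, scores[best_window]))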
data = imp.imp(data, alg=impts.na_seadec, freq=1440, algorithm='ma',
               weighting='linear', k=2)  # impute the whole dataset using the best of the imputation methods compared (seasonal decomposition + linear-weighted moving average)
dp.save(data, path=data_dir + 'data_imp.csv', idx='datetime')  # save imputed data

# AGGREGATE DATA & CREATE TRAIN & TEST SETS
exp_dir = 'C:/Users/SABA/Google Drive/mtsg/data/train_test/'  # directory for the results
data = dp.load(path=data_dir + 'data_imp.csv', idx='datetime', cols='load',
               dates=True)  # load imputed data
data = dp.resample(data, freq=1440)  # aggregate minutes to half-hours
train, test = dp.train_test(data=data, test_size=0.255,
                            base=7)  # split into train & test sets
dp.save(data=train, path=exp_dir + 'train.csv', idx='date')  # save train set
dp.save(data=test, path=exp_dir + 'test.csv', idx='date')  # save test set
dp.save_dict(dic=dp.split(train, nsplits=7), path=exp_dir + 'train_',
             idx='date')  # split train set according to weekdays & save each part into a separate file
dp.save_dict(dic=dp.split(test, nsplits=7), path=exp_dir + 'test_',
             idx='date')  # split test set according to weekdays & save each part into a separate file
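# Sanity check (sketch, not in the original): reload the saved splits and verify
# the row counts survive the round trip; assumes dp.load mirrors dp.save.
assert len(dp.load(path=exp_dir + 'train.csv', idx='date', dates=True)) == len(train)
assert len(dp.load(path=exp_dir + 'test.csv', idx='date', dates=True)) == len(test)


# WEATHER DATA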