def agg_data(n_apt, seed):
    """Pickle the row-wise mean energy of a seeded random apartment sample.

    The global ``random`` generator is seeded with ``seed``, apartment ids
    1..114 are shuffled, and the first ``n_apt`` ids are loaded through
    ``load_energy``. The per-row mean across the sampled apartments is
    written to ``DATA_SET_DIR + 'Mean_seed_<seed>_apt_<n_apt>_2016.pkl'``.

    Args:
        n_apt: Number of apartments to keep from the shuffled id list.
        seed: Seed passed to ``random.seed`` so the sample is reproducible.

    Returns:
        None. The result is only written to disk.
    """
    # NOTE: an earlier, commented-out variant always forced apartment 28
    # into the sample; the current behaviour samples all ids uniformly.
    random.seed(seed)

    # random.shuffle needs a mutable sequence; on Python 3 range() is an
    # immutable lazy object, so it has to be materialised first.
    apts = list(range(1, 115))
    random.shuffle(apts)
    apts = apts[:n_apt]
    print('num of apts:', len(apts))

    agg_energy = {}
    for apt in apts:
        print('reading %d ...' % apt)
        # presumably one energy series per apartment -- confirm against
        # load_energy; each value becomes one DataFrame column below.
        agg_energy[apt] = load_energy(apt)
    df = pd.DataFrame(agg_energy)

    # Vectorised row mean (NaNs skipped, as with np.mean on a pandas row).
    # The series keeps the name 'mean' that the pickled column used to have.
    mean_series = df.mean(axis=1)
    mean_series.name = 'mean'

    filename = DATA_SET_DIR + 'Mean_seed_%d_apt_%d_2016.pkl' % (seed, n_apt)
    print('saving to file: %s ...' % filename)
    mean_series.to_pickle(filename)
    print('saved.')
def agg_all_sum(freqs):
    """Pickle the summed energy of all apartments at several frequencies.

    Apartments 1..114 are loaded through ``load_energy`` into one frame.
    For each entry of ``freqs`` the frame is resampled (mean per bin),
    restricted to the timestamps produced by ``pd.date_range`` between
    2016-01-01 and 2016-12-01 at that frequency, summed across apartments
    and written to ``DATA_SET_DIR + 'SUM_<n_apts>_<freq>_2016.pkl'``.

    Args:
        freqs: Iterable of frequency strings accepted by both
            ``DataFrame.resample`` and ``pd.date_range``.

    Returns:
        None. One pickle file is written per frequency.
    """
    apts = range(1, 115)
    print('# apartments:', len(apts))
    print('freqs:', freqs)

    agg_energy = {}
    for apt in apts:
        print('reading %d ...' % apt)
        agg_energy[apt] = load_energy(apt)
    df = pd.DataFrame(agg_energy)

    for freq in freqs:
        print('freq:', freq)
        df_freq = df.resample(freq).mean()
        # NOTE(review): recent pandas raises KeyError from .loc when any
        # requested timestamp is absent from the resampled index -- confirm
        # the loaded data covers the whole window.
        window = pd.date_range(start='2016-01-01', end='2016-12-01', freq=freq)
        df_freq = df_freq.loc[window]

        # Vectorised row sum instead of a per-row Python apply; computing
        # the series directly also avoids adding a column to a .loc slice.
        total = df_freq.sum(axis=1)
        total.name = 'sum'

        filename = DATA_SET_DIR + 'SUM_%d_%s_2016.pkl' % (len(apts), freq)
        print('saving to file: %s ...' % filename)
        total.to_pickle(filename)
        print('saved.')
def visualize_error_lasso_alpha():
    """Plot the LassoCV mean-square-error path against -log10(alpha).

    Fits a 20-fold ``LassoCV`` on the 'Y1' target of the dataset returned
    by ``load_energy()``, then shows one dotted curve per fold, the
    across-fold average, and a dashed vertical line at the selected alpha.
    The wall-clock time of model construction plus fitting is reported in
    the figure title.
    """
    dataset = load_energy()

    # Time the estimator construction and the cross-validated fit.
    t_begin = time.time()
    lasso = linear_model.LassoCV(cv=20)
    lasso.fit(dataset.data, dataset.target('Y1'))
    elapsed = time.time() - t_begin

    neg_log_alphas = -np.log10(lasso.alphas_)
    fold_mse = lasso.mse_path_
    title = ('Mean square error on each fold: coordinate descent '
             '(train time: %.2fs)' % elapsed)

    plt.figure()
    # One dotted line per fold.
    plt.plot(neg_log_alphas, fold_mse, ':')
    # Solid black line: average over the last axis of mse_path_.
    plt.plot(
        neg_log_alphas,
        fold_mse.mean(axis=-1),
        'k',
        label='Average across the folds',
        linewidth=2,
    )
    # Mark the alpha chosen by cross validation.
    plt.axvline(
        -np.log10(lasso.alpha_),
        linestyle='--',
        color='k',
        label='alpha: CV estimate',
    )
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title(title)
    plt.axis('tight')
    plt.show()
def visualize_error_ridge_alpha(n_alphas=200, n_folds=12):
    """Plot held-out mean squared error of Ridge against alpha.

    For each of ``n_alphas`` log-spaced alphas in [1e-10, 1e-2], a Ridge
    model without intercept is fitted on ``n_folds`` shuffled 80/20 splits
    of the 'Y1' target from ``load_energy()``. The error matrix and the
    alpha grid (with their shapes) are printed, then the errors are
    plotted with one dotted line per split.

    Args:
        n_alphas: Number of alpha values to evaluate.
        n_folds: Number of shuffle/split iterations per alpha.

    Returns:
        None. Output is printed and shown in a matplotlib figure.
    """
    dataset = load_energy()
    alphas = np.logspace(-10, -2, n_alphas)
    model = linear_model.Ridge(fit_intercept=False)
    seed = random.randint(1, 10000)
    X = dataset.data
    y = dataset.target('Y1')

    # The splitter takes the same integer seed for every alpha, so the
    # splits are identical each time; build them once and reuse them.
    # NOTE(review): this call signature (n, n_iter=...) matches the legacy
    # sklearn.cross_validation.ShuffleSplit -- confirm which ShuffleSplit
    # this file imports before upgrading scikit-learn.
    splits = list(ShuffleSplit(len(y), n_iter=n_folds, test_size=0.2,
                               random_state=seed))

    errors = np.zeros(shape=(n_alphas, n_folds))
    for idx, alpha in enumerate(alphas):
        model.set_params(alpha=alpha)
        for jdx, (train, test) in enumerate(splits):
            model.fit(X[train], y[train])
            errors[idx, jdx] = mean_squared_error(y[test],
                                                  model.predict(X[test]))

    # Single-argument print(...) prints the same under Python 2 and 3,
    # unlike the original print statements.
    print(errors)
    print(errors.shape)
    print(alphas)
    print(alphas.shape)

    plt.figure()
    plt.plot(alphas, errors, ':')
    plt.show()
def build(args):
    """
    Train and pickle one ridge-regression pipeline per target.

    In a real application this would probably take arguments such as:

        - fixtures  (where the training data is)
        - model_dir (where to write the models out to)
        - kfolds    (number of cross validation folds)

    For now ``args`` is not read; the pickles are written to HEAT_MODEL
    (target 'Y1') and COLD_MODEL (target 'Y2').

    NOTE(review): this file contains a second, later ``build`` definition
    with the same logic; the later one is what callers get at import time.

    Returns a human-readable string with the total build time in seconds.
    """
    started = time.time()

    # Load the data and set up the alpha grid searched by RidgeCV.
    dataset = load_energy()
    alpha_grid = np.logspace(-10, -2, 200)

    # Target name -> path the fitted pipeline is pickled to.
    jump = {
        'Y1': HEAT_MODEL,
        'Y2': COLD_MODEL,
    }
    msg = ("%s trained on %i instances using a %s model\n"
           " average R2 score of %0.3f using an alpha of %0.5f\n"
           " model has been dumped to %s\n")

    scores = {}
    for y in ('Y1', 'Y2'):
        # Cross validate the RidgeCV estimator (12 folds); imputation is
        # deliberately left out of this step.
        clf = linear_model.RidgeCV(alphas=alpha_grid)
        scores[y] = cvs(clf, dataset.data, dataset.target(y), cv=12)

        # Fit on the complete data set to obtain the selected alpha, so
        # that the pickled model is a plain Ridge with that alpha.
        clf.fit(dataset.data, dataset.target(y))

        # Final estimator: mean imputation followed by Ridge, trained on
        # the complete data set.
        model = linear_model.Ridge(alpha=clf.alpha_)
        imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
        estimator = Pipeline([("imputer", imputer), ("ridge", model)])
        estimator.fit(dataset.data, dataset.target(y))

        # Persist the fitted pipeline.
        destination = jump[y]
        with open(destination, 'wb') as f:
            pickle.dump(estimator, f, protocol=pickle.HIGHEST_PROTOCOL)

        summary = (
            y,
            len(dataset.data),
            model.__class__.__name__,
            scores[y].mean(),
            clf.alpha_,
            destination,
        )
        print(msg % summary)

    return "Build took %0.3f seconds" % (time.time() - started)
def build(args):
    """
    Build the heating ('Y1') and cooling ('Y2') models and pickle them.

    In a real application this would probably accept arguments:

        - fixtures  (where the training data is)
        - model_dir (where to write the models out to)
        - kfolds    (number of cross validation folds)

    For now ``args`` is unused and the pickles are simply written to
    HEAT_MODEL and COLD_MODEL.

    NOTE(review): ``build`` is defined twice in this file with the same
    logic; being the later definition, this is the one in effect.

    Returns a string reporting how long the build took, in seconds.
    """
    t0 = time.time()

    # Load data and the candidate regularisation strengths.
    dataset = load_energy()
    alphas = np.logspace(-10, -2, 200)
    scores = {}

    # Where each target's fitted pipeline is dumped.
    jump = {'Y1': HEAT_MODEL, 'Y2': COLD_MODEL}

    def train_and_dump(y):
        """Cross validate, refit, pickle and report the model for target y."""
        # Perform cross validation; imputation is not a concern here.
        clf = linear_model.RidgeCV(alphas=alphas)
        scores[y] = cvs(clf, dataset.data, dataset.target(y), cv=12)

        # Refit on everything to read off the chosen alpha; this makes the
        # pickled object a Ridge with an explicit alpha.
        clf.fit(dataset.data, dataset.target(y))

        # Build the model on the entire data set, including the Imputer.
        model = linear_model.Ridge(alpha=clf.alpha_)
        steps = [
            ("imputer", Imputer(missing_values="NaN", strategy="mean", axis=0)),
            ("ridge", model),
        ]
        estimator = Pipeline(steps)
        estimator.fit(dataset.data, dataset.target(y))

        # Dump the model.
        with open(jump[y], 'wb') as f:
            pickle.dump(estimator, f, protocol=pickle.HIGHEST_PROTOCOL)

        msg = (
            "%s trained on %i instances using a %s model\n"
            " average R2 score of %0.3f using an alpha of %0.5f\n"
            " model has been dumped to %s\n"
        )
        print(msg % (
            y,
            len(dataset.data),
            model.__class__.__name__,
            scores[y].mean(),
            clf.alpha_,
            jump[y],
        ))

    for target_name in ('Y1', 'Y2'):
        train_and_dump(target_name)

    build_time = time.time() - t0
    return "Build took %0.3f seconds" % build_time