def transform_data():
    """Interpolate the train/test grids with perturbed kriging and dump them.

    Loads ``data/data.pkl``, runs ``PertubatedKriging().fit_transform`` on
    both the train and test design matrices, writes the augmented data dict
    to ``data/interp10_data.pkl``, then drops into an IPython shell for
    interactive inspection.

    Side effects: reads/writes pickle files on disk; prints progress;
    blocks on the interactive IPython session at the end.
    """
    from solaris.run import load_data
    from sklearn.externals import joblib

    data = load_data('data/data.pkl')
    kriging = PertubatedKriging()  # NOTE(review): class name typo upstream
    #kriging = PertubatedSpline()
    data['description'] = '%r: %r' % (kriging, kriging.est)
    print(data['description'])
    print('_' * 80)
    print(kriging)
    print()
    for key in ['train', 'test']:
        print('_' * 80)
        print('transforming %s' % key)
        print()
        X = data['X_%s' % key]
        X = kriging.fit_transform(X)
        data['X_%s' % key] = X
    print()
    print('dumping data')
    joblib.dump(data, 'data/interp10_data.pkl')
    IPython.embed()
def _transform_data():
    """Build standardized DBN training data from a past/future split.

    Splits the training set in half without shuffling (first half = train,
    second half = test, i.e. a past-vs-future split), applies the
    ``LocalModel`` feature transform to each half, standardizes X and y
    with scalers fit on the train half only, and dumps the result to
    ``data/dbndata.pkl``.
    """
    from solaris.run import load_data
    from solaris.models import LocalModel

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    # no shuffle - past-future split.  Use floor division: the original
    # ``X.shape[0] * 0.5`` yields a float, which is invalid as a slice
    # bound on modern NumPy/Python.
    offset = X.shape[0] // 2
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    print('_' * 80)
    print('transforming data')
    print()
    tf = LocalModel(None)
    print('transforming train')
    X_train, y_train = tf.transform(X_train, y_train)
    print('transforming test')
    X_test, y_test = tf.transform(X_test, y_test)
    print('fin')

    # Fit scalers on the train half only, then reuse them on the test half
    # so no future information leaks into the scaling.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    scaler = StandardScaler()
    y_train = scaler.fit_transform(y_train)
    y_test = scaler.transform(y_test)

    data = {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}
    joblib.dump(data, 'data/dbndata.pkl')
def _transform_data():
    """Build (unscaled) LocalModel-transformed data and dump to lcdata.pkl.

    Same past/future split as the DBN variant, but without standardization.
    NOTE(review): this module defines ``_transform_data`` more than once;
    only the last definition is reachable at import time.
    """
    from solaris.run import load_data
    from solaris.models import LocalModel
    from solaris.models import Baseline

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    # no shuffle - past-future split.  Floor division keeps the slice
    # bound an int (``* 0.5`` yields a float, which modern NumPy rejects).
    offset = X.shape[0] // 2
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    print('_' * 80)
    print('transforming data')
    print()
    tf = LocalModel(None)
    #tf = Baseline()
    print('transforming train')
    X_train, y_train = tf.transform(X_train, y_train)
    print('transforming test')
    X_test, y_test = tf.transform(X_test, y_test)
    print('fin')

    data = {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}
    joblib.dump(data, 'data/lcdata.pkl')
def _transform_data():
    """Build (unscaled) LocalModel-transformed data and dump to lcdata.pkl.

    Past/future split (no shuffling), ``LocalModel`` transform on each
    half, no standardization.  NOTE(review): duplicate of the preceding
    ``_transform_data`` definitions — this one shadows the earlier ones.
    """
    from solaris.run import load_data
    from solaris.models import LocalModel
    from solaris.models import Baseline

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    # no shuffle - past-future split.  Floor division keeps the slice
    # bound an int (``* 0.5`` yields a float, which modern NumPy rejects).
    offset = X.shape[0] // 2
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    print('_' * 80)
    print('transforming data')
    print()
    tf = LocalModel(None)
    #tf = Baseline()
    print('transforming train')
    X_train, y_train = tf.transform(X_train, y_train)
    print('transforming test')
    X_test, y_test = tf.transform(X_test, y_test)
    print('fin')

    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    joblib.dump(data, 'data/lcdata.pkl')
def inspect():
    """Visually compare interpolators (GP, spline, linear) on GEFS grids.

    For the first three features of day 0, plots the raw grid next to
    GaussianProcess, SplineEstimator, and LinearInterpolator upsamplings
    on a 10x-denser lat/lon mesh.  Also defines ``nugget_kungfu`` for
    experimenting with ensemble-variance-derived GP nuggets.

    NOTE(review): relies on module-level names (np, Interpolate,
    GaussianProcess, SplineEstimator, LinearInterpolator, IPython)
    imported elsewhere in the file.
    """
    from netCDF4 import Dataset
    from matplotlib import pyplot as plt
    from solaris.run import load_data
    data = load_data('data/data.pkl')
    X = data['X_train']
    y = data['y_train']

    ## x_train = Interpolate._grid_data()
    ## fx = 0
    ## day = 180
    ## y_train = X.nm[day, fx, 0, 3]
    ## est = GaussianProcess(corr='squared_exponential',
    ##                       theta0=4.0)
    ## est.fit(x_train, y_train)
    ## n_lat, n_lon = y_train.shape
    ## m = np.mgrid[0:n_lat:0.5, 0:n_lon:0.5]

    # Grid coordinates from the elevation file; longitudes are shifted
    # from 0-360 to signed degrees.
    grid = Dataset('data/gefs_elevations.nc', 'r')
    lon = np.unique(grid.variables['longitude'][:] - 360)
    lat = np.unique(grid.variables['latitude'][:])

    # take a grid
    for fx_id in range(3):
        # assumes X.nm is indexed [day, feature, ensemble, hour] — TODO confirm
        G = X.nm[0, fx_id, 0, 3]
        # 10x-denser target mesh for the interpolators.
        new_lats = np.linspace(lat.min(), lat.max(), 10 * lat.shape[0])
        new_lons = np.linspace(lon.min(), lon.max(), 10 * lon.shape[0])
        new_lats, new_lons = np.meshgrid(new_lats, new_lons)
        x = Interpolate._grid_data()[:, [1, 0]]  # lat, lon
        y = G
        # Four stacked panels: raw grid, GP, spline, linear.
        fig, ([ax1, ax2, ax3, ax4]) = plt.subplots(4, 1)
        plt.title('Feature %d' % fx_id)
        ax1.imshow(G, interpolation='none')

        est = GaussianProcess(corr='squared_exponential',
                              theta0=(3.0, 7.0))
        est.fit(x, y.ravel())
        G = est.predict(np.c_[new_lats.ravel(),
                              new_lons.ravel()]).reshape(
            (10 * lon.shape[0], 10 * lat.shape[0])).T
        ax2.imshow(G, interpolation='none')

        est = SplineEstimator()
        # NOTE(review): rebinding lon/lat here clobbers the grid-file
        # coordinates for subsequent iterations — confirm intended.
        lon = np.unique(x[:, 1])
        lat = np.unique(x[:, 0])
        est.fit((lon, lat), y)
        G = est.predict(np.c_[new_lons.ravel(),
                              new_lats.ravel()]).reshape(
            (10 * lon.shape[0], 10 * lat.shape[0])).T
        ax3.imshow(G, interpolation='none')

        est = LinearInterpolator()
        est.fit(x, y.ravel())
        G = est.predict(np.c_[new_lats.ravel(),
                              new_lons.ravel()]).reshape(
            (10 * lon.shape[0], 10 * lat.shape[0])).T
        ax4.imshow(G, interpolation='none')

    def nugget_kungfu(day=0, fx_id=0, hour=3, theta0=(0.4, 1.0)):
        # Fit a GP to the ensemble mean using a per-point nugget derived
        # from the ensemble's relative variance, then plot mean/std of the
        # ensemble next to the GP posterior mean/sigma.
        # NOTE(review): uses x, new_lats, new_lons, lon, lat left over
        # from the loop above — only callable after inspect() has run
        # the loop; presumably invoked from the IPython session.
        G = X.nm[day, fx_id, :, hour]
        G_m = G.mean(axis=0)
        G_s = G.std(axis=0)
        from sklearn.gaussian_process.gaussian_process import MACHINE_EPSILON
        # Relative ensemble variance as heteroscedastic noise; replace
        # non-finite entries (zero mean) with a tiny floor.
        nugget = (G_s / G_m) ** 2.0
        mask = ~np.isfinite(nugget)
        nugget[mask] = 10. * MACHINE_EPSILON
        nugget = nugget.ravel()
        est = GaussianProcess(corr='squared_exponential',
                              theta0=theta0,
                              #thetaL=(.5, 1.0), thetaU=(5.0, 10.0),
                              #random_start=100,
                              nugget=nugget,
                              )
        est.fit(x, G_m.ravel())
        print('est.theta_: %s' % str(est.theta_))
        pred, sigma = est.predict(np.c_[new_lats.ravel(), new_lons.ravel()],
                                  eval_MSE=True)
        pred = pred.reshape((10 * lon.shape[0], 10 * lat.shape[0])).T
        sigma = sigma.reshape((10 * lon.shape[0], 10 * lat.shape[0])).T
        fig, ([ax1, ax2, ax3, ax4]) = plt.subplots(4, 1)
        ax1.imshow(G_m, interpolation='none')
        ax1.set_ylabel('Ens mean')
        ax2.imshow(G_s, interpolation='none')
        ax2.set_ylabel('Ens std')
        ax3.imshow(pred, interpolation='none')
        ax3.set_ylabel('GP mean')
        ax4.imshow(sigma, interpolation='none')

    # Interactive session for poking at the figures / nugget_kungfu.
    # NOTE(review): original one-line source is ambiguous about whether
    # this sits inside nugget_kungfu or at function level — placed at
    # function level to match transform_data(); confirm.
    IPython.embed()
def benchmark():
    """Benchmark interpolators on a single day's grid with held-out points.

    Knocks 20 random grid cells out of day 180 / feature 0, fits a
    GaussianProcess on the remaining cells, and prints the MAE on the
    held-out cells.  The grid-search code below the ``sys.exit(0)`` is
    intentionally disabled scratch work (Ridge and randomized GP
    hyper-parameter searches over the same fixed fold).
    """
    from solaris.run import load_data
    from sklearn import grid_search
    from sklearn import metrics

    def rmse(y_true, pred):
        # Root mean squared error; used by the disabled searches below.
        return np.sqrt(metrics.mean_squared_error(y_true, pred))

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    x = Interpolate._grid_data()
    fx = 0
    day = 180
    y = X.nm[day, fx].mean(axis=0)[3]
    #nugget = X.nm[day, fx].std(axis=0)[3]

    # Hold out 20 random interior cells.  ``dtype=bool`` replaces the
    # removed ``np.bool`` alias (gone since NumPy 1.24).
    mask = np.ones_like(y, dtype=bool)
    rs = np.random.RandomState(5)
    test_idx = np.c_[rs.randint(2, 7, 20), rs.randint(3, 13, 20)]
    print(test_idx.shape)
    mask[test_idx[:, 0], test_idx[:, 1]] = False
    mask = mask.ravel()
    y = y.ravel()

    print('_' * 80)
    est = GaussianProcess(corr='squared_exponential',
                          theta0=(10, 10, 10))
    est.fit(x[mask], y[mask])
    pred = est.predict(x[~mask])
    print('MAE: %.2f' % metrics.mean_absolute_error(y[~mask], pred))
    print('_' * 80)
    # Deliberate early exit — everything below is disabled scratch work.
    sys.exit(0)

    #import IPython
    #IPython.embed()

    class KFold(object):
        # Single fixed fold: train on the kept cells, test on the
        # held-out ones.
        n_folds = 1

        def __iter__(self):
            yield mask, ~mask

        def __len__(self):
            return 1

    est = Ridge()
    params = {'normalize': [True, False],
              'alpha': 10.0 ** np.arange(-7, 1, 1)}
    gs = grid_search.GridSearchCV(est, params, cv=KFold(),
                                  scoring='mean_squared_error').fit(x, y)
    print(gs.grid_scores_)
    print(gs.best_score_)

    est = GaussianProcess()
    params = {'corr': ['squared_exponential'],
              'theta0': MultivariateNormal(),
              }
    ## params = {'corr': ['squared_exponential'],
    ##           #'regr': ['constant', 'linear', 'quadratic'],
    ##           'theta0': np.arange(4, 11),
    ##           }
    # gs = grid_search.GridSearchCV(est, params, cv=KFold(),
    #                               loss_func=rmse).fit(x, y)
    gs = grid_search.RandomizedSearchCV(est, params, cv=KFold(),
                                        scoring='mean_squared_error',
                                        n_iter=100).fit(x, y)
    print(gs.grid_scores_)
    print(gs.best_params_)
    print(gs.best_score_)