Example #1
0
def transform_data():
    """Interpolate the train and test grids with kriging and dump the result.

    Loads the pickled data dict from ``data/data.pkl``, fits a
    PertubatedKriging transformer on each of the ``X_train`` / ``X_test``
    blocks, writes the transformed arrays back into the dict and dumps it
    to ``data/interp10_data.pkl``.  Drops into IPython at the end for
    interactive inspection.
    """
    from solaris.run import load_data
    from sklearn.externals import joblib

    data = load_data('data/data.pkl')

    # NOTE(review): the class name is spelled "Pertubated" upstream — kept.
    # (fixed local-variable typo: was "kringing")
    kriging = PertubatedKriging()
    #kriging = PertubatedSpline()

    data['description'] = '%r: %r' % (kriging, kriging.est)
    # was a Python-2 print statement; the rest of the file uses print()
    print(data['description'])

    print('_' * 80)
    print(kriging)
    print()

    for key in ['train', 'test']:
        print('_' * 80)
        print('transforming %s' % key)
        print()
        X = data['X_%s' % key]

        X = kriging.fit_transform(X)
        data['X_%s' % key] = X

    print()
    print('dumping data')
    joblib.dump(data, 'data/interp10_data.pkl')
    IPython.embed()
Example #2
0
def _transform_data():
    """Transform and scale data for DBN training; dump to ``data/dbndata.pkl``.

    Splits the training data into a past/future train/test split (no
    shuffle), applies the LocalModel transform to both halves, then
    standardizes X and y with scalers fit on the training half only.
    """
    from solaris.run import load_data
    from solaris.models import LocalModel

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    # no shuffle - past-future split
    # BUG FIX: X.shape[0] * 0.5 is a float; slice indices must be ints
    # (TypeError on Python 3 / modern NumPy).  Floor-divide instead.
    offset = X.shape[0] // 2
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    print('_' * 80)
    print('transforming data')
    print()  # was a bare Python-2 print statement
    tf = LocalModel(None)
    print('transforming train')
    X_train, y_train = tf.transform(X_train, y_train)
    print('transforming test')
    X_test, y_test = tf.transform(X_test, y_test)
    print('fin')

    # Scalers are fit on the training half only to avoid test leakage.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    scaler = StandardScaler()
    y_train = scaler.fit_transform(y_train)
    y_test = scaler.transform(y_test)

    data = {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}
    joblib.dump(data, 'data/dbndata.pkl')
def _transform_data():
    """Transform data with LocalModel and dump to ``data/lcdata.pkl``.

    Splits the training data into a past/future train/test split (no
    shuffle) and applies the LocalModel transform to both halves.  Unlike
    the DBN variant, no standardization is applied.
    """
    from solaris.run import load_data
    from solaris.models import LocalModel
    from solaris.models import Baseline

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    # no shuffle - past-future split
    # BUG FIX: X.shape[0] * 0.5 is a float; slice indices must be ints
    # (TypeError on Python 3 / modern NumPy).  Floor-divide instead.
    offset = X.shape[0] // 2
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    print('_' * 80)
    print('transforming data')
    print()  # was a bare Python-2 print statement
    tf = LocalModel(None)
    #tf = Baseline()
    print('transforming train')
    X_train, y_train = tf.transform(X_train, y_train)
    print('transforming test')
    X_test, y_test = tf.transform(X_test, y_test)
    print('fin')

    data = {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}
    joblib.dump(data, 'data/lcdata.pkl')
Example #4
0
def _transform_data():
    """Transform data with LocalModel and dump to ``data/lcdata.pkl``.

    Same pipeline as the other LocalModel variant: past/future split (no
    shuffle), LocalModel transform on both halves, dump the four arrays.
    """
    from solaris.run import load_data
    from solaris.models import LocalModel
    from solaris.models import Baseline

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    # no shuffle - past-future split
    # BUG FIX: X.shape[0] * 0.5 is a float; slice indices must be ints
    # (TypeError on Python 3 / modern NumPy).  Floor-divide instead.
    offset = X.shape[0] // 2
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    print('_' * 80)
    print('transforming data')
    print()  # was a bare Python-2 print statement
    tf = LocalModel(None)
    #tf = Baseline()
    print('transforming train')
    X_train, y_train = tf.transform(X_train, y_train)
    print('transforming test')
    X_test, y_test = tf.transform(X_test, y_test)
    print('fin')

    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test
    }
    joblib.dump(data, 'data/lcdata.pkl')
Example #5
0
def inspect():
    """Visually compare interpolation schemes on GEFS grid features.

    For the first three features of day 0, plots the raw grid next to
    Gaussian-process, spline and linear interpolations onto a 10x-denser
    grid.  Exploratory code: drops into IPython at the end.
    """
    from netCDF4 import Dataset
    from matplotlib import pyplot as plt
    from solaris.run import load_data
    data = load_data('data/data.pkl')
    X = data['X_train']
    y = data['y_train']

    ## x_train = Interpolate._grid_data()

    ## fx = 0
    ## day = 180
    ## y_train = X.nm[day, fx, 0, 3]
    ## est = GaussianProcess(corr='squared_exponential',
    ##                       theta0=4.0)
    ## est.fit(x_train, y_train)

    ## n_lat, n_lon = y_train.shape
    ## m = np.mgrid[0:n_lat:0.5, 0:n_lon:0.5]

    # Grid coordinates from the elevation file; longitudes are shifted by
    # -360 (0..360 -> -360..0 convention) before deduplication.
    grid = Dataset('data/gefs_elevations.nc', 'r')
    lon = np.unique(grid.variables['longitude'][:] - 360)
    lat = np.unique(grid.variables['latitude'][:])

    # take a grid
    for fx_id in range(3):
        # assumes X.nm is indexed [day, feature, ensemble, hour] — TODO confirm
        G = X.nm[0, fx_id, 0, 3]

        # Target grid: 10x denser in both latitude and longitude.
        new_lats = np.linspace(lat.min(), lat.max(), 10 * lat.shape[0])
        new_lons = np.linspace(lon.min(), lon.max(), 10 * lon.shape[0])
        new_lats, new_lons = np.meshgrid(new_lats, new_lons)

        x = Interpolate._grid_data()[:, [1, 0]]  # lat, lon
        y = G
        # Four stacked panels: raw grid, GP, spline, linear interpolation.
        fig, ([ax1, ax2, ax3, ax4]) = plt.subplots(4, 1)
        plt.title('Feature %d' % fx_id)
        ax1.imshow(G, interpolation='none')

        # Anisotropic squared-exponential GP on the scattered grid points.
        est = GaussianProcess(corr='squared_exponential', theta0=(3.0, 7.0))
        est.fit(x, y.ravel())
        G = est.predict(np.c_[new_lats.ravel(),
                              new_lons.ravel()]).reshape((10 * lon.shape[0],
                                                          10 * lat.shape[0])).T
        ax2.imshow(G, interpolation='none')
        est = SplineEstimator()
        # NOTE(review): lon/lat are re-bound here from the interpolation
        # coordinates; later iterations and the reshape dimensions above
        # then use these new values — confirm this is intentional.
        lon = np.unique(x[:, 1])
        lat = np.unique(x[:, 0])
        est.fit((lon, lat), y)
        G = est.predict(np.c_[new_lons.ravel(),
                              new_lats.ravel()]).reshape((10 * lon.shape[0],
                                                          10 * lat.shape[0])).T
        ax3.imshow(G, interpolation='none')

        est = LinearInterpolator()
        est.fit(x, y.ravel())
        G = est.predict(np.c_[new_lats.ravel(),
                              new_lons.ravel()]).reshape((10 * lon.shape[0],
                                                          10 * lat.shape[0])).T
        ax4.imshow(G, interpolation='none')

    def nugget_kungfu(day=0, fx_id=0, hour=3, theta0=(0.4, 1.0)):
        """Fit a GP with a per-point nugget from the ensemble spread.

        Uses the squared coefficient of variation across ensemble members
        as the nugget and plots ensemble mean/std against GP mean/sigma.
        Intended to be called from the IPython session below.
        """
        # All ensemble members for one day/feature/hour.
        G = X.nm[day, fx_id, :, hour]

        G_m = G.mean(axis=0)
        G_s = G.std(axis=0)

        from sklearn.gaussian_process.gaussian_process import MACHINE_EPSILON
        # Nugget = squared coefficient of variation; non-finite entries
        # (e.g. zero mean) are clamped to a tiny positive value.
        nugget = (G_s / G_m) ** 2.0
        mask = ~np.isfinite(nugget)
        nugget[mask] = 10. * MACHINE_EPSILON
        nugget = nugget.ravel()
        est = GaussianProcess(corr='squared_exponential',
                              theta0=theta0,
                              #thetaL=(.5, 1.0), thetaU=(5.0, 10.0),
                              #random_start=100,
                              nugget=nugget,
                              )
        # NOTE(review): relies on `x`, `new_lats`, `new_lons`, `lon`, `lat`
        # keeping their values from the last loop iteration above.
        est.fit(x, G_m.ravel())
        print('est.theta_: %s' % str(est.theta_))

        pred, sigma = est.predict(np.c_[new_lats.ravel(), new_lons.ravel()],
                                  eval_MSE=True)
        pred = pred.reshape((10 * lon.shape[0], 10 * lat.shape[0])).T
        sigma = sigma.reshape((10 * lon.shape[0], 10 * lat.shape[0])).T

        fig, ([ax1, ax2, ax3, ax4]) = plt.subplots(4, 1)
        ax1.imshow(G_m, interpolation='none')
        ax1.set_ylabel('Ens mean')
        ax2.imshow(G_s, interpolation='none')
        ax2.set_ylabel('Ens std')
        ax3.imshow(pred, interpolation='none')
        ax3.set_ylabel('GP mean')
        ax4.imshow(sigma, interpolation='none')
        ax4.set_ylabel('GP sigma')


    IPython.embed()
Example #6
0
def benchmark():
    """Benchmark a GP interpolator on one feature grid with held-out cells.

    Masks 20 random grid cells out of one day's ensemble-mean feature
    grid, fits a GaussianProcess on the remaining cells and prints the
    MAE on the held-out cells, then exits.  The grid-search code below
    ``sys.exit(0)`` is intentionally dead and kept for experimentation.
    """
    from solaris.run import load_data
    from sklearn import grid_search
    from sklearn import metrics

    def rmse(y_true, pred):
        """Root mean squared error, for use as a loss function."""
        return np.sqrt(metrics.mean_squared_error(y_true, pred))

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    x = Interpolate._grid_data()

    fx = 0
    day = 180
    # Ensemble mean of feature `fx` at hour index 3 on `day`.
    y = X.nm[day, fx].mean(axis=0)[3]
    #nugget = X.nm[day, fx].std(axis=0)[3]
    # BUG FIX: np.bool was removed in NumPy 1.24 — use the builtin bool.
    mask = np.ones_like(y, dtype=bool)
    rs = np.random.RandomState(5)
    # 20 random (row, col) cells held out as the test set.
    test_idx = np.c_[rs.randint(2, 7, 20),
                     rs.randint(3, 13, 20)]
    print(test_idx.shape)  # was a Python-2 print statement
    mask[test_idx[:, 0], test_idx[:, 1]] = False
    mask = mask.ravel()
    y = y.ravel()

    print('_' * 80)
    est = GaussianProcess(corr='squared_exponential', theta0=(10, 10, 10))
    est.fit(x[mask], y[mask])
    pred = est.predict(x[~mask])
    print('MAE: %.2f' % metrics.mean_absolute_error(y[~mask], pred))

    print('_' * 80)

    sys.exit(0)

    # -- everything below is dead code (after sys.exit), kept on purpose --

    #import IPython
    #IPython.embed()

    class KFold(object):
        """Single 'fold' that yields the fixed train/test mask above."""

        n_folds = 1

        def __iter__(self):
            yield mask, ~mask

        def __len__(self):
            return 1

    est = Ridge()
    params = {'normalize': [True, False],
              'alpha': 10.0 ** np.arange(-7, 1, 1)}
    gs = grid_search.GridSearchCV(est, params, cv=KFold(),
                                  scoring='mean_squared_error').fit(x, y)
    print(gs.grid_scores_)
    print(gs.best_score_)

    est = GaussianProcess()
    params = {'corr': ['squared_exponential'],
              'theta0': MultivariateNormal(),
              }

    ## params = {'corr': ['squared_exponential'],
    ##           #'regr': ['constant', 'linear', 'quadratic'],
    ##           'theta0': np.arange(4, 11),
    ##           }

    # gs = grid_search.GridSearchCV(est, params, cv=KFold(),
    #                               loss_func=rmse).fit(x, y)
    gs = grid_search.RandomizedSearchCV(est, params, cv=KFold(),
                                        scoring='mean_squared_error',
                                        n_iter=100).fit(x, y)
    print(gs.grid_scores_)
    print(gs.best_params_)
    print(gs.best_score_)