Example #1
# ker_expsine = ConstantKernel(1.0, (1e-4, 1e4)) * ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1))

kernel_list = [
    # ConstantKernel(1.0, (1e-4, 1e4)) * ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)),
    ConstantKernel(1.0, (1e-4, 1e4)) *
    RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)),
    ConstantKernel(1.0, (1e-4, 1e4)) *
    RationalQuadratic(length_scale=1.0, alpha=0.1),
    ConstantKernel(1.0, (1e-4, 1e4)) *
    Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=1.5)
]

param_grid = {
    "kernel": kernel_list,
    "optimizer": ["fmin_l_bfgs_b"],
    "alpha": [1e-9, 1e-8]
}

gp = GaussianProcessRegressor()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=50)

for k in kernel_list:
    gpk = GaussianProcessRegressor(kernel=k, alpha=1e-8)
    gpk.fit(X_train, y_train)
    y_pred, sigma = gpk.predict(X_test, return_std=True)
    print(np.average(np.apply_along_axis(np.linalg.norm, 1, y_test - y_pred)))
    print(1.96 * np.average(sigma))
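The param_grid and the bare gp above are defined but never used by the loop; presumably they were meant to drive a grid search over kernels and alpha. A minimal sketch of that step, assuming GridSearchCV is imported from sklearn.model_selection:

from sklearn.model_selection import GridSearchCV

# hedged sketch, not part of the original example
search = GridSearchCV(GaussianProcessRegressor(), param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)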
Example #2
    def fit(self, tr_chunk, ts_chunk):

        model_Y = tr_chunk['y'].values
        model_X = tr_chunk.drop(['id', 'y'], axis=1)

        test_Y = ts_chunk['y'].values
        test_X = ts_chunk.drop(['id', 'y'], axis=1)
        #%
        '''
         2. replace NaN with median value
        '''
        X = model_X
        # SimpleImputer replaces the removed sklearn.preprocessing.Imputer
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        self.imputer.fit(X)
        model_X_imputed = self.imputer.transform(X)

        test_X_imputed = self.imputer.transform(test_X)
        #%
        '''
         3. normalization
        '''

        self.normalization = preprocessing.StandardScaler().fit(
            model_X_imputed)
        model_X_norm = self.normalization.transform(model_X_imputed)

        test_X_norm = self.normalization.transform(test_X_imputed)
        '''
        Apply regression
        '''

        X = model_X_norm
        Y = model_Y

        skf = KFold(n_splits=self.kf_k, shuffle=True, random_state=self.seed)
        kf_i = 0
        self.models = []
        for tr_idx, val_idx in skf.split(X):
            print('KF', kf_i, end=',')
            tr_X, tr_Y = X[tr_idx, :], Y[tr_idx]
            val_X, val_Y = X[val_idx, :], Y[val_idx]

            model = GaussianProcessRegressor(
                alpha=self.model_config['alpha'],
                random_state=self.model_config['random_state'])
            model.fit(tr_X, tr_Y)
            tr_Y_pred = model.predict(tr_X)
            tr_r = utils.cal_r(tr_Y, tr_Y_pred)
            print('tr:', tr_r, end=',')
            val_Y_pred = model.predict(val_X)
            val_r = utils.cal_r(val_Y, val_Y_pred)
            print('val:', val_r, end=',')

            test_Y_pred = model.predict(test_X_norm)
            test_r = utils.cal_r(test_Y, test_Y_pred)
            print('test:', test_r, end='')

            # discard kf model whose val R is too low
            if val_r > -1:
                self.models.append(model)
                print(' (SAVED)')
            else:
                print(' (DISCARD)')
            kf_i += 1

        tr_Y_sum = np.zeros(Y.shape[0])
        test_Y_sum = np.zeros(test_Y.shape[0])
        for model in self.models:
            tr_Y_sum += model.predict(X)
            test_Y_sum += model.predict(test_X_norm)

        tr_Y_pred = tr_Y_sum / len(self.models)
        test_Y_pred = test_Y_sum / len(self.models)

        tr_r = utils.cal_r(Y, tr_Y_pred)
        print('AVERAGE tr:', tr_r, end=',')
        test_r = utils.cal_r(test_Y, test_Y_pred)
        print('test:', test_r)

        return self
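The class above only shows fit; its predict counterpart is not part of this excerpt. Under the same attribute names (self.imputer, self.normalization, self.models) it would presumably apply the identical preprocessing and average the retained fold models, roughly:

    def predict(self, chunk):
        # hedged sketch: mirror the preprocessing from fit(), then average
        # the predictions of the k-fold models that were kept
        X = chunk.drop(['id', 'y'], axis=1)
        X = self.normalization.transform(self.imputer.transform(X))
        return np.mean([m.predict(X) for m in self.models], axis=0)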
Example #3
def fitGaussianProc(patDXdTdata, patAvgXdata, dXdTdata, avgXdata, diag,
                    lengthScaleFactors, plotTrajParams):
    '''
  Fits a GP on the change data (x, dx/dt)

  Parameters
  ----------
  patDXdTdata
  patAvgXdata
  dXdTdata
  avgXdata
  diag
  lengthScaleFactors
  plotTrajParams

  Returns
  -------
  x_pred, dXdT_pred, sigma_pred, gpList, posteriorSamples

  '''

    # Mesh the input space for evaluations of the real function, the prediction and
    # its MSE
    assert (CTL == 1)
    nrBiomk = patDXdTdata.shape[1]
    #minX = np.amin(patAvgXdata, axis=0)
    #maxX = np.amax(patAvgXdata, axis=0)
    minX = np.nanmin(avgXdata, axis=0)
    maxX = np.nanmax(avgXdata, axis=0)
    assert not any(np.isnan(minX))
    assert not any(np.isnan(maxX))

    intervalSize = maxX - minX
    minX -= intervalSize / 0.5
    maxX += intervalSize / 0.5

    #print minX.shape, maxX.shape
    nrPointsToEval = 500
    x_pred = np.zeros((nrPointsToEval, nrBiomk), float)
    dXdT_pred = np.zeros((nrPointsToEval, nrBiomk), float)
    sigma_pred = np.zeros((nrPointsToEval, nrBiomk), float)
    nrSamples = 20
    posteriorSamples = np.zeros((nrSamples, nrPointsToEval, nrBiomk), float)

    # print(avgXdata.shape, diag.shape)
    # print(avgXdata[diag == CTL,:].shape)
    ctlXMean = np.nanmean(avgXdata[diag == CTL, :], axis=0)
    ctlXStd = np.nanstd(avgXdata[diag == CTL, :], axis=0)

    ctldXdTMean = np.nanmean(dXdTdata[diag == CTL, :], axis=0)
    ctldXdTStd = np.nanstd(dXdTdata[diag == CTL, :], axis=0)

    allXMean = np.nanmean(avgXdata, axis=0)
    allXStd = np.nanstd(avgXdata, axis=0)

    alldXdTMean = np.nanmean(dXdTdata, axis=0)
    alldXdTStd = np.nanstd(dXdTdata, axis=0)

    patXMean = np.nanmean(patAvgXdata, axis=0)
    patXStd = np.nanstd(patAvgXdata, axis=0)

    patdXdTMean = np.nanmean(patDXdTdata, axis=0)
    patdXdTStd = np.nanstd(patDXdTdata, axis=0)

    gpList = []

    for b in range(nrBiomk):
        points = np.linspace(minX[b], maxX[b], nrPointsToEval)
        #print points.shape

        X = patAvgXdata[:, b]
        Y = patDXdTdata[:, b]
        notNanInd = np.logical_not(np.isnan(X))
        X = X[notNanInd]
        Y = Y[notNanInd]

        X = X.reshape(-1, 1)
        Y = Y.reshape(-1, 1)

        # X = (X - allXMean[b]) / allXStd[b] # standardizing the inputs and outputs
        # Y = (Y - alldXdTMean[b]) / alldXdTStd[b]
        # minX[b] = (minX[b] - allXMean[b]) / allXStd[b]
        # maxX[b] = (maxX[b] - allXMean[b]) / allXStd[b]

        X = (X -
             patXMean[b]) / patXStd[b]  # standardizing the inputs and outputs
        # Y = (Y - patdXdTMean[b]) / patdXdTStd[b]
        Y = Y / patdXdTStd[b]
        minX[b] = (minX[b] - patXMean[b]) / patXStd[b]
        maxX[b] = (maxX[b] - patXMean[b]) / patXStd[b]

        #print 'Xshape, Yshape', X.shape, Y.shape
        lower, upper = np.abs(1 / np.max(X)), np.abs(1 / (np.min(X) + 1e-6))
        if lower > upper:
            lower, upper = upper, lower
        mid = 1 / np.abs(np.mean(X))

        # print("X", X[:20],'Y', Y[:20])
        # print(minX, maxX)

        #lengthScale = (np.max(X)-np.min(X))
        lengthScale = lengthScaleFactors[b] * (np.max(X) - np.min(X)) / 2
        # use a variance here, since it is placed as-is on the diagonal of
        # the kernel, which is a covariance matrix
        estimNoise = np.var(Y) / 2
        #estimAlpha = np.ravel((np.std(Y))**2)
        #estimAlpha = np.var(Y)/2
        estimAlpha = np.std(Y) * 2
        boundsFactor = 2.0
        #estimAlpha = 0
        #need to specify bounds as the lengthScale is optimised in the fit
        rbfKernel = ConstantKernel(1.0, constant_value_bounds="fixed") * RBF(
            length_scale=lengthScale,
            length_scale_bounds=(float(lengthScale) / boundsFactor,
                                 1 * lengthScale))
        whiteKernel = ConstantKernel(
            1.0, constant_value_bounds="fixed") * WhiteKernel(
                noise_level=estimNoise,
                noise_level_bounds=(float(estimNoise) / boundsFactor,
                                    boundsFactor * estimNoise))
        #rbfKernel = 1 * RBF(length_scale=lengthScale)
        #whiteKernel = 1 * WhiteKernel(noise_level=estimNoise)
        kernel = rbfKernel + whiteKernel
        #kernel = 1.0 * RBF(length_scale=lengthScale)
        print('\nbiomk %d  lengthScale %f  noise %f alpha %f' %
              (b, lengthScale, estimNoise, estimAlpha))
        #print estimAlpha.shape
        normalizeYflag = False
        #normalizeYflag = True

        gp = GaussianProcessRegressor(kernel=rbfKernel,
                                      alpha=estimAlpha,
                                      optimizer='fmin_l_bfgs_b',
                                      n_restarts_optimizer=100,
                                      normalize_y=normalizeYflag)

        #gp = GaussianProcessRegressor(kernel=rbfKernel, alpha=estimAlpha, optimizer=None, n_restarts_optimizer=100, normalize_y=True)

        assert not any(np.isnan(X))
        assert not any(np.isnan(Y))
        # Fit to data using Maximum Likelihood Estimation of the parameters
        gp.fit(X, Y)
        print("optimised kernel", gp.kernel_
              )  #, "  theta", gp.kernel_.theta, " bounds", gp.kernel_.bounds)

        #gpNonOpt = GaussianProcessRegressor(kernel=rbfKernel, alpha=estimAlpha, optimizer=None, normalize_y=False)
        #gpNonOpt.fit(X,Y)
        #print("non-optimised kernel", gpNonOpt.kernel_)#,  "  theta", gpNonOpt.kernel_.theta, " bounds", gpNonOpt.kernel_.bounds)

        #gp = gpNonOpt

        # Make the prediction on the meshed x-axis (ask for Cov matrix as well)
        x_pred[:, b] = np.linspace(minX[b], maxX[b], nrPointsToEval)
        assert not any(np.isnan(x_pred[:, b]))
        dXdT_predCurr, cov_matrix = gp.predict(x_pred[:, b].reshape(-1, 1),
                                               return_cov=True)

        # make sure dXdT is not too low, otherwise truncate the [minX, maxX] interval
        dXdTthresh = 1e-10
        tooLowMask = np.abs(np.ravel(dXdT_predCurr)) < dXdTthresh
        print(tooLowMask.shape)
        if np.sum(tooLowMask) > nrPointsToEval / 10:
            print(
                "Warning: dXdT is too low, will restrict the [minX, maxX] interval"
            )
            goodIndicesMask = np.logical_not(tooLowMask)
            #print(x_pred.shape, goodIndicesMask.shape)
            #print(x_pred[goodIndicesMask, b])
            minX[b] = min(x_pred[goodIndicesMask, b])
            maxX[b] = max(x_pred[goodIndicesMask, b])
            x_pred[:, b] = np.linspace(minX[b], maxX[b], nrPointsToEval)
            dXdT_predCurr, cov_matrix = gp.predict(x_pred[:, b].reshape(-1, 1),
                                                   return_cov=True)

        MSE = np.diagonal(cov_matrix)

        dXdT_pred[:, b] = np.ravel(dXdT_predCurr)
        sigma_pred[:, b] = np.ravel(np.sqrt(MSE))
        samples = gp.sample_y(x_pred[:, b].reshape(-1, 1),
                              n_samples=nrSamples,
                              random_state=0)
        posteriorSamples[:, :, b] = np.squeeze(samples).T

        # renormalize the Xs and Ys
        # x_pred[:,b] = x_pred[:,b] * allXStd[b] + allXMean[b]
        # dXdT_pred[:,b] = dXdT_pred[:,b] * alldXdTStd[b] + alldXdTMean[b]
        # sigma_pred[:,b] = sigma_pred[:,b] * alldXdTStd[b]
        # posteriorSamples[:,:,b] = posteriorSamples[:,:,b]*alldXdTStd[b] + alldXdTMean[b]

        # renormalize the Xs and Ys
        # x_pred[:, b] = x_pred[:, b] * patXStd[b] + patXMean[b]
        # dXdT_pred[:, b] = dXdT_pred[:, b] * patdXdTStd[b] + patdXdTMean[b]
        # sigma_pred[:, b] = sigma_pred[:, b] * patdXdTStd[b]
        # posteriorSamples[:, :, b] = posteriorSamples[:, :, b] * patdXdTStd[b] + patdXdTMean[b]

        x_pred[:, b] = x_pred[:, b] * patXStd[b] + patXMean[b]
        dXdT_pred[:, b] = dXdT_pred[:, b] * patdXdTStd[b]
        sigma_pred[:, b] = sigma_pred[:, b] * patdXdTStd[b]
        posteriorSamples[:, :, b] = posteriorSamples[:, :, b] * patdXdTStd[b]

        # diagCol = plotTrajParams['diagColors']
        # fig = pl.figure(1)
        # nrDiags = np.unique(diag).shape[0]
        # for diagNr in range(1, nrDiags + 1):
        #   print(avgXdata.shape, diag.shape, dXdTdata.shape, diagCol, diagNr)
        #   pl.scatter(avgXdata[diag == diagNr, b], dXdTdata[diag == diagNr, b], color = diagCol[diagNr - 1])
        #
        # modelCol = 'r' # red
        # pl.plot(x_pred[:, b], dXdT_pred[:, b], '%s-' % modelCol, label = u'Prediction')
        # pl.fill(np.concatenate([x_pred[:, b], x_pred[::-1, b]]), np.concatenate(
        #   [dXdT_pred[:, b] - 1.9600 * sigma_pred[:, b], (dXdT_pred[:, b] + 1.9600 * sigma_pred[:, b])[::-1]]), alpha = .5,
        #         fc = modelCol, ec = 'None', label = '95% confidence interval')
        # for s in range(nrSamples):
        #   pl.plot(x_pred[:, b], posteriorSamples[s, :, b])
        # fig.show()

        params = gp.get_params(deep=True)
        #print 'kernel', gp.kernel
        #print 'params', params

        gpList.append(gp)


    return x_pred, dXdT_pred, sigma_pred, gpList, posteriorSamples
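A side note on the alpha argument used above: scikit-learn adds alpha to the diagonal of the training covariance matrix, so it plays the same role as a WhiteKernel term with a fixed noise level (the difference being that a WhiteKernel's noise level can also be optimised during fit). A minimal illustration, independent of the data in this example:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.RandomState(0)
Xd = rng.uniform(-3, 3, (30, 1))
yd = np.sin(Xd).ravel() + 0.1 * rng.randn(30)

# alpha added to the diagonal ...
gp_a = GaussianProcessRegressor(kernel=RBF(1.0), alpha=0.01,
                                optimizer=None).fit(Xd, yd)
# ... matches a fixed-noise WhiteKernel with alpha switched off
gp_w = GaussianProcessRegressor(
    kernel=RBF(1.0) + WhiteKernel(0.01, noise_level_bounds="fixed"),
    alpha=0.0, optimizer=None).fit(Xd, yd)

print(np.allclose(gp_a.predict(Xd), gp_w.predict(Xd)))  # True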
Example #4
fq["Day"] = fq["Dates"].dt.day

def setting_train_test(mid,end,column):
    X = (fq["Year"],fq["Mo"],fq["Day"])
    X = np.array(X).T
    Y = fq[column]
    x_whole = X[:end]
    y_whole = Y[:end]
    x_train = X[:mid]
    x_test = X[mid:end]
    y_train = Y[:mid]
    y_test = Y[mid:end]
    return mid,end,x_whole,x_train,x_test,y_whole,y_train,y_test

kernel = C(1, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))  # defined but unused: the GP below gets a plain RBF
gp = GaussianProcessRegressor(kernel=RBF(length_scale=20), alpha=.1, normalize_y=True, n_restarts_optimizer=15)

mid,end,x_whole,x_train,x_test,y_whole,y_train,y_test = setting_train_test(754,858,column = "bam")

gp.fit(x_train, y_train)
y_pred, sigma = gp.predict(x_whole, return_std=True)

plt.plot(fq.index[:end], y_pred, color="g")  # y_pred only covers rows [:end]
# plt.plot(test_bam["ds"],test_bam["bam"])
# plt.plot(train_bam["ds"],train_bam["y"])
plt.show()

print("The Root Mean Squared Error is: "+str(np.sqrt(metrics.mean_squared_error(y_whole,y_pred))))
print("The Mean Squared Error is: "+str(metrics.mean_squared_error(y_whole,y_pred)))
print("The MAPE is: "+str(mean_absolute_percentage_error(y_whole,y_pred)))
print("The Explained Variance Score is: "+str(metrics.explained_variance_score(y_whole,y_pred)))
Example #5
        y_grid.append(i - abs(f_min))
        j = j + 1
    i += 1

X_grid = np.array(x_grid).reshape(-1, 1)
X_grid_shape = X_grid.shape
x_zero = X_grid_shape[0]
Y_grid = np.array(y_grid).reshape(-1, 1)
Y_grid_shape = Y_grid.shape

X_test = np.concatenate([X_grid, Y_grid], axis=1).reshape(-1, 2)

res = ker(X_test)

noise = 1
gpr = GaussianProcessRegressor(kernel=ker, alpha=noise**2)


def gaussian_regression(x_train, y_train):
    gpr.fit(x_train, y_train)
    gpr.get_params()

    mu, sigma = gpr.predict(X_test, return_std=True)

    Z_var = sigma.reshape(dimx, dimy)
    Z_mean = mu.reshape(dimx, dimy)

    return sigma, mu, Z_var, Z_mean


def plot_gaussian(x_a, y_a, n, Z_var, Z_mean, Benchmark_plot):
Example #6
################# Implement gaussian process regression
# adapted from sklearn example code

# Generate sample data
X = t[:, np.newaxis]
y = X_percent

# randomly choose data points as exp data points
x = np.random.randint(0, high=len(X), size=20, dtype='l')
X_sample = X[x]
y_sample = y[x]

######## New GP
# Instantiate a Gaussian Process model
kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

# Fit to data using Maximum Likelihood Estimation of the parameters
gp.fit(X_sample, y_sample)

# Make the prediction on the meshed x-axis (ask for MSE as well)
y_pred, sigma = gp.predict(X, return_std=True)

# Plot the function, the prediction and the 95% confidence interval based on
# the MSE
plt.figure()
plt.plot(X, y, '.', markersize=3, label=u'Observations')
plt.plot(X, y_pred, '-', label=u'Prediction')
for i in range(0, y.shape[1]):
    plt.fill(np.concatenate([X, X[::-1]]),
             np.concatenate([
Example #7
    def __init__(self, f, pbounds, verbose=1):
        """
        :param f:
            Function to be maximized.

        :param pbounds:
            Dictionary with parameters names as keys and a tuple with minimum
            and maximum values.

        :param verbose:
            Whether or not to print progress.

        """
        # Store the original dictionary
        self.pbounds = pbounds

        # Get the name of the parameters
        self.keys = list(pbounds.keys())

        # Find number of parameters
        self.dim = len(pbounds)

        # Create an array with parameters bounds
        self.bounds = []
        for key in self.pbounds.keys():
            self.bounds.append(self.pbounds[key])
        self.bounds = np.asarray(self.bounds)

        # Some function to be optimized
        self.f = f

        # Initialization flag
        self.initialized = False

        # Initialization lists --- stores starting points before process begins
        self.init_points = []
        self.x_init = []
        self.y_init = []

        # Numpy array place holders
        self.X = None
        self.Y = None

        # Counter of iterations
        self.i = 0

        # Internal GP regressor
        self.gp = GaussianProcessRegressor(
            kernel=Matern(nu=2.5),
            n_restarts_optimizer=25,
        )

        # Utility Function placeholder
        self.util = None

        # PrintLog object
        self.plog = PrintLog(self.keys)

        # Output dictionary
        self.res = {}
        # Output dictionary
        self.res['max'] = {'max_val': None,
                           'max_params': None}
        self.res['all'] = {'values': [], 'params': []}

        # Verbose
        self.verbose = verbose
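The Matern(nu=2.5) regressor stored in self.gp is the optimizer's surrogate model: on each iteration it is refit to the points evaluated so far, and its posterior mean and standard deviation feed the acquisition function that picks the next query. A hypothetical sketch of that step (the helper below is illustrative, not a method of this class):

import numpy as np

def suggest_next(gp, X_obs, y_obs, candidates, kappa=2.0):
    # upper-confidence-bound acquisition: refit the surrogate, then pick
    # the candidate with the largest mean + kappa * std
    gp.fit(X_obs, y_obs)
    mu, sigma = gp.predict(candidates, return_std=True)
    return candidates[np.argmax(mu + kappa * sigma)]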
Example #8
print(house_y_regr)

accuracy = regr.score(X_val, y_val)  # score(X, y) takes features first, then targets
print(accuracy)

house_y_pred = regr.predict(X_test)


r2(y_val, house_y_regr)
mse(y_val, house_y_regr)
mae(y_val, house_y_regr)

###Gaussian Process####
kernel = DotProduct() + WhiteKernel()
reg_GP = GaussianProcessRegressor(kernel = kernel)

reg_GP.fit(X_train, y_train)

house_y_gp_regr = reg_GP.predict(X_val)

r2(y_val, house_y_gp_regr)
mse(y_val, house_y_gp_regr)
mae(y_val, house_y_gp_regr)

###Bayesian Regression###

reg_BR = linear_model.BayesianRidge()
reg_BR.fit(X_train, y_train)

house_y_br_regr = reg_BR.predict(X_val)
Example #9
ax.scatter(dev_inp[:,0], dev_inp[:,1], dev_oup[:,i],label="training set")
ax.scatter(test_inp[:,0], test_inp[:,1], test_oup[:,i],label="test set")
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
ax.view_init(30, 210)
plt.legend()
plt.show()

# GPR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
#kernel = ConstantKernel(1.0) * RBF(length_scale=1.0)
kernel = 1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0),
                        nu=1.5)
gpr = GaussianProcessRegressor(kernel=kernel).fit(train_inp, train_oup)
print(gpr.score(train_inp, train_oup))
predicted_gp1 = gpr.predict(train_inp) 
predicted_gp2 = gpr.predict(dev_inp) 
predicted_gp3 = gpr.predict(test_inp) 
#plot 1
plt.figure(figsize=(10,7))
s = 3
for i in range(t_d_oup.shape[1]):
    plt.subplot(2,3,i+1)
    plt.scatter(train_oup[:,i],predicted_gp1[:,i],s=s,label="training set")
    plt.scatter(  dev_oup[:,i],predicted_gp2[:,i],s=s,label="development set")
    plt.scatter( test_oup[:,i],predicted_gp3[:,i],s=s,label="test set")
    train_error = ((predicted_gp1[:,i] - train_oup[:,i])**2).mean()
    dev_error = ((predicted_gp2[:,i] - dev_oup[:,i])**2).mean()
    test_error = ((predicted_gp3[:,i] - test_oup[:,i])**2).mean()
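A note on the fit above: GaussianProcessRegressor accepts a 2-D target array directly, fitting all output columns with one shared set of kernel hyperparameters, so predict returns an array of shape (n_samples, n_outputs) and predicted_gp1[:, i] can be sliced per output:

# the predictions line up column-for-column with the targets
print(train_oup.shape, predicted_gp1.shape)  # (n_samples, n_outputs) for both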
Example #10
def make_plot(days_ago, dates, mag):
    print('Making plot...')
    time_span = np.max(dates) - np.min(dates)
    min_plot = -0.5
    max_plot = 1.5
    x_days = -120

    # Make daily bins
    nights = np.arange(0, 120, 1)
    daily_mags = []
    errors = []
    for night in nights:
        selector = np.where((days_ago < night + 1) & (days_ago > night))
        n_obs = np.size(mag[selector])
        flux = biweight_location(mag[selector])
        error = np.std(mag[selector]) / np.sqrt(n_obs)
        if error > 0.75:
            error = 0
        daily_mags.append(flux)
        errors.append(error)
        print(night, flux, error, n_obs, np.std(mag[selector]))
    nights_all = nights.copy()
    daily_mags_all = daily_mags.copy()
    errors_all = errors.copy()

    lookback = np.arange(1, 20, 1)

    for missing_days in lookback:
        nights = nights_all.copy()[missing_days:]
        daily_mags = daily_mags_all.copy()[missing_days:]
        errors = errors_all.copy()[missing_days:]
        plt.errorbar(-(nights + 0.5),
                     daily_mags,
                     yerr=errors,
                     fmt='.k',
                     alpha=0.5)
        plt.xlabel('Days from today')
        plt.ylabel('Visual magnitude')
        mid = biweight_location(mag)
        plt.ylim(min_plot, max_plot)
        plt.xlim(-100, 100)
        plt.gca().invert_yaxis()
        date_text = datetime.datetime.now().strftime("%d %b %Y")
        plt.text(95,
                 min_plot + 0.1,
                 'AAVSO visual (by-eye) daily bins',
                 ha='right')
        plt.text(95,
                 min_plot + 0.2,
                 'Gaussian process regression, Matern 3/2 kernel',
                 ha='right')
        plt.text(95,
                 min_plot + 0.3,
                 '@betelbot update ' + date_text,
                 ha='right')
        use_days = 100 - missing_days
        X = np.array(nights + 0.5)
        X = X[:use_days]
        y = np.array(daily_mags)
        y = y[:use_days]
        X, y = cleaned_array(X, y)
        length_scale = 2
        kernel = ConstantKernel() + Matern(
            length_scale=length_scale, nu=3 / 2) + WhiteKernel(noise_level=1)
        X = X.reshape(-1, 1)
        gp = gaussian_process.GaussianProcessRegressor(kernel=kernel)
        gp.fit(X, y)
        x_pred = np.linspace(60, -120, 250).reshape(-1, 1)
        y_pred, sigma = gp.predict(x_pred, return_std=True)
        plt.plot(-x_pred, y_pred, linestyle='dashed', color='blue')
        plt.fill_between(-x_pred.ravel(),
                         y_pred + sigma,
                         y_pred - sigma,
                         alpha=0.5)
        idx = 20 - missing_days
        if idx < 10:
            filename = "0" + str(idx) + '.png'
        else:
            filename = str(idx) + '.png'

        plt.savefig(filename, bbox_inches='tight', dpi=100)
        print('Plot made', filename)
        plt.clf()
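The band shaded above spans plus or minus one sigma (roughly 68%); if a 95% interval were wanted instead, sigma would be scaled by 1.96 as in the earlier examples:

# hedged variant: a 95% rather than 68% credible band
plt.fill_between(-x_pred.ravel(),
                 y_pred + 1.96 * sigma,
                 y_pred - 1.96 * sigma,
                 alpha=0.5)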
Example #11
# Fit KernelRidge with parameter selection based on 5-fold cross validation
param_grid = {
    "alpha": [1e0, 1e-1, 1e-2, 1e-3],
    "kernel": [
        ExpSineSquared(l, p) for l in np.logspace(-2, 2, 10)
        for p in np.logspace(0, 2, 10)
    ]
}
kr = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid)
stime = time.time()
kr.fit(X, y)
print("Time for KRR fitting: %.3f" % (time.time() - stime))

gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \
    + WhiteKernel(1e-1)
gpr = GaussianProcessRegressor(kernel=gp_kernel)
stime = time.time()
gpr.fit(X, y)
print("Time for GPR fitting: %.3f" % (time.time() - stime))

# Predict using kernel ridge
X_plot = np.linspace(0, 20, 10000)[:, None]
stime = time.time()
y_kr = kr.predict(X_plot)
print("Time for KRR prediction: %.3f" % (time.time() - stime))

# Predict using gaussian process regressor
stime = time.time()
y_gpr = gpr.predict(X_plot, return_std=False)
print("Time for GPR prediction: %.3f" % (time.time() - stime))
Example #12
def kriging(st=0,
            et=24,
            sample_step=-1,
            skip=1,
            num_restarts=30,
            save=1,
            output='testModel',
            gpy=0):
    """
   Estimate velocity field using rbf kernels on t,y,x (K = Kt*Ky*Kx)
   st,et = initial,final time step of drifter data
   sample_step = pick data each sample_step. If negative, pick data at 
each time step = sample_step*(-1)     
   skip = skip drifters. If skip ==1, use all drifters
   num_restart: number of restarts on optimization
   save: if == 0, return values, if not, save as output+'.mat' and output+'.pkl' 
   """
    #   st = 0; et = 144
    startTime = datetime.now()
    time, latt, lont, vob, uob, validPoints = getData(st, et)

    # origin of cartesian coord.
    lat0 = 28.8
    lon0 = -88.55
    NAD83 = pyproj.Proj("+init=EPSG:3452")  #Louisiana South (ftUS)
    xob, yob = NAD83(lont, latt)
    tob = time[:, None]
    tob = np.repeat(tob, np.size(latt, 1), axis=1)
    yob[np.where(np.isnan(lont))] = np.nan
    xob[np.where(np.isnan(lont))] = np.nan

    x_ori = np.nanmin(xob) + 2
    y_ori = np.nanmin(yob) + 2
    xob = (xob - x_ori) / 1000.  # in km
    yob = (yob - y_ori) / 1000.  # in km

    Nd = np.size(xob, 1) // skip  # number of drifters (integer division)
    if (sample_step < 0) | (skip > 1):  # get samples per time step
        ss = np.abs(sample_step)
        sample_step = 1
        samples = np.arange(0, np.size(tob, 0), ss)

        if (skip > 1):  # !!!!
            testt = np.arange(np.size(tob, 0))  #
            testd = set(np.arange(0, np.size(tob, 1))) - set(
                np.arange(0, np.size(tob, 1), skip))
            testd = np.array(list(testd))
        else:
            testd = np.arange(0, np.size(tob, 1))
            if ss > 1:
                testt = set(np.arange(0, np.size(tob, 0))) - set(samples)
                testt = np.array(list(testt))
            else:
                testt = samples
        to = np.reshape(tob[samples, ::skip], [-1, 1])
        yo = np.reshape(yob[samples, ::skip], [-1, 1])
        xo = np.reshape(xob[samples, ::skip], [-1, 1])
        uo = np.reshape(uob[samples, ::skip], [-1, 1])
        vo = np.reshape(vob[samples, ::skip], [-1, 1])
        tt = np.reshape(tob[testt[:, None], testd], [-1, 1])
        yt = np.reshape(yob[testt[:, None], testd], [-1, 1])
        xt = np.reshape(xob[testt[:, None], testd], [-1, 1])
        ut = np.reshape(uob[testt[:, None], testd], [-1, 1])
        vt = np.reshape(vob[testt[:, None], testd], [-1, 1])

    else:
        ss = 0
        samples = np.arange(
            0, xob.size, sample_step)  #np.random.randint(0,xo.size,nsamples)
        test = set(np.arange(xob.size)) - set(samples)
        test = np.array(list(test))
        xt = np.reshape(xob, [-1])[test, None]
        yt = np.reshape(yob, [-1])[test, None]
        tt = np.reshape(tob, [-1])[test, None]
        ut = np.reshape(uob, [-1])[test, None]
        vt = np.reshape(vob, [-1])[test, None]
        xo = np.reshape(xob, [-1])[samples, None]
        yo = np.reshape(yob, [-1])[samples, None]
        to = np.reshape(tob, [-1])[samples, None]
        uo = np.reshape(uob, [-1])[samples, None]
        vo = np.reshape(vob, [-1])[samples, None]

    validPoints = np.where((~np.isnan(xo)) & (~np.isnan(yo)))
    to = to[validPoints][:, None]
    xo = xo[validPoints][:, None]
    yo = yo[validPoints][:, None]
    uo = uo[validPoints][:, None]
    vo = vo[validPoints][:, None]
    validPoints = np.where((~np.isnan(xt)) & (~np.isnan(yt)))
    tt = tt[validPoints][:, None]
    xt = xt[validPoints][:, None]
    yt = yt[validPoints][:, None]
    ut = ut[validPoints][:, None]
    vt = vt[validPoints][:, None]

    print('number of observations: ' + str(np.size(vo)))
    output_mat = output + '.mat'
    output_obj_u = output + '_u.pkl'
    output_obj_v = output + '_v.pkl'
    # From here on, always use T,Y,X order
    X = np.concatenate([to, yo, xo], axis=1)
    obs = np.concatenate([vo, uo], axis=1)
    Xt = np.concatenate([tt, yt, xt], axis=1)
    obst = np.concatenate([vt, ut], axis=1)
    #########
    # Compute covariances
    # Pay attention on the order T,Y,X
    priors = np.array([1., 1., 1.])
    bounds = np.array([[0.5, 10.], [0.3, 10], [0.3, 10]])
    if gpy == 1:
        kv = GPy.kern.RBF(input_dim=3, ARD=True) + GPy.kern.RBF(input_dim=3,
                                                                ARD=True)
        model = GPy.models.GPRegression(X, vo, kv)
        model.optimize_restarts(messages=False, num_restarts=num_restarts)
        hypv = model.param_array
        del model, kv
        print(gc.collect(2))
        #      print model
        #      print kv.parameters

        kv = GPy.kern.RBF(input_dim=3, ARD=True) + GPy.kern.RBF(input_dim=3,
                                                                ARD=True)
        model = GPy.models.GPRegression(X, uo, kv)
        model.optimize_restarts(messages=False, num_restarts=num_restarts)
        hypu = model.param_array
#      print model
#      print kv.parameters

    else:

        noise = kernels.WhiteKernel(
            noise_level=0.0001)  #, noise_level_bounds=(1e-06, 1.0))
        kv = kernels.RBF(length_scale=priors)  #,length_scale_bounds=bounds)
        k = kv + kv + noise  #kernels.Sum(kv,noise)
        model = GaussianProcessRegressor(kernel=k,
                                         n_restarts_optimizer=num_restarts)
        model.fit(X, vo)
        hypv = np.zeros(9)  # 9 slots to match hypu; the last holds the noise level
        hypv[1:4] = model.kernel_.k1.k1.length_scale
        hypv[5:8] = model.kernel_.k1.k2.length_scale
        hypv[-1] = model.kernel_.k2.noise_level
        #      print model_v.kernel_
        del model, kv, k
        print(gc.collect(2))

        ku = kernels.RBF(length_scale=priors)  # ,length_scale_bounds=bounds)
        k = ku + ku + noise  #kernels.Sum(ku,noise)
        model = GaussianProcessRegressor(kernel=k,
                                         n_restarts_optimizer=num_restarts)
        model.fit(X, uo)
        hypu = np.zeros(9)
        hypu[1:4] = model.kernel_.k1.k1.length_scale
        hypu[5:8] = model.kernel_.k1.k2.length_scale
        hypu[-1] = model.kernel_.k2.noise_level

    print('Optimized Hyperparameters =================================================')
    nKernels = 2
    for i in range(nKernels):
        print('Var.' + str(i + 1) + ' (u,v) = ' + str(hypu[4 * i]) + ' , ' +
              str(hypv[4 * i]))
        print('Lt ' + str(i + 1) + '  (u,v) = ' + str(hypu[4 * i + 1]) +
              ' , ' + str(hypv[4 * i + 1]))
        print('Ly ' + str(i + 1) + '  (u,v) = ' + str(hypu[4 * i + 2]) +
              ' , ' + str(hypv[4 * i + 2]))
        print('Lx ' + str(i + 1) + '  (u,v) = ' + str(hypu[4 * i + 3]) +
              ' , ' + str(hypv[4 * i + 3]))

#   print('Var. (u,v) = ' + str(hypu[0]) + ' , ' + str(hypv[0]))
#   print('Lt   (u,v) = ' + str(hypu[1]) + ' , ' + str(hypv[1]))
#   print('Ly   (u,v) = ' + str(hypu[2]) + ' , ' + str(hypv[2]))
#   print('Lx   (u,v) = ' + str(hypu[3]) + ' , ' + str(hypv[3]))
    print('Noise (u,v) = ' + str(hypu[-1]) + ' , ' + str(hypv[-1]))
    print('===========================================================================')
    print('End of script, time : ' + str(datetime.now() - startTime))
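Packing the optimised hyperparameters into flat arrays, as above, is brittle: the slot layout has to be kept in sync with the kernel structure by hand. The fitted kernel also exposes the same values by attribute, which the assignments above already rely on:

# hedged alternative: read the optimised values by name instead of by index
fitted = model.kernel_
print(fitted.k1.k1.length_scale)  # first RBF length scales (t, y, x)
print(fitted.k1.k2.length_scale)  # second RBF length scales
print(fitted.k2.noise_level)      # WhiteKernel noise level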
Example #13
data_test['furniture'] = label_furniture.transform(data_test['furniture'])
data_test['district'] = label_district.transform(data_test['district'])

X = data_test.drop(['pricePerM'], axis=1)
y = data_test['pricePerM']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(kernel=kernel,
                               random_state=0).fit(X_train, y_train)
print(gpr.score(X_test, y_test))

# names = ['SVR', 'SGDRegressor', 'BayesianRidge', 'LassoLars', 'ARDRegression', 'PassiveAggressiveRegressor',
#          'TheilSenRegressor', 'LinearRegression']
#
# classifiers = [
#     svm.SVR(),
#     linear_model.SGDRegressor(),
#     linear_model.BayesianRidge(),
#     linear_model.LassoLars(),
#     linear_model.ARDRegression(),
#     linear_model.PassiveAggressiveRegressor(),
#     linear_model.TheilSenRegressor(),
#     linear_model.LinearRegression()]
#
# dataset = 'BANC_freesurf'
dataset = 'BANC_freesurf'
resamplefactor = 4

project_wd, project_data, project_sink = get_paths(debug, dataset)
demographics, imgs, data = get_data(project_data, dataset, debug, project_wd,
                                    resamplefactor)

targetAttribute = np.array(demographics['Age'])
# To ensure the example runs quickly, we'll make the training dataset relatively small
Xtrain, Xtest, Ytrain, Ytest = train_test_split(data, targetAttribute, test_size=.25,
                                                                random_state=random_seed)

# Perform analysis with custom Linear Kernel
kernel1 = LinearKernel()
gp1 = GaussianProcessRegressor(kernel=kernel1, normalize_y=True)
# gp.fit(Xtrain, Ytrain)
# Ypredict = gp.predict(Xtest)

cv_results1 = cross_validate(gp1, Xtrain, Ytrain,
                             scoring='neg_mean_absolute_error', cv=10)
print('The mean absolute error over the different cross-validations is:')
print(-np.mean(cv_results1['test_score']))  # the scorer returns negative MAE

# Perform analysis with dot product kernel and set the sigma to 0
kernel2 = DotProduct(sigma_0=0)
gp2 = GaussianProcessRegressor(kernel=kernel2, normalize_y=True)
cv_results2 = cross_validate(gp2, Xtrain, Ytrain,
                             scoring='neg_mean_absolute_error', cv=10)
print('The mean absolute error over the different cross-validations is:')
print(-np.mean(cv_results2['test_score']))  # the scorer returns negative MAE
print("test length:", len(test))

# print("window:\n", window[33])
print('window shape:', window[33].shape)

# print("test:\n", test[33])
print('test shape:', test[33].shape)

# kernel setup
gpr = []
kernel = 1.0 * RBF(length_scale=10, length_scale_bounds=(1e-1, 1e6))
for j in range(len(window)):
    gpr.append(
        GaussianProcessRegressor(kernel=kernel,
                                 alpha=1e-3,
                                 optimizer='fmin_l_bfgs_b',
                                 n_restarts_optimizer=20,
                                 random_state=0))

print("gpr:", gpr[1])

# fitting
k1 = []
k2 = []
for j in range(len(window)):
    train = window[j]
    X = np.array(train['elapsed_time']).reshape(-1, 1)
    y = np.array(train.loc[:, ['8_x', '8_y', '8_z']])
    gpr[j].fit(X, y)
    print("kernel", j, "params:", gpr[j].kernel_.get_params())
    k1.append(gpr[j].kernel_.get_params()['k1__constant_value'])
Example #16
def test_no_optimizer():
    # Test that kernel parameters are unmodified when optimizer is None.
    kernel = RBF(1.0)
    gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
    assert_equal(np.exp(gpr.kernel_.theta), 1.0)
Example #17
        logger.info('build the model')
        model.fit_generator(generator,
                            steps_per_epoch=steps_per_epoch,
                            epochs=args.max_epochs,
                            callbacks=callbacks)

        logger.info('save the model')
        model.save(os.path.join(args.output_dir, 'model'))
    else:
        logger.info('build the model')
        import dill as pickle
        if args.model_name == 'gpr':
            from sklearn.gaussian_process import GaussianProcessRegressor
            from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
            kernel = DotProduct() + WhiteKernel()
            model = GaussianProcessRegressor(kernel=kernel)
        elif args.model_name == 'ridge':
            from sklearn.linear_model import Ridge
            model = Ridge(alpha=1000)
        indices_train = np_setdiff(parent_table_train, samples_na)
        logger.info('train the model')
        model.fit(X[indices_train], y[indices_train])
        logger.info('save the model')
        with open(os.path.join(args.output_dir, 'model'), 'wb') as fout:
            pickle.dump(model, fout)

    if args.bootstrap:
        logger.info('test the model')
        y_pred = np.ravel(model.predict(X_test))

        logger.info('save predictions on the test set')
Example #18
def test_lml_precomputed():
    # Test that lml of optimized kernel is stored correctly.
    for kernel in kernels:
        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
        assert_equal(gpr.log_marginal_likelihood(gpr.kernel_.theta),
                     gpr.log_marginal_likelihood())
Example #19
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9645623341853348
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    Nystroem(gamma=0.75, kernel="additive_chi2", n_components=10),
    GaussianProcessRegressor(kernel=Matern(length_scale=2.6, nu=2.5), n_restarts_optimizer=210, normalize_y=True)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #20
X, y = load_mauna_loa_atmospheric_co2()

# Kernel with parameters given in GPML book
k1 = 66.0**2 * RBF(length_scale=67.0)  # long term smooth rising trend
k2 = 2.4**2 * RBF(length_scale=90.0) \
    * ExpSineSquared(length_scale=1.3, periodicity=1.0)  # seasonal component
# medium term irregularity
k3 = 0.66**2 \
    * RationalQuadratic(length_scale=1.2, alpha=0.78)
k4 = 0.18**2 * RBF(length_scale=0.134) \
    + WhiteKernel(noise_level=0.19**2)  # noise terms
kernel_gpml = k1 + k2 + k3 + k4

gp = GaussianProcessRegressor(kernel=kernel_gpml,
                              alpha=0,
                              optimizer=None,
                              normalize_y=True)
gp.fit(X, y)

print("GPML kernel: %s" % gp.kernel_)
print("Log-marginal-likelihood: %.3f" %
      gp.log_marginal_likelihood(gp.kernel_.theta))

# Kernel with optimized parameters
k1 = 50.0**2 * RBF(length_scale=50.0)  # long term smooth rising trend
k2 = 2.0**2 * RBF(length_scale=100.0) \
    * ExpSineSquared(length_scale=1.0, periodicity=1.0,
                     periodicity_bounds="fixed")  # seasonal component
# medium term irregularities
k3 = 0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0)
k4 = 0.1**2 * RBF(length_scale=0.1) \
Example #21
n_samples = 5

# %%
# Kernel cookbook
# ---------------
#
# In this section, we illustrate some samples drawn from the prior and posterior
# distributions of the Gaussian process with different kernels.
#
# Radial Basis Function kernel
# ............................
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0))
gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)

fig, axs = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(10, 8))

# plot prior
plot_gpr_samples(gpr, n_samples=n_samples, ax=axs[0])
axs[0].set_title("Samples from prior distribution")

# plot posterior
gpr.fit(X_train, y_train)
plot_gpr_samples(gpr, n_samples=n_samples, ax=axs[1])
axs[1].scatter(X_train[:, 0],
               y_train,
               color="red",
               zorder=10,
               label="Observations")
Example #22
linear_r_param_grid = {
    'X_transform__num__si__strategy': ['mean', 'median'],
    # 'mlp__solver': ['sgd', 'adam', 'lbfgs'],
    # 'mlp__alpha': [1e-1, 1e-3, 1e-5],
    # 'mlp__hidden_layer_sizes': [(10,), (20,), (5, 2), (4, 3), (4, 4)],
    # 'mlp__activation': ['identity', 'logistic', 'tanh', 'relu'],
}
k1 = 1**2 * RBF(length_scale=100) + WhiteKernel(noise_level=1)
# k2 = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
#     + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-10, 1e+1))
# k3 = DotProduct(sigma_0=1) + WhiteKernel(noise_level=1)
# k4 = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gp_pipe = Pipeline([
    ('X_transform', ct),
    ('gp', GaussianProcessRegressor(kernel=k1)),
])

gp_param_grid = {
    'X_transform__num__si__strategy': ['median'],
    # 'gp__alpha': [1e-01, 1e-03, 1e-05],
    # 'gp__kernel': [k2]
    # 'rf__n_estimators': range(1, 20),
    # 'rf__criterion': ['mse', 'mae'],
}

scoring = 'r2'
gs_linear_r = GridSearchCV(linear_r_pipe,
                           linear_r_param_grid,
                           cv=kf,
                           scoring=scoring)
Example #23
    x_fill = np.linspace(X[0, 0], X[-1, 0], 1000).reshape(-1, 1)
    y_pred, sigma = gpr.predict(x_fill, return_std=True)

    y_pred = y_pred.reshape(-1, 1)
    sigma = sigma.reshape(-1, 1)

    fig = plt.figure()
    plt.scatter(X, y, color='r', label='observations')
    plt.plot(x_fill, y_pred, 'b-', label='prediction')
    upper, lower = y_pred + 1.96 * sigma, y_pred - 1.96 * sigma
    plt.fill_between(x_fill.squeeze(),
                     upper.squeeze(),
                     lower.squeeze(),
                     color='r',
                     alpha=0.2)  # alpha must be a float, not a string
    plt.xlabel('time')
    plt.ylabel('price')
    plt.legend(loc='upper left')

    plt.show()


if __name__ == '__main__':
    f = lambda x: x * np.sin(x) + 2
    X = np.atleast_2d([0.3, 1.2, 2.5, 4., 6.2]).T  # column vector, so X[0, 0] and X[-1, 0] span the inputs
    y = f(X)  # named y so that plot_gpr can scatter the observations
    gpr = GaussianProcessRegressor(kernel=kernels.Matern(nu=2.5))
    gpr.fit(X, y)
    plot_gpr(gpr)
Example #24
# for matern
gp_lenscale = 10.0
gp_scale = 1.0
gp_noise = 1.0
gp_alpha = 0.8

# ---------------------------------------- GP baseline (just lags)

# GP (just lags)
print "\nrunning GP with just lags..."
#kernel = gp_scale * RBF(length_scale=gp_lenscale, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=gp_noise, noise_level_bounds=(1e-10, 1e+1))
kernel = gp_scale * Matern(
    length_scale=gp_lenscale, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
        noise_level=gp_noise, noise_level_bounds=(1e-10, 1e+1))
gp = GaussianProcessRegressor(kernel=kernel, alpha=gp_alpha)
gp.fit(lags_train, y_train)
preds_lr = gp.predict(lags_test)
preds_lr = preds_lr * std_test + trend_test
y_true = y_test * std_test + trend_test
corr, mae, rae, rmse, rrse, mape, r2 = compute_error(y_true, preds_lr)
print "MAE:  %.3f\tRMSE: %.3f\tR2:   %.3f" % (mae, rmse, r2)
fw_mae.write("%.3f," % (mae, ))
fw_rae.write("%.3f," % (rae, ))
fw_rmse.write("%.3f," % (rmse, ))
fw_rrse.write("%.3f," % (rrse, ))
fw_mape.write("%.3f," % (mape, ))
fw_r2.write("%.3f," % (r2, ))

# ---------------------------------------- GP with weather
Example #25
for i_fold, (train_idx, test_idx) in enumerate(kf.split(x, y)):
    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print('CV iteration: %d' % (i_fold + 1))

    # --------------------------------------------------------------------------
    # Normalization/Scaling/Standardization
    scaler = RobustScaler()

    x_train_norm = scaler.fit_transform(x_train)
    x_test_norm = scaler.transform(x_test)

    # --------------------------------------------------------------------------
    # Model
    gpr = GaussianProcessRegressor()

    # --------------------------------------------------------------------------
    # Model selection
    # Search space
    param_grid = [
        {
            'kernel': [RBF(), DotProduct()],
            'alpha': [1e0, 1e-1, 1.5e-1, 1e-2, 1.5e-2]
        },
    ]

    # Gridsearch
    internal_cv = KFold(n_splits=5)
    grid_cv = GridSearchCV(estimator=gpr,
                           param_grid=param_grid,
Example #26
# RandomForestRegressor Accuracy: 0.999510651417654
# RandomForestRegressor Root Mean Squared Error: 20.751314844821263
# RandomForestRegressor R-squared for Train: 0.99
# RandomForestRegressor R-squared for Test: 1.00

# when random_state=43
# RandomForestRegressor Mean Absolute Error: 2.8778434940855315
# RandomForestRegressor Mean Squared Error: 57230.85101455866
# RandomForestRegressor Accuracy: 0.9573706279271497
# RandomForestRegressor Root Mean Squared Error: 239.22970345372806
# RandomForestRegressor R-squared for Train: 1.00
# RandomForestRegressor R-squared for Test: 0.96

# ------------------------ GaussianProcessRegressor

Gaussian = GaussianProcessRegressor()
# declare the GaussianProcessRegressor algorithm

Gaussian.fit(X_train,y_train)
# fit the GaussianProcessRegressor algorithm to my data.

Gaussian_y_pred = Gaussian.predict(X_test)
# predict the Y values using the GaussianProcessRegressor algorithm.


print('GaussianProcessRegressor Mean Absolute Error:', metrics.mean_absolute_error(y_test,Gaussian_y_pred))
print('GaussianProcessRegressor Mean Squared Error:', metrics.mean_squared_error(y_test,Gaussian_y_pred))
print('GaussianProcessRegressor Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,Gaussian_y_pred)))
print('GaussianProcessRegressor Accuracy:', metrics.r2_score(y_test,Gaussian_y_pred))

Gaussian_df = pd.DataFrame({'Actual':y_test, 'Predicted':Gaussian_y_pred})
Example #27
#%%
# -------------------------------------------------------------------------------
# First the noiseless case
X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T

# Observations
y = f(X).ravel()

# Mesh the input space for evaluation of the real function, the prediction and
# its MSE
x = np.atleast_2d(np.linspace(0, 10, 1000)).T

# Instantiate the GPR model
kernel = C(1.0, (1e-2, 1e3)) * RBF(10, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

# Fit the data through ML estimation of the parameters
gp.fit(X, y)

# Make predictions on the meshed x-axis
y_pred, sigma = gp.predict(x, return_std=True)
#%% Plot the figure
X = X.ravel()
x = x.ravel()
y_grid = f(x).ravel()
fig = go.Figure()
t1 = go.Scatter(x=X,
                y=y,
                name="$f(x) = x \\sin(x)$",
                mode="markers",
Example #28
def main(path, task, representation, use_pca, n_trials, test_set_size):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression

    if use_pca:
        n_components = 100
    else:
        n_components = None

    r2_list = []
    rmse_list = []
    mae_list = []

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(
            X_train,
            y_train,
            X_test,
            y_test,
            n_components=n_components,
            use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        gp_kernel = TanimotoKernel()
        gpr = GaussianProcessRegressor(kernel=gp_kernel)
        gpr.fit(X_train, y_train)

        # mean GP prediction; tile the test set only to time prediction on a
        # larger batch, then predict on the original X_test so the metrics
        # below compare arrays of matching length

        X_test_tiled = np.tile(X_test, (10000, 1))

        import time
        start = time.time()

        gpr.predict(X_test_tiled, return_std=False)

        end = time.time()
        print(f'time elapsed is {end - start}')

        y_pred = gpr.predict(X_test, return_std=False)
        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train = gpr.predict(X_train, return_std=False)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(
            mean_squared_error(y_scaler.inverse_transform(y_train),
                               y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))
Example #29
tr = 0.1
PI_tr = partial(optimizer_PI, tradeoff=tr)
PI_tr.__name__ = 'PI, tradeoff = %1.1f' % tr
max_PI_tr = partial(max_PI, tradeoff=tr)

acquisitions = zip(
    [PI_tr, optimizer_EI, optimizer_UCB],
    [max_PI_tr, max_EI, max_UCB],
)

for acquisition, query_strategy in acquisitions:

    # initializing the optimizer
    optimizer = BayesianOptimizer(
        estimator=GaussianProcessRegressor(kernel=kernel),
        X_training=X_initial,
        y_training=y_initial,
        query_strategy=query_strategy)

    # plotting the initial estimation
    with plt.style.context('seaborn-white'):
        plt.figure(figsize=(35, 7))
        for n_query in range(5):
            # plot current prediction
            plt.subplot(2, 5, n_query + 1)
            plt.title('Query no. %d' % (n_query + 1))
            if n_query == 0:
                plt.ylabel('Predictions')
            plt.xlim([-1.0, 21.0])
            plt.ylim([-1.5, 3])
Example #30
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9803749056546571
exported_pipeline = GaussianProcessRegressor(kernel=Matern(length_scale=4.0, nu=2.5), n_restarts_optimizer=105, normalize_y=False)
# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)