import numpy as np
from scipy.special import gamma, kv
from sklearn.cluster import KMeans

MACHINE_EPSILON = np.finfo(np.double).eps
# l1_cross_differences (pairwise componentwise differences of the training
# data) is assumed to be importable from the surrounding package, as in
# scikit-learn's legacy gaussian_process module.


class LocalLengthScalesCorrelation(object):
    """ Non-stationary correlation model based on local smoothness estimates.

    This non-stationary correlation model internally learns point estimates
    of local smoothness using a second-level Gaussian Process. For this, it
    selects a subset of the training data and learns length-scales at these
    specific points. These length-scales are generalized using the
    second-level Gaussian Process. Furthermore, global (isotropic or
    anisotropic) length-scales are learned for both the top-level GP and the
    length-scale GP.

    The correlation model is based on the family of (stationary) Matern
    kernels. The parameter nu of the Matern kernels (governing the
    smoothness of the GP prior) can either be set or learned jointly with
    the remaining parameters.

    Parameters
    ----------
    isotropic : bool, default=True
        Whether the global length-scales of the top-level GP are isotropic
        or anisotropic.

    nu : float, default=1.5
        The parameter nu of the Matern kernels (governing the smoothness of
        the GP prior). If None, nu is learned along with the other
        hyperparameters.

    l_isotropic : bool, default=True
        Whether the global length-scales of the length-scale GP are
        isotropic or anisotropic.

    l_samples : int, default=10
        How many datapoints from the training data are selected as support
        points for learning the length-scale GP.

    prior_b : float, default=inf
        The variance of the log-normal prior distribution on the length
        scales. If set to infinity, the distribution is assumed to be
        uniform.

    .. seealso::

        "Nonstationary Gaussian Process Regression using Point Estimates of
        Local Smoothness", Christian Plagemann, Kristian Kersting, and
        Wolfram Burgard, ECML 2008
    """

    def __init__(self, isotropic=True, nu=1.5, l_isotropic=True,
                 l_samples=10, prior_b=np.inf, X_=None):
        self.isotropic = isotropic
        self.nu = nu
        self.l_isotropic = l_isotropic
        self.l_samples = l_samples
        self.prior_b = prior_b

        self.X_ = X_
        if self.X_ is not None:
            assert self.X_.shape[0] == self.l_samples

    def fit(self, X, nugget=10. * MACHINE_EPSILON):
        """ Fit the correlation model to training data X.

        Parameters
        ----------
        X : array_like, shape=(n_samples, n_features)
            An array of training datapoints at which observations were made,
            i.e., where the outputs y are known.

        nugget : double or ndarray, optional
            The Gaussian Process nugget parameter. The nugget is added to
            the diagonal of the assumed training covariance; in this way it
            acts as a Tikhonov regularization in the problem. In the special
            case of the squared exponential correlation function, the nugget
            mathematically represents the variance of the input values.
            Default assumes a nugget close to machine precision for the sake
            of robustness (nugget = 10. * MACHINE_EPSILON).
        """
        self.X = X
        self.nugget = nugget
        self.n_samples = X.shape[0]
        self.n_dims = X.shape[1]

        # Determine how many entries in theta belong to the different
        # categories (used later for parsing theta)
        self.theta_gp_size = 1 if self.isotropic else self.n_dims
        self.theta_l_size = 1 if self.l_isotropic else self.n_dims
        self.nu_size = 1 if not self.nu else 0
        self.theta_size = self.theta_gp_size + self.theta_l_size \
            + self.l_samples + self.nu_size

        # Calculate array with shape (n_eval, n_features) giving the
        # componentwise distances between locations x and x' at which the
        # correlation model should be evaluated.
        self.D, self.ij = l1_cross_differences(self.X)

        if self.X_ is None:
            # Select subset of X for which length scales are optimized.
            # Generalization of length scales to other datapoints is
            # achieved by means of a separate Gaussian Process (gp_l)
            if self.X.shape[0] >= self.l_samples:
                kmeans = KMeans(n_clusters=self.l_samples)
                self.X_ = kmeans.fit(self.X).cluster_centers_
            else:
                # Fall back to selecting centers by sampling with
                # replacement
                self.X_ = self.X[np.random.choice(
                    np.arange(self.X.shape[0]), self.l_samples)]

        return self

    def __call__(self, theta, X=None):
        """ Compute correlation for given correlation parameter(s) theta.

        Parameters
        ----------
        theta : array_like
            An array giving the autocorrelation parameter(s).

        X : array_like, shape=(n_eval, n_features)
            An array containing the n_eval query points whose correlation
            with the training datapoints shall be computed. If None, the
            autocorrelation of the training datapoints is computed instead.

        Returns
        -------
        r : array_like, shape=(n_eval, n_samples) if X is not None,
            (n_samples, n_samples) otherwise
            An array containing the values of the correlation model.
        """
        # Parse theta into its components
        theta_gp, theta_l, length_scales, nu = self._parse_theta(theta)

        # Train length-scale Gaussian Process
        from skgp.estimators import GaussianProcess
        self.gp_l = \
            GaussianProcess(corr="matern_1.5",
                            theta0=theta_l).fit(self.X_,
                                                np.log10(length_scales))
        l_train = 10 ** self.gp_l.predict(self.X)

        # Prepare distances and length-scale information for each pair of
        # datapoints whose correlation shall be computed
        if X is not None:
            # Get pairwise componentwise L1-differences to the input
            # training set
            d = X[:, np.newaxis, :] - self.X[np.newaxis, :, :]
            d = d.reshape((-1, X.shape[1]))
            # Predict length scales for query datapoints
            l_query = 10 ** self.gp_l.predict(X)
            l = np.transpose([np.tile(l_train, len(l_query)),
                              np.repeat(l_query, len(l_train))])
        else:
            # No external datapoints given; auto-correlation of training
            # set is used instead
            d = self.D
            l = l_train[self.ij]

        # Compute general Matern kernel. Note: the anisotropic case must
        # compare theta_gp.size to the number of features d.shape[1], not
        # to d.ndim (which is always 2 here).
        if d.ndim > 1 and theta_gp.size == d.shape[1]:
            activation = np.sum(theta_gp.reshape(1, d.shape[1]) * d ** 2,
                                axis=1)
        else:
            activation = theta_gp[0] * np.sum(d ** 2, axis=1)
        tmp = 0.5 * (l ** 2).sum(1)
        tmp2 = np.maximum(2 * np.sqrt(nu * activation / tmp), 1e-5)
        r = np.sqrt(l[:, 0]) * np.sqrt(l[:, 1]) \
            / (gamma(nu) * 2 ** (nu - 1))
        r /= np.sqrt(tmp)
        r *= tmp2 ** nu * kv(nu, tmp2)

        # Convert correlations to 2d matrix
        if X is not None:
            return r.reshape(-1, self.n_samples)
        else:  # exploit symmetry of auto-correlation
            R = np.eye(self.n_samples) * (1. + self.nugget)
            R[self.ij[:, 0], self.ij[:, 1]] = r
            R[self.ij[:, 1], self.ij[:, 0]] = r
            return R

    def log_prior(self, theta):
        """ Return the (log) prior probability of parameters theta.

        The prior is assumed to be uniform over the parameter space except
        for the length-scales dimensions. These are assumed to be log-normal
        distributed with mean 0 and variance self.prior_b. If self.prior_b
        is np.inf, the log length-scales are assumed to be uniformly
        distributed as well.

        NOTE: The returned quantity is an improper prior as its integral
        over the parameter space is not equal to 1.

        Parameters
        ----------
        theta : array_like
            An array giving the autocorrelation parameter(s).

        Returns
        -------
        log_p : float
            The (log) prior probability of parameters theta. An improper
            probability.
        """
        if self.prior_b == np.inf:
            return 0.0
        _, _, length_scales, _ = self._parse_theta(theta)
        squared_dist = (np.log10(length_scales) ** 2).sum()
        return -squared_dist / self.prior_b

    def _parse_theta(self, theta):
        """ Parse parameter vector theta into its components.

        Parameters
        ----------
        theta : array_like
            An array containing all hyperparameters.

        Returns
        -------
        theta_gp : array_like
            An array containing the hyperparameters of the main GP.
        theta_l : array_like
            An array containing the hyperparameters of the length-scale GP.
        length_scales : array_like
            An array containing the length-scales for the length-scale GP.
        nu : float
            The parameter nu controlling the smoothness of the Matern
            kernel.
        """
        theta = np.asarray(theta, dtype=float)

        assert (theta.size == self.theta_size), \
            "theta does not have the expected size (expected: %d, " \
            "actual size %d). Expected: %d entries for main GP, " \
            "%d entries for length-scale GP, %d entries containing the " \
            "length scales, and %d entries for nu." \
            % (self.theta_size, theta.size, self.theta_gp_size,
               self.theta_l_size, self.l_samples, self.nu_size)

        # Split theta into its components
        theta_gp = theta[:self.theta_gp_size]
        theta_l = theta[self.theta_gp_size:][:self.theta_l_size]
        length_scales = \
            theta[self.theta_gp_size + self.theta_l_size:][:self.l_samples]
        nu = self.nu if self.nu else theta[-1]

        return theta_gp, theta_l, length_scales, nu

    @classmethod
    def create(cls, dims, isotropic=True, theta0=1e-1,
               thetaL=None, thetaU=None, l_isotropic=True, theta_l_0=1e-1,
               theta_l_L=None, theta_l_U=None, l_samples=20,
               l_0=1.0, l_L=None, l_U=None, nu_0=1.5, nu_L=None, nu_U=None,
               prior_b=np.inf, *args, **kwargs):
        """ Factory method for creating non-stationary correlation models.

        .. note:: In addition to returning an instance of
                  LocalLengthScalesCorrelation, the specification of the
                  search space for the hyperparameters theta of the Gaussian
                  process is returned. This includes the start point of the
                  search (theta0) as well as the lower and upper boundaries
                  thetaL and thetaU for the values of theta.
        """
        theta0 = [theta0] * (1 if isotropic else dims)
        thetaL = [thetaL] * (1 if isotropic else dims)
        thetaU = [thetaU] * (1 if isotropic else dims)

        theta0 += [theta_l_0] * (1 if l_isotropic else dims)
        thetaL += [theta_l_L] * (1 if l_isotropic else dims)
        thetaU += [theta_l_U] * (1 if l_isotropic else dims)

        theta0 += [l_0] * l_samples
        thetaL += [l_L] * l_samples
        thetaU += [l_U] * l_samples

        if nu_L is not None:  # nu is learned jointly with theta
            theta0 += [nu_0]
            thetaL += [nu_L]
            thetaU += [nu_U]

        corr = cls(isotropic=isotropic,
                   nu=None if nu_L is not None else nu_0,
                   l_isotropic=l_isotropic, l_samples=l_samples,
                   prior_b=prior_b)

        return corr, theta0, thetaL, thetaU
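
# ----------------------------------------------------------------------
# Usage sketch (illustrative, with hypothetical demo data; assumes skgp's
# GaussianProcess estimator as used in the examples below). The `create`
# factory returns the correlation model together with the matching search
# space (theta0, thetaL, thetaU) for the hyperparameter optimization.
if __name__ == "__main__":
    from skgp.estimators import GaussianProcess

    rng = np.random.RandomState(0)
    X_demo = rng.uniform(-1, 1, (50, 2))  # hypothetical training inputs
    y_demo = np.sin(3 * X_demo[:, 0]) * X_demo[:, 1]

    corr, theta0, thetaL, thetaU = LocalLengthScalesCorrelation.create(
        dims=2, theta0=1e-1, thetaL=1e-4, thetaU=1e2,
        theta_l_L=1e-4, theta_l_U=1e2, l_samples=10, l_L=1e-2, l_U=1e2)
    gp = GaussianProcess(corr=corr, theta0=theta0, thetaL=thetaL,
                         thetaU=thetaU).fit(X_demo, y_demo)
    print(gp.theta_)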
# ----------------------------------------------------------------------
# Actual test data
X = np.random.random(50)[:, None] * 4 - 2

# Observations
y = f(X).ravel()

# Mesh the input space for evaluations of the real function, the
# prediction and its MSE
x = np.atleast_2d(np.linspace(-2, 2, 1000)).T

# Instantiate one Gaussian Process model for the stationary Matern kernel
# and one for the non-stationary one
gp_stationary = \
    GaussianProcess(corr='matern_1.5', theta0=1e0, thetaL=1e-2,
                    thetaU=1e+2, random_start=100)
gp_non_stationary = \
    GaussianProcess(corr=NonStationaryCorrelation(), theta0=1e0,
                    thetaL=1e-2, thetaU=1e+2, random_start=100)

# Fit to data using Maximum Likelihood Estimation of the parameters
gp_stationary.fit(X, y)
gp_non_stationary.fit(X, y)

print("Theta:\n\tStationary: {:.3f} \t Non-stationary: {:.3f}"
      .format(gp_stationary.theta_[0], gp_non_stationary.theta_[0]))
print("Posterior probability (negative, average, log):\n\t"
      "Stationary: {:.5f} \t Non-stationary: {:.5f}"
      .format(gp_stationary.posterior_function_value_,
              gp_non_stationary.posterior_function_value_))
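
# ----------------------------------------------------------------------
# Sketch of a possible continuation (not in the original excerpt): the mesh
# x defined above is otherwise unused here. Assuming skgp's GaussianProcess
# follows scikit-learn's legacy API, predict(..., eval_MSE=True) returns
# the predictive mean and variance, which can be plotted for both models.
import matplotlib.pyplot as plt

y_pred_stat, mse_stat = gp_stationary.predict(x, eval_MSE=True)
y_pred_nonstat, mse_nonstat = gp_non_stationary.predict(x, eval_MSE=True)

plt.figure()
plt.plot(X, y, 'r.', markersize=10, label="Observations")
plt.plot(x, y_pred_stat, 'b-', label="Stationary")
plt.fill_between(x.ravel(), y_pred_stat - np.sqrt(mse_stat),
                 y_pred_stat + np.sqrt(mse_stat), color='b', alpha=0.2)
plt.plot(x, y_pred_nonstat, 'g-', label="Non-stationary")
plt.fill_between(x.ravel(), y_pred_nonstat - np.sqrt(mse_nonstat),
                 y_pred_nonstat + np.sqrt(mse_nonstat), color='g',
                 alpha=0.2)
plt.legend(loc="best")
plt.show()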
Xtrain = np.random.random((200, 4)) * 2 - 1
ytrain = f(Xtrain)

plt.figure()
colors = ['r', 'g', 'b', 'c', 'm']
labels = {1: "Isotropic",
          4: "Automatic Relevance Determination",
          8: "Factor Analysis"}
for i, n in enumerate(labels.keys()):
    train_sizes, train_scores, test_scores = \
        learning_curve(GaussianProcess(corr='squared_exponential',
                                       theta0=[1.0] * n,
                                       thetaL=[1e-4] * n,
                                       thetaU=[1e2] * n),
                       Xtrain, ytrain, scoring="mean_squared_error",
                       cv=10, n_jobs=4)
    test_scores = -test_scores  # Scores correspond to negative MSE
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_min = np.min(test_scores, axis=1)
    test_scores_max = np.max(test_scores, axis=1)
    plt.plot(train_sizes, test_scores_mean, label=labels[n],
             color=colors[i])
    plt.fill_between(train_sizes, test_scores_min, test_scores_max,
                     alpha=0.2, color=colors[i])
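
# Sketch of a plausible finish for this figure (not in the original
# excerpt): label the axes and render the learning curves.
plt.xlabel("Training set size")
plt.ylabel("Mean squared error")
plt.legend(loc="best")
plt.show()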
    by ARD. Furthermore, the values x in R^3 and
    x + \alpha (1, 2, 0) + \beta (1, 0, 2) have the same value for all x
    and all alpha and beta. This can be exploited by FAD.
    """
    return np.tanh(2 * X[:, 0] - X[:, 1] - X[:, 2])


Xtrain = np.random.random((100, 6)) * 2 - 1
ytrain = f(Xtrain)

plt.figure()
colors = ['r', 'g', 'b', 'c', 'm']
labels = {True: "Bayesian GP", False: "Standard GP"}
for i, bayesian in enumerate(labels.keys()):
    model = GaussianProcess(corr='squared_exponential',
                            theta0=[1.0] * 12,
                            thetaL=[1e-4] * 12,
                            thetaU=[1e2] * 12)
    if bayesian:
        model = BayesianGaussianProcess(model, n_posterior_samples=25,
                                        n_burnin=250, n_sampling_steps=25)
    train_sizes, train_scores, test_scores = \
        learning_curve(model, Xtrain, ytrain,
                       scoring="mean_squared_error", cv=10, n_jobs=1)
    test_scores = -test_scores  # Scores correspond to negative MSE
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_min = np.min(test_scores, axis=1)
    test_scores_max = np.max(test_scores, axis=1)
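    # Sketch of a plausible continuation (the excerpt ends before the
    # curves are drawn); presumably this mirrors the preceding
    # learning-curve example:
    plt.plot(train_sizes, test_scores_mean, label=labels[bayesian],
             color=colors[i])
    plt.fill_between(train_sizes, test_scores_min, test_scores_max,
                     alpha=0.2, color=colors[i])

plt.xlabel("Training set size")
plt.ylabel("Mean squared error")
plt.legend(loc="best")
plt.show()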