class BayesianNeuralNetwork(BaseModel):

    def __init__(self, sampling_method="sghmc",
                 n_nets=100, l_rate=1e-3,
                 mdecay=5e-2, n_iters=5 * 10**4,
                 bsize=20, burn_in=1000,
                 precondition=True,
                 normalize_output=True, normalize_input=True,
                 rng=None, get_net=get_default_net):
        """
        Bayesian Neural Networks use Bayesian methods to estimate the posterior
        distribution of a neural network's weights. This allows us to also
        predict uncertainties for test points and thus makes Bayesian Neural
        Networks suitable for Bayesian optimization.
        This module uses stochastic gradient MCMC methods to sample from the
        posterior distribution. See [1] for more details.

        [1] J. T. Springenberg, A. Klein, S. Falkner, F. Hutter
            Bayesian Optimization with Robust Bayesian Neural Networks.
            In Advances in Neural Information Processing Systems 29 (2016).

        Parameters
        ----------
        sampling_method : str
            Determines the MCMC strategy:
            "sghmc" = Stochastic Gradient Hamiltonian Monte Carlo
            "sgld" = Stochastic Gradient Langevin Dynamics
        n_nets : int
            The number of samples (weights) that are drawn from the posterior
        l_rate : float
            The step size parameter for SGHMC
        mdecay : float
            Decaying term for the momentum in SGHMC
        n_iters : int
            Number of MCMC sampling steps without burn in
        bsize : int
            Batch size to form a mini batch
        burn_in : int
            Number of burn-in steps before the actual MCMC sampling begins
        precondition : bool
            Turns on / off preconditioning. See [1] for more details
        normalize_input : bool
            Turns on / off zero mean unit variance normalization of the input data
        normalize_output : bool
            Turns on / off zero mean unit variance normalization of the output data
        rng : np.random.RandomState
            Random number generator
        get_net : func
            Function that returns a network specification.
        """
        if rng is None:
            self.rng = np.random.RandomState(np.random.randint(100000))
        else:
            self.rng = rng

        lasagne.random.set_rng(self.rng)

        self.sampling_method = sampling_method
        self.n_nets = n_nets
        self.l_rate = l_rate
        self.mdecay = mdecay
        self.n_iters = n_iters
        self.bsize = bsize
        self.burn_in = burn_in
        self.precondition = precondition
        self.is_trained = False
        self.normalize_output = normalize_output
        self.normalize_input = normalize_input
        self.get_net = get_net

        # Ring buffer that keeps the last n_nets sampled weight vectors
        self.samples = deque(maxlen=n_nets)

        self.variance_prior = LogVariancePrior(1e-6, prior_out_std_prec=0.01)
        self.weight_prior = WeightPrior(alpha=1., beta=1.)

        # Symbolic input / target variables for the Theano graph
        self.Xt = T.matrix()
        self.Yt = T.matrix()

        self.X = None
        self.x_mean = None
        self.x_std = None
        self.y = None
        self.y_mean = None
        self.y_std = None

    @BaseModel._check_shapes_train
    def train(self, X, y, *args, **kwargs):
        """
        Trains the model on the provided data.

        Parameters
        ----------
        X: np.ndarray (N, D)
            Input data points. The dimensionality of X is (N, D),
            with N as the number of points and D as the number of features.
        y: np.ndarray (N,)
            The corresponding target values.
        """
        start_time = time.time()

        # Build the network and the negative log likelihood of the data
        self.net = self.get_net(n_inputs=X.shape[1])
        nll, mse = self.negativ_log_likelihood(self.net, self.Xt, self.Yt,
                                               X.shape[0],
                                               self.weight_prior,
                                               self.variance_prior)
        params = lasagne.layers.get_all_params(self.net, trainable=True)

        seed = self.rng.randint(1, 100000)
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(seed)

        if self.sampling_method == "sghmc":
            self.sampler = SGHMCSampler(rng=srng,
                                        precondition=self.precondition,
                                        ignore_burn_in=False)
        elif self.sampling_method == "sgld":
            self.sampler = SGLDSampler(rng=srng,
                                       precondition=self.precondition)
        else:
            logging.error("Sampling strategy %s does not exist!"
                          % self.sampling_method)
        self.compute_err = theano.function([self.Xt, self.Yt], [mse, nll])
        self.single_predict = theano.function(
            [self.Xt], lasagne.layers.get_output(self.net, self.Xt))

        # Clear old samples
        self.samples.clear()

        if self.normalize_input:
            self.X, self.x_mean, self.x_std = zero_mean_unit_var_normalization(X)
        else:
            self.X = X

        if self.normalize_output:
            self.y, self.y_mean, self.y_std = zero_mean_unit_var_normalization(y)
        else:
            self.y = y

        self.sampler.prepare_updates(nll, params, self.l_rate,
                                     mdecay=self.mdecay,
                                     inputs=[self.Xt, self.Yt],
                                     scale_grad=X.shape[0])

        logging.info("Starting sampling")

        # Check if we have enough data points to form a minibatch,
        # otherwise set the batch size equal to the number of input points
        if self.X.shape[0] < self.bsize:
            self.bsize = self.X.shape[0]
            logging.error("Not enough data points to form a minibatch. "
                          "Setting the batch size to {}".format(self.bsize))

        i = 0
        while i < self.n_iters and len(self.samples) < self.n_nets:
            if self.X.shape[0] == self.bsize:
                start = 0
            else:
                start = np.random.randint(0, self.X.shape[0] - self.bsize)

            xmb = floatX(self.X[start:start + self.bsize])
            ymb = floatX(self.y[start:start + self.bsize, None])

            if i < self.burn_in:
                _, nll_value = self.sampler.step_burn_in(xmb, ymb)
            else:
                _, nll_value = self.sampler.step(xmb, ymb)

            if i % 1000 == 0:
                total_err, total_nll = self.compute_err(
                    floatX(self.X), floatX(self.y).reshape(-1, 1))
                t = time.time() - start_time
                logging.info("Iter {} : NLL = {} MSE = {} "
                             "Collected samples = {} Time = {}".format(
                                 i, total_nll, total_err, len(self.samples), t))

            # After burn-in, store a snapshot of the weights every 100 steps
            if i % 100 == 0 and i >= self.burn_in:
                self.samples.append(
                    lasagne.layers.get_all_param_values(self.net))

            i += 1

        self.is_trained = True

    def negativ_log_likelihood(self, f_net, X, y, n_examples,
                               weight_prior, variance_prior):

        f_out = lasagne.layers.get_output(f_net, X)
        # The network predicts the mean and the log variance of a Gaussian
        f_mean = f_out[:, 0].reshape((-1, 1))
        f_log_var = f_out[:, 1].reshape((-1, 1))

        f_var_inv = 1. / (T.exp(f_log_var) + 1e-16)
        mse = T.square(y - f_mean)
        log_like = T.sum(
            T.sum(-mse * (0.5 * f_var_inv) - 0.5 * f_log_var, axis=1))

        # scale by batch size to make this work nicely with the updaters above
        log_like /= T.cast(X.shape[0], theano.config.floatX)

        # scale the priors by the dataset size for the same reason
        # prior for the variance
        tn_examples = T.cast(n_examples, theano.config.floatX)
        log_like += variance_prior.log_like(f_log_var, n_examples)

        # prior for the weights
        params = lasagne.layers.get_all_params(f_net, trainable=True)
        log_like += weight_prior.log_like(params) / tn_examples

        return -log_like, T.mean(mse)

    @BaseModel._check_shapes_predict
    def predict(self, X_test, return_individual_predictions=False,
                *args, **kwargs):
        """
        Returns the predictive mean and variance of the objective function
        at the given test points.

        Parameters
        ----------
        X_test: np.ndarray (N, D)
            Input test points
        return_individual_predictions: bool
            If set to true, the individual predictions of all samples are returned.

        Returns
        ----------
        np.array(N,)
            predictive mean
        np.array(N,)
            predictive variance
        """
        if not self.is_trained:
            logging.error("Model is not trained!")
            return

        # Normalize input
        if self.normalize_input:
            X_, _, _ = zero_mean_unit_var_normalization(X_test,
                                                        self.x_mean, self.x_std)
        else:
            X_ = X_test

        # Evaluate every stored weight sample on the test points
        f_out = []
        theta_noise = []
        for sample in self.samples:
            lasagne.layers.set_all_param_values(self.net, sample)
            out = self.single_predict(X_)
            f_out.append(out[:, 0])
            theta_noise.append(np.exp(out[:, 1]))

        f_out = np.asarray(f_out)
        theta_noise = np.asarray(theta_noise)

        if return_individual_predictions:
            return f_out, theta_noise

        m = np.mean(f_out, axis=0)
        # Total variance
        # v = np.mean(f_out ** 2 + theta_noise, axis=0) - m ** 2
        v = np.mean((f_out - m) ** 2, axis=0)

        if self.normalize_output:
            m = zero_mean_unit_var_unnormalization(m, self.y_mean, self.y_std)
            v *= self.y_std ** 2

        return m, v

    def sample_functions(self, X_test, n_funcs=1):
        """
        Samples F function values from the current posterior at the N
        specified test points.

        Parameters
        ----------
        X_test: np.ndarray (N, D)
            Input test points
        n_funcs: int
            Number of function values that are drawn at each test point.

        Returns
        ----------
        np.array(F, N)
            The F function values drawn at the N test points.
        """
        if self.normalize_input:
            X_test_norm, _, _ = zero_mean_unit_var_normalization(X_test,
                                                                 self.x_mean,
                                                                 self.x_std)
        else:
            X_test_norm = X_test

        f = np.zeros([n_funcs, X_test_norm.shape[0]])
        for i in range(n_funcs):
            lasagne.layers.set_all_param_values(self.net, self.samples[i])
            out = self.single_predict(X_test_norm)[:, 0]
            if self.normalize_output:
                f[i, :] = zero_mean_unit_var_unnormalization(out,
                                                             self.y_mean,
                                                             self.y_std)
            else:
                f[i, :] = out

        return f

    def get_incumbent(self):
        """
        Returns the best observed point and its function value

        Returns
        ----------
        incumbent: ndarray (D,)
            current incumbent
        incumbent_value: ndarray (N,)
            the observed value of the incumbent
        """
        if self.normalize_input:
            X = zero_mean_unit_var_unnormalization(self.X,
                                                   self.x_mean, self.x_std)
            m = self.predict(X)[0]
        else:
            m = self.predict(self.X)[0]

        best_idx = np.argmin(self.y)
        inc = self.X[best_idx]
        # predict() already returns values in the original output scale,
        # so only the input still has to be unnormalized here
        inc_value = m[best_idx]

        if self.normalize_input:
            inc = zero_mean_unit_var_unnormalization(inc,
                                                     self.x_mean, self.x_std)

        return inc, inc_value
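

# The symbolic expression built in negativ_log_likelihood above can be hard to
# read, so the small NumPy sketch below spells out the same per-batch Gaussian
# negative log likelihood for a network that outputs a predictive mean and a
# log variance per point. It is purely illustrative and not used by the class;
# the prior terms added in the symbolic version and the constant
# 0.5 * log(2 * pi) are omitted.
def _example_gaussian_nll(y, f_mean, f_log_var):
    """Illustrative NumPy version of the heteroscedastic Gaussian NLL.

    All arrays have shape (batch_size, 1); returns the mean NLL over the batch.
    """
    f_var_inv = 1. / (np.exp(f_log_var) + 1e-16)
    mse = np.square(y - f_mean)
    # 0.5 * (y - mu)^2 / sigma^2 + 0.5 * log sigma^2, averaged over the batch
    return np.mean(0.5 * mse * f_var_inv + 0.5 * f_log_var)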
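

# Minimal usage sketch, assuming the module-level imports (numpy, Theano,
# Lasagne, the SGHMC/SGLD samplers and the priors) are available and that
# get_default_net builds a two-output (mean, log variance) network. The toy
# data and hyperparameters below are illustrative only, not recommended
# settings.
if __name__ == "__main__":
    rng = np.random.RandomState(42)

    # Toy 1-D regression problem: X has shape (N, D), y has shape (N,)
    X = rng.uniform(0, 1, size=(50, 1))
    y = np.sin(3 * X[:, 0]) + rng.normal(0, 0.1, size=50)

    model = BayesianNeuralNetwork(sampling_method="sghmc", n_nets=20,
                                  burn_in=1000, n_iters=10000,
                                  bsize=10, rng=rng)
    model.train(X, y)

    # Predictive mean and variance (both of shape (100,)) at new test points
    X_test = np.linspace(0, 1, 100)[:, None]
    mean, var = model.predict(X_test)

    # Draw five function samples from the posterior at the same test points
    funcs = model.sample_functions(X_test, n_funcs=5)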