def predict(self, x_test: np.ndarray, return_individual_predictions: bool = False):
    """
    Predicts mean and variance for the given test points.

    :param x_test: test datapoints
    :param return_individual_predictions: if True, the predictions of the
        individual sampled networks are returned as well
    :return: mean and variance
    """
    x_test_ = np.asarray(x_test)

    if self.do_normalize_input:
        x_test_, *_ = self.normalize_input(x_test_, self.x_mean, self.x_std)

    def network_predict(x_test_, weights):
        with torch.no_grad():
            self.network_weights = weights
            if self.use_double_precision:
                return self.model(torch.from_numpy(x_test_).double()).numpy()
            else:
                return self.model(torch.from_numpy(x_test_).float()).numpy()

    logging.debug("Predicting with %d networks." % len(self.sampled_weights))

    network_outputs = np.array([
        network_predict(x_test_, weights=weights)
        for weights in self.sampled_weights
    ])

    mean_prediction = np.mean(network_outputs[:, :, 0], axis=0)
    # Total variance: spread of the per-network means (epistemic) plus the
    # mean of the per-network noise variances (second output is a log-variance).
    variance_prediction = np.mean(
        (network_outputs[:, :, 0] - mean_prediction) ** 2
        + np.exp(network_outputs[:, :, 1]),
        axis=0)

    if self.do_normalize_output:
        mean_prediction = zero_mean_unit_var_denormalization(
            mean_prediction, self.y_mean, self.y_std)
        variance_prediction *= self.y_std ** 2

        for i in range(len(network_outputs)):
            network_outputs[i] = zero_mean_unit_var_denormalization(
                network_outputs[i], self.y_mean, self.y_std)

    if return_individual_predictions:
        return mean_prediction, variance_prediction, network_outputs[:, :, 0]

    return mean_prediction, variance_prediction
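A minimal stand-alone sketch of the ensemble aggregation performed above, on toy arrays; the helper name `aggregate_predictions` and the toy numbers are illustrative, not part of the class:

import numpy as np

def aggregate_predictions(network_outputs):
    """Combine per-network (mean, log-variance) outputs of shape (S, N, 2)
    into a predictive mean and total variance, each of shape (N,)."""
    means = network_outputs[:, :, 0]       # (S, N) per-sample predictive means
    log_vars = network_outputs[:, :, 1]    # (S, N) per-sample log-variances
    mean = means.mean(axis=0)
    # Total variance = spread of the means (epistemic) + mean aleatoric variance
    variance = np.mean((means - mean) ** 2 + np.exp(log_vars), axis=0)
    return mean, variance

# Toy check: 3 weight samples, 2 test points
outputs = np.array([[[0.9, -2.0], [1.2, -1.5]],
                    [[1.1, -2.1], [0.8, -1.4]],
                    [[1.0, -1.9], [1.0, -1.6]]])
mu, var = aggregate_predictions(outputs)
print(mu.shape, var.shape)  # (2,) (2,)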
def predict_single(self, x_test: np.ndarray, sample_index: int):
    """
    Compute the prediction of a single weight sample.

    :param x_test: test datapoint
    :param sample_index: specifies the index of the weight sample
    :return: mean and variance of the neural network
    """
    x_test_ = np.asarray(x_test)

    if self.do_normalize_input:
        x_test_, *_ = self.normalize_input(x_test_, self.x_mean, self.x_std)

    def network_predict(x_test_, weights):
        with torch.no_grad():
            self.network_weights = weights
            if self.use_double_precision:
                return self.model(torch.from_numpy(x_test_).double()).numpy()
            else:
                return self.model(torch.from_numpy(x_test_).float()).numpy()

    logging.debug("Predicting with %d networks." % len(self.sampled_weights))

    function_value = np.array(
        network_predict(x_test_, weights=self.sampled_weights[sample_index]))

    if self.do_normalize_output:
        function_value = zero_mean_unit_var_denormalization(
            function_value, self.y_mean, self.y_std)

    return function_value
def predict(self, X_test): r""" Returns the predictive mean and variance of the objective function at the given test points. Parameters ---------- X_test: np.ndarray (N, D) N input test points Returns ---------- np.array(N,) predictive mean np.array(N,) predictive variance """ # Normalize inputs if self.normalize_input: X_, _, _ = zero_mean_unit_var_normalization( X_test, self.X_mean, self.X_std) else: X_ = X_test # Get features from the net if self.gpu: network = self.network.cpu() else: network = self.network theta = network.basis_funcs(torch.Tensor(X_)).data.numpy() # Marginalise predictions over hyperparameters of the BLR mu = np.zeros([len(self.models), X_test.shape[0]]) var = np.zeros([len(self.models), X_test.shape[0]]) for i, m in enumerate(self.models): mu[i], var[i] = m.predict(theta) # See the algorithm runtime prediction paper by Hutter et al # for the derivation of the total variance m = np.mean(mu, axis=0) v = np.mean(mu**2 + var, axis=0) - m**2 # Clip negative variances and set them to the smallest # positive float value if v.shape[0] == 1: v = np.clip(v, np.finfo(v.dtype).eps, np.inf) else: v = np.clip(v, np.finfo(v.dtype).eps, np.inf) v[np.where((v < np.finfo(v.dtype).eps) & (v > -np.finfo(v.dtype).eps))] = 0 if self.normalize_output: m = zero_mean_unit_var_denormalization(m, self.y_mean, self.y_std) v *= self.y_std**2 return m, v
def predict(self, X_test): r""" Returns the predictive mean and variance of the objective function at the given test points. Parameters ---------- X_test: np.ndarray (N, D) N input test points Returns ---------- np.array(N,) predictive mean np.array(N,) predictive variance """ # Normalize inputs if self.normalize_input: X_, _, _ = zero_mean_unit_var_normalization( X_test, self.X_mean, self.X_std) else: X_ = X_test # Perform MC dropout model = self.model T = self.T # Yt_hat: T x N x 1 Yt_hat = np.array( [model(torch.Tensor(X_)).data.numpy() for _ in range(T)]) # Yt_hat = Yt_hat * self.std_y_train + self.mean_y_train # T x N TODO check with Adam MC_pred_mean = np.mean(Yt_hat, 0) # N x 1 Second_moment = np.mean(Yt_hat**2, 0) # N x 1 # MC_pred_var = Second_moment + np.eye(Yt_hat.shape[-1]) / self.tau - (MC_pred_mean ** 2) MC_pred_var = Second_moment - (MC_pred_mean**2) m = MC_pred_mean.flatten() if MC_pred_var.shape[0] == 1: v = np.clip(MC_pred_var, np.finfo(MC_pred_var.dtype).eps, np.inf) else: v = np.clip(MC_pred_var, np.finfo(MC_pred_var.dtype).eps, np.inf) v[np.where((v < np.finfo(v.dtype).eps) & (v > -np.finfo(v.dtype).eps))] = 0 if self.normalize_output: m = zero_mean_unit_var_denormalization(m, self.y_mean, self.y_std) v *= self.y_std**2 m = m.flatten() v = v.flatten() return m, v
def get_incumbent(self):
    """
    Returns the best observed point and its function value.

    Returns
    ----------
    incumbent: ndarray (D,)
        current incumbent
    incumbent_value: ndarray (N,)
        the observed value of the incumbent
    """
    inc, inc_value = super(LCCD, self).get_incumbent()

    if self.normalize_input:
        inc = zero_mean_unit_var_denormalization(inc, self.X_mean, self.X_std)

    if self.normalize_output:
        inc_value = zero_mean_unit_var_denormalization(inc_value, self.y_mean, self.y_std)

    return inc, inc_value
def train(self, x_train: np.ndarray, y_train: np.ndarray,
          num_steps: int = 13000,
          keep_every: int = 100,
          num_burn_in_steps: int = 3000,
          lr: float = 1e-2,
          batch_size=20,
          epsilon: float = 1e-10,
          mdecay: float = 0.05,
          continue_training: bool = False,
          verbose: bool = False,
          **kwargs):
    """
    Train a BNN using input datapoints `x_train` with corresponding targets `y_train`.

    :param x_train: input training datapoints.
    :param y_train: input training targets.
    :param num_steps: Number of sampling steps to perform after burn-in is finished.
        In total, `num_steps // keep_every` network weights will be sampled.
    :param keep_every: Number of sampling steps (after burn-in) to perform before keeping a sample.
        In total, `num_steps // keep_every` network weights will be sampled.
    :param num_burn_in_steps: Number of burn-in steps to perform.
        This value is passed to the given `optimizer` if it supports special
        burn-in specific behavior. Networks sampled during burn-in are discarded.
    :param lr: learning rate
    :param batch_size: batch size
    :param epsilon: epsilon for numerical stability
    :param mdecay: momentum decay
    :param continue_training: defines whether we want to continue from the last training run
    :param verbose: verbose output
    """
    logging.debug("Training started.")
    start_time = time.time()

    num_datapoints, input_dimensionality = x_train.shape
    logging.debug("Processing %d training datapoints "
                  "with %d dimensions each." %
                  (num_datapoints, input_dimensionality))
    assert batch_size >= 1, "Invalid batch size. Batches must contain at least a single sample."
    assert len(y_train.shape) == 1 or (len(y_train.shape) == 2 and y_train.shape[1] == 1), \
        "Targets need to be in vector format, i.e. (N,) or (N, 1)"

    if x_train.shape[0] < batch_size:
        logging.warning("Not enough datapoints to form a batch. "
                        "Using all datapoints in each batch.")
        batch_size = x_train.shape[0]

    self.X = x_train
    if len(y_train.shape) == 2:
        self.y = y_train[:, 0]
    else:
        self.y = y_train

    if self.do_normalize_input:
        logging.debug("Normalizing training datapoints to "
                      "zero mean and unit variance.")
        x_train_, self.x_mean, self.x_std = self.normalize_input(x_train)
        if self.use_double_precision:
            x_train_ = torch.from_numpy(x_train_).double()
        else:
            x_train_ = torch.from_numpy(x_train_).float()
    else:
        if self.use_double_precision:
            x_train_ = torch.from_numpy(x_train).double()
        else:
            x_train_ = torch.from_numpy(x_train).float()

    if self.do_normalize_output:
        logging.debug("Normalizing training labels to zero mean and unit variance.")
        y_train_, self.y_mean, self.y_std = self.normalize_output(self.y)

        if self.use_double_precision:
            y_train_ = torch.from_numpy(y_train_).double()
        else:
            y_train_ = torch.from_numpy(y_train_).float()
    else:
        if self.use_double_precision:
            y_train_ = torch.from_numpy(y_train).double()
        else:
            y_train_ = torch.from_numpy(y_train).float()

    if self.use_double_precision:
        dtype = np.float64
    else:
        dtype = np.float32

    if not continue_training:
        logging.debug("Clearing list of sampled weights.")
        self.sampled_weights.clear()

        if self.use_double_precision:
            self.model = self.get_network(n_curves=num_datapoints).double()
        else:
            self.model = self.get_network(n_curves=num_datapoints).float()

        if self.sampling_method == "adaptive_sghmc":
            self.sampler = AdaptiveSGHMC(self.model.parameters(),
                                         scale_grad=dtype(num_datapoints),
                                         num_burn_in_steps=num_burn_in_steps,
                                         lr=dtype(lr),
                                         mdecay=dtype(mdecay),
                                         epsilon=dtype(epsilon))
        elif self.sampling_method == "sgld":
            self.sampler = SGLD(self.model.parameters(),
                                lr=dtype(lr),
                                scale_grad=num_datapoints)
        elif self.sampling_method == "preconditioned_sgld":
            self.sampler = PreconditionedSGLD(self.model.parameters(),
                                              lr=dtype(lr),
                                              num_train_points=num_datapoints)
        elif self.sampling_method == "sghmc":
            self.sampler = SGHMC(self.model.parameters(),
                                 scale_grad=dtype(num_datapoints),
                                 mdecay=dtype(mdecay),
                                 lr=dtype(lr))

    data_loader = data_utils.DataLoader(data_utils.TensorDataset(x_train_, y_train_),
                                        batch_size=batch_size,
                                        shuffle=True)
    train_loader = infinite_dataloader(data_loader)
    batch_generator = islice(enumerate(train_loader), num_steps)

    for step, (x_batch, y_batch) in batch_generator:
        self.sampler.zero_grad()
        loss = self.likelihood_function(input=self.model(x_batch), target=y_batch)

        # Add the priors. Note that the gradient is computed as
        # g_prior + N/n * sum_i grad_theta_xi (see Eq. 4 in Welling and Teh, 2011).
        # Because of that we divide by N = num_datapoints here, since the
        # sampler rescales the gradient by N again.
        loss -= log_variance_prior(self.model(x_batch)[:, 1].view((-1, 1))) / num_datapoints
        loss -= weight_prior(self.model.parameters(), dtype=dtype) / num_datapoints
        loss.backward()
        self.sampler.step()

        if verbose and step > 0 and step % self.print_every_n_steps == 0:

            # Compute the training performance of the ensemble
            if len(self.sampled_weights) > 1:
                mu, var = self.predict(x_train)
                total_nll = -np.mean(norm.logpdf(y_train, loc=mu, scale=np.sqrt(var)))
                total_mse = np.mean((y_train - mu) ** 2)

            # In case we do not have an ensemble yet, compute the performance
            # of the last weight sample
            else:
                f = self.model(x_train_)

                if self.do_normalize_output:
                    mu = zero_mean_unit_var_denormalization(
                        f[:, 0], self.y_mean, self.y_std).data.numpy()
                    var = torch.exp(f[:, 1]) * self.y_std ** 2
                    var = var.data.numpy()
                else:
                    mu = f[:, 0].data.numpy()
                    var = np.exp(f[:, 1].data.numpy())

                total_nll = -np.mean(norm.logpdf(y_train, loc=mu, scale=np.sqrt(var)))
                total_mse = np.mean((y_train - mu) ** 2)

            t = time.time() - start_time

            if step < num_burn_in_steps:
                print("Step {:8d} : NLL = {:11.4e} MSE = {:.4e} "
                      "Time = {:5.2f}".format(step, float(total_nll),
                                              float(total_mse), t))

            if step > num_burn_in_steps:
                print("Step {:8d} : NLL = {:11.4e} MSE = {:.4e} "
                      "Samples = {} Time = {:5.2f}".format(
                          step, float(total_nll), float(total_mse),
                          len(self.sampled_weights), t))

        if step > num_burn_in_steps and (step - num_burn_in_steps) % keep_every == 0:
            weights = self.network_weights
            self.sampled_weights.append(weights)

    self.is_trained = True
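A small self-contained sketch of the burn-in and thinning rule used in the sampling loop above, with toy step counts (the numbers are illustrative; only the keep-condition mirrors the code):

# Which steps contribute a stored weight sample, given toy settings
num_steps = 1000
num_burn_in_steps = 300
keep_every = 100

kept = [step for step in range(num_steps)
        if step > num_burn_in_steps and (step - num_burn_in_steps) % keep_every == 0]
print(kept)       # [400, 500, 600, 700, 800, 900]
print(len(kept))  # number of weight samples that would be stored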
def predict(self, X_test): r""" Returns the predictive mean and variance of the objective function at the given test points. Parameters ---------- X_test: np.ndarray (N, D) N input test points Returns ---------- np.array(N,) predictive mean np.array(N,) predictive variance """ # Normalize inputs if self.normalize_input: X_, _, _ = zero_mean_unit_var_normalization(X_test, self.X_mean, self.X_std) else: X_ = X_test # Perform MC dropout model = self.model model.eval() T = self.T # model.eval() # MC_samples : list T x N x 1 # Yt_hat = np.array([model(torch.Tensor(X_)).data.numpy() for _ in range(T)]) # start_mc=time.time() gpu_test = False if gpu_test: X_tensor = Variable(torch.FloatTensor(X_)).to(self.device) MC_samples = [model(X_tensor) for _ in range(T)] means = torch.stack([tup[0] for tup in MC_samples]).view(T, X_.shape[0]).cpu().data.numpy() # logvar = torch.stack([tup[1] for tup in MC_samples]).view(T, X_.shape[0]).cpu().data.numpy() else: model.cpu() MC_samples = [model(Variable(torch.FloatTensor(X_))) for _ in range(T)] means = torch.stack([tup[0] for tup in MC_samples]).view(T, X_.shape[0]).data.numpy() # logvar = torch.stack([tup[1] for tup in MC_samples]).view(T, X_.shape[0]).data.numpy() # mc_time = time.time() - start_mc # print(f'mc_time={mc_time}') # logvar = np.mean(logvar,0) # aleatoric_uncertainty = np.exp(logvar).mean(0) # epistemic_uncertainty = np.var(means, 0).mean(0) aleatoric_uncertainty = self.aleatoric_uncertainty MC_pred_mean = np.mean(means, 0) # N x 1 means_var = np.var(means, 0) MC_pred_var = means_var + aleatoric_uncertainty # MC_pred_var = means_var + np.mean(np.exp(logvar), 0) m = MC_pred_mean.flatten() if MC_pred_var.shape[0] == 1: v = np.clip(MC_pred_var, np.finfo(MC_pred_var.dtype).eps, np.inf) else: v = np.clip(MC_pred_var, np.finfo(MC_pred_var.dtype).eps, np.inf) v[np.where((v < np.finfo(v.dtype).eps) & (v > -np.finfo(v.dtype).eps))] = 0 if self.normalize_output: m = zero_mean_unit_var_denormalization(m, self.y_mean, self.y_std) v *= self.y_std ** 2 m = m.flatten() v = v.flatten() return m, v
def predict(self, X_test): r""" Returns the predictive mean and variance of the objective function at the given test points. Parameters ---------- X_test: np.ndarray (N, D) N input test points Returns ---------- np.array(N,) predictive mean np.array(N,) predictive variance """ # Normalize inputs if self.normalize_input: X_, _, _ = zero_mean_unit_var_normalization( X_test, self.X_mean, self.X_std) else: X_ = X_test # Perform MC dropout model = self.model T = self.T model.eval() # MC_samples : list T x N x 1 # Yt_hat = np.array([model(torch.Tensor(X_)).data.numpy() for _ in range(T)]) MC_samples = [model(Variable(torch.FloatTensor(X_))) for _ in range(T)] means = torch.stack([tup[0] for tup in MC_samples ]).view(T, X_.shape[0]).data.numpy() logvar = torch.stack([tup[1] for tup in MC_samples ]).view(T, X_.shape[0]).data.numpy() # Yt_hat = Yt_hat * self.std_y_train + self.mean_y_train # T x N TODO check with Adam aleatoric_uncertainty = np.exp(logvar).mean(0) epistemic_uncertainty = np.var(means, 0).mean(0) MC_pred_mean = np.mean(means, 0) # N x 1 Second_moment = np.mean(means**2, 0) # N x 1 MC_pred_var = Second_moment + epistemic_uncertainty - (MC_pred_mean**2) m = MC_pred_mean.flatten() if MC_pred_var.shape[0] == 1: v = np.clip(MC_pred_var, np.finfo(MC_pred_var.dtype).eps, np.inf) else: v = np.clip(MC_pred_var, np.finfo(MC_pred_var.dtype).eps, np.inf) v[np.where((v < np.finfo(v.dtype).eps) & (v > -np.finfo(v.dtype).eps))] = 0 if self.normalize_output: m = zero_mean_unit_var_denormalization(m, self.y_mean, self.y_std) v *= self.y_std**2 m = m.flatten() v = v.flatten() return m, v