# %%
# General training
epochs = 100
batch_size = n_train

# %% markdown
# # MAP Density Model

# %%
initial_learning_rate = 0.01
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=n_train, decay_rate=0.9, staircase=True
)

net = MapDensityNetwork(
    input_shape=[1],
    layer_units=layer_units,
    layer_activations=layer_activations,
    learning_rate=lr_schedule,
)

net.fit(
    x_train=x_train, y_train=y_train, batch_size=batch_size, epochs=epochs, verbose=0
)

# %%
prediction = net.predict(x_plot)  # mixture-of-Gaussians prediction
fig, ax = plt.subplots(figsize=figsize)
plot_moment_matched_predictive_normal_distribution(
    x_plot=x_plot,
    predictive_distribution=prediction,
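# %%
# Side note on the schedule used above (a sketch, separate from the training
# script): tf.keras.optimizers.schedules.ExponentialDecay computes
#     lr(step) = initial_learning_rate * decay_rate ** (step / decay_steps),
# flooring the exponent when staircase=True. Since batch_size = n_train means
# one optimizer step per epoch, decay_steps=n_train decays the rate by the
# factor 0.9 once every n_train epochs. The schedule below is illustrative only.
import numpy as np
import tensorflow as tf

demo_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    0.01, decay_steps=10, decay_rate=0.9, staircase=True
)
assert np.isclose(demo_schedule(9).numpy(), 0.01)  # before the first decay
assert np.isclose(demo_schedule(10).numpy(), 0.009)  # after one decay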
y_lim = [-5, 7]
fig, ax = plt.subplots(figsize=(8, 8))
plot_training_data(x_train, y_train, fig=fig, ax=ax, y_lim=y_lim)
plot_ground_truth(x_plot, y_ground_truth, fig=fig, ax=ax)
ax.legend()

# %%
initial_learning_rate = 0.05
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=20, decay_rate=0.9, staircase=True
)

net = MapDensityNetwork(
    input_shape=input_shape,
    layer_units=layer_units,
    layer_activations=layer_activations,
    weight_prior=weight_prior,
    bias_prior=bias_prior,
    n_train=n_train,
    learning_rate=0.01,
)

prior_predictive_distributions = net.predict_with_prior_samples(x_plot, n_samples=4)
plot_distribution_samples(
    x_plot=x_plot,
    distribution_samples=prior_predictive_distributions,
    x_train=x_train,
    y_train=y_train,
    y_ground_truth=y_ground_truth,
    # y_lim=[-30, 30],
bias_priors = [tfd.Normal(0, bias_prior_scale)] * len(layer_units)

l2_weight_lambda = prior_scale_to_regularization_lambda(weight_prior_scale, n_train)
l2_bias_lambda = prior_scale_to_regularization_lambda(bias_prior_scale, n_train)
assert np.isclose(
    weight_prior_scale, regularization_lambda_to_prior_scale(l2_weight_lambda, n_train)
)

# %%
seed = 0
model = MapDensityNetwork(
    input_shape=[1],
    layer_units=layer_units,
    layer_activations=layer_activations,
    initial_unconstrained_scale=0.0,
    l2_weight_lambda=l2_weight_lambda,
    l2_bias_lambda=l2_bias_lambda,
    seed=seed,
)

models = []
n_models = 4
seeds = np.arange(n_models)
initial_unconstrained_scales = seeds + 0.1
for seed, initial_unconstrained_scale in zip(seeds, initial_unconstrained_scales):
    m = MapDensityNetwork(
        input_shape=[1],
        layer_units=layer_units,
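# %%
# A minimal sketch of the conversion that prior_scale_to_regularization_lambda
# and regularization_lambda_to_prior_scale are assumed to implement here: with
# an independent N(0, sigma^2) prior on each weight and a loss equal to the
# *mean* negative log-likelihood over the n_train training points, the MAP
# objective matches Keras L2 regularization with
# lambda = 1 / (2 * n_train * sigma^2). The helper names with a leading
# underscore below are hypothetical stand-ins, not the repo's functions.
import numpy as np

def _prior_scale_to_lambda(scale, n_train):
    return 1.0 / (2.0 * n_train * scale ** 2)

def _lambda_to_prior_scale(lam, n_train):
    return 1.0 / np.sqrt(2.0 * n_train * lam)

# The round trip reproduces the assert in the script above.
assert np.isclose(2.0, _lambda_to_prior_scale(_prior_scale_to_lambda(2.0, 20), 20))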
    # save_path=figure_dir.joinpath(f"llb_moment_matched_{experiment_name}.pdf")
)

# %% markdown
# # Using pretrained network

# %%
initial_learning_rate = 0.01
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=n_train, decay_rate=0.9, staircase=True
)

net = MapDensityNetwork(
    input_shape=input_shape,
    layer_units=layer_units,
    layer_activations=layer_activations,
    initial_unconstrained_scale=initial_unconstrained_scale,
    transform_unconstrained_scale_factor=transform_unconstrained_scale_factor,
    preprocess_x=preprocess_x,
    preprocess_y=preprocess_y,
    learning_rate=lr_schedule,
    names=[None, "feature_extractor", "output"],
    seed=0,
)
net.fit(
    x_train=x_train, y_train=y_train, batch_size=batch_size, epochs=epochs, verbose=0
)
prediction = net.predict(x_plot)

plot_moment_matched_predictive_normal_distribution(
    x_plot=x_plot,
    predictive_distribution=prediction,
    x_train=x_train,
bias_prior = weight_prior
network_prior = make_independent_gaussian_network_prior(
    input_shape=input_shape, layer_units=layer_units, loc=0.0, scale=1.0
)

# %% markdown
# ### Let's first train a MAP network

# %%
net = MapDensityNetwork(
    input_shape=input_shape,
    layer_units=layer_units,
    layer_activations=layer_activations,
    transform_unconstrained_scale_factor=transform_unconstrained_scale_factor,
    weight_prior=weight_prior,
    bias_prior=bias_prior,
    n_train=n_train,
    preprocess_y=False,
    learning_rate=0.01,
)

early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="loss", patience=20, verbose=1, restore_best_weights=False
)

net.fit(
    x_train=x_train,
    y_train=y_train,
    batch_size=10,
    epochs=10000,
    early_stop_callback=early_stop_callback,
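# %%
# A minimal sketch of what make_independent_gaussian_network_prior is assumed
# to build: one independent N(loc, scale) prior per weight matrix and per bias
# vector, so log-prior terms factorize across layers. The actual helper in this
# repo may return a different structure; the function below is hypothetical.
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

def _independent_gaussian_network_prior(input_shape, layer_units, loc=0.0, scale=1.0):
    fan_ins = [input_shape[0]] + list(layer_units[:-1])
    priors = []
    for fan_in, units in zip(fan_ins, layer_units):
        # Weights of a dense layer have shape (fan_in, units), biases (units,).
        priors.append(
            tfd.Independent(
                tfd.Normal(loc=loc * tf.ones((fan_in, units)), scale=scale),
                reinterpreted_batch_ndims=2,
            )
        )
        priors.append(
            tfd.Independent(
                tfd.Normal(loc=loc * tf.ones((units,)), scale=scale),
                reinterpreted_batch_ndims=1,
            )
        )
    return priors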
def map_network_likelihood_loss(net, x_train, y_train):
    # Mean negative log-likelihood of the training data under the network's
    # predictive density.
    loss = tf.reduce_mean(
        MapDensityNetwork.negative_log_likelihood(y_train, net.network(x_train))
    )
    return loss
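# %%
# For reference, a minimal sketch of the same loss written directly against
# TensorFlow Probability, assuming net.network(x) returns a tfd.Distribution
# over y and that negative_log_likelihood(y, dist) == -dist.log_prob(y):
import tensorflow as tf

def map_network_likelihood_loss_tfp(net, x_train, y_train):
    predictive_distribution = net.network(x_train)
    # Mean negative log-likelihood over the training points.
    return -tf.reduce_mean(predictive_distribution.log_prob(y_train))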
batch_size = n_train

weight_prior_scale = 2
bias_prior_scale = weight_prior_scale
weight_prior = tfd.Normal(0, weight_prior_scale)
bias_prior = tfd.Normal(0, bias_prior_scale)
weight_priors = [weight_prior] * len(layer_units)
bias_priors = [bias_prior] * len(layer_units)

# %%
seed = 0
model = MapDensityNetwork(
    input_shape=[1],
    layer_units=layer_units,
    layer_activations=layer_activations,
    initial_unconstrained_scale=0.0,
    weight_prior=weight_prior,
    bias_prior=bias_prior,
    scale_prior=tfd.InverseGamma(0.1, 0.1),
    n_train=n_train,
    seed=seed,
)

models = []
n_models = 4
seeds = np.arange(n_models)
initial_unconstrained_scales = seeds + 0.1
for seed, initial_unconstrained_scale in zip(seeds, initial_unconstrained_scales):
    m = MapDensityNetwork(
        input_shape=[1],
class PostHocLastLayerBayesianNetwork:
    def __init__(
        self,
        input_shape=[1],
        layer_units=[200, 100, 1],
        layer_activations=["relu", "relu", "linear"],
        initial_unconstrained_scale=None,
        # factor used in the calculation of the actual noise std:
        transform_unconstrained_scale_factor=0.05,
        l2_weight_lambda=None,  # float or list of floats
        l2_bias_lambda=None,
        preprocess_x=False,
        preprocess_y=False,
        # can be a float or an instance of tf.keras.optimizers.schedules:
        learning_rate=0.01,
        last_layer_prior="non-informative",
        last_layer_prior_params=None,
        seed=0,
    ):
        self.input_shape = input_shape
        self.layer_units = layer_units
        self.layer_activations = layer_activations
        self.initial_unconstrained_scale = initial_unconstrained_scale
        self.transform_unconstrained_scale_factor = transform_unconstrained_scale_factor
        self.l2_weight_lambda = l2_weight_lambda
        self.l2_bias_lambda = l2_bias_lambda
        self.preprocess_x = preprocess_x
        self.preprocess_y = preprocess_y
        self.learning_rate = learning_rate
        self.last_layer_prior = last_layer_prior
        self.last_layer_prior_params = last_layer_prior_params
        self.seed = seed

        if self.preprocess_y:
            self.y_preprocessor = StandardizePreprocessor()

        # Name the last two layers so the feature extractor can be split off
        # after training.
        names = [None] * (len(self.layer_units) - 2) + ["feature_extractor", "output"]
        tf.random.set_seed(self.seed)
        # if self.initial_unconstrained_scale is None:
        #     self.network = MapNetwork(
        #         self.input_shape,
        #         self.layer_units,
        #         self.layer_activations,
        #         self.l2_weight_lambda,
        #         self.l2_bias_lambda,
        #         preprocess_x=self.preprocess_x,
        #         learning_rate=self.learning_rate,
        #         names=names,
        #         seed=self.seed,
        #     )
        # else:
        self.network = MapDensityNetwork(
            self.input_shape,
            self.layer_units,
            self.layer_activations,
            self.initial_unconstrained_scale,
            self.transform_unconstrained_scale_factor,
            self.l2_weight_lambda,
            self.l2_bias_lambda,
            preprocess_x=self.preprocess_x,
            learning_rate=self.learning_rate,
            names=names,
            seed=self.seed,
        )

    @property
    def total_epochs(self):
        return self.network.total_epochs

    def fit_preprocessing(self, y_train):
        if self.preprocess_y:
            self.y_preprocessor.fit(y_train)

    def fit(
        self,
        x_train,
        y_train,
        batch_size=1,
        epochs=1,
        early_stop_callback=None,
        validation_split=0.0,
        validation_data=None,
        verbose=1,
        pretrained_network=None,
    ):
        tf.random.set_seed(self.seed)
        self.fit_preprocessing(y_train)
        if self.preprocess_y:
            y_train = self.y_preprocessor.transform(y_train)
        if pretrained_network is None:
            self.network.fit(
                x_train,
                y_train,
                batch_size=batch_size,
                epochs=epochs,
                early_stop_callback=early_stop_callback,
                validation_split=validation_split,
                validation_data=validation_data,
                verbose=verbose,
            )
        else:
            self.network = pretrained_network
        self.feature_extractor = tf.keras.Model(
            self.network.network.inputs,
            self.network.network.get_layer("feature_extractor").output,
        )
        features_train = self.feature_extractor(x_train).numpy()
        # Append a constant feature so the linear regression has a bias term.
        features_train = np.hstack((features_train, np.ones((x_train.shape[0], 1))))

        # "Fit" the Bayesian linear regression on the extracted features.
        n_features = features_train.shape[1]
        if self.last_layer_prior_params is None:
            if self.last_layer_prior == "non-informative":
                self.last_layer_prior_params = {
                    "mu_0": np.zeros((n_features, 1)),
                    "V_0": 1e3 * np.eye(n_features),
                    "a_0": -n_features / 2,
                    "b_0": 0,
                }
            elif (
                self.last_layer_prior
                == "standard-normal-weights-non-informative-scale"
            ):
                ml_noise_sigma = self.network.noise_sigma
                self.last_layer_prior_params = {
                    "mu_0": np.zeros((n_features, 1)),
                    "V_0": (1 / ml_noise_sigma ** 2) * np.eye(n_features),
                    "a_0": -n_features / 2,
                    "b_0": 0,
                }
            elif self.last_layer_prior == "weakly-informative":
                a = 0.5
                b = 0.01
                self.last_layer_prior_params = {
                    "mu_0": np.zeros((n_features, 1)),
                    "V_0": (a / b) * np.eye(n_features),
                    "a_0": a,
                    "b_0": b,
                }
            else:
                raise ValueError(
                    "When not specifying last_layer_prior_params, last_layer_prior "
                    'must be one of "non-informative", '
                    '"standard-normal-weights-non-informative-scale" or '
                    f'"weakly-informative". You instead passed "{self.last_layer_prior}".'
                )
        self.blr_model = BayesianLinearRegression(**self.last_layer_prior_params)
        self.blr_model.fit(features_train, y_train)
        return self

    def predict(self, x):
        features_test = self.feature_extractor(x).numpy().astype("float32")
        features_test = np.hstack((features_test, np.ones((x.shape[0], 1))))
        df, loc, scale = self.blr_model.predict(features_test)
        df = np.float32(df)
        loc = loc.astype("float32")
        scale = scale.astype("float32")
        scale = np.expand_dims(scale, axis=scale.ndim)
        if self.preprocess_y:
            loc = self.y_preprocessor.inverse_transform(loc)
            if self.y_preprocessor.std is not None:
                scale *= self.y_preprocessor.std
        return tfd.StudentT(df=df, loc=loc, scale=scale)

    def __call__(self, x):
        return self.predict(x)

    def get_weights(self):
        """
        Return the weights of all layers, including the discarded last layer,
        together with the marginal t distribution of the last-layer weights.
        """
        df, loc, dispersion = self.blr_model.unconditional_w_t()
        last_layer_weight_distribution = tfd.StudentT(
            df=df, loc=loc, scale=tf.linalg.tensor_diag_part(dispersion) ** 0.5
        )
        return self.network.get_weights(), last_layer_weight_distribution
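# %%
# A minimal end-to-end usage sketch of the class above (a sketch, not a test:
# x_train, y_train and x_plot are assumed to be defined as in the experiment
# scripts, and the hyperparameters below are illustrative only).
llb_net = PostHocLastLayerBayesianNetwork(
    input_shape=[1],
    layer_units=[200, 100, 1],
    layer_activations=["relu", "relu", "linear"],
    initial_unconstrained_scale=0.0,
    last_layer_prior="non-informative",
    seed=0,
)
llb_net.fit(x_train=x_train, y_train=y_train, batch_size=20, epochs=100, verbose=0)

# The predictive distribution is a Student t, as expected from the
# normal-inverse-gamma posterior of the Bayesian linear regression on the
# learned features.
predictive_distribution = llb_net(x_plot)
mean, stddev = predictive_distribution.mean(), predictive_distribution.stddev()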