def __init__(self,
             data: Tuple[tf.Tensor, tf.Tensor],
             m: int = 20,
             alpha: float = 1. / np.sqrt(2.),
             eps_sq: float = 1,
             sigma_n_sq: float = 1,
             sigma_f_sq: float = 1):
    self.num_data = tf.cast(data[1].shape[0], default_float())
    self.data = (tf.cast(tf.squeeze(data[0]), default_float()),
                 tf.cast(data[1], default_float()))
    self.const = tf.cast(0.5 * data[1].size * np.log(2 * np.pi), default_float())
    D = data[0].shape[1]
    self.flag_1d = D == 1
    self.alpha = tf.cast(alpha, default_float())
    self.alpha_sq = tf.square(self.alpha)
    self.m = tf.cast(m, default_float())
    self.this_range = tf.constant(
        np.asarray(list(product(range(1, m + 1), repeat=D))).squeeze(),
        dtype=default_float())
    self.this_range_1 = self.this_range - 1.
    self.this_range_1_2 = (self.this_range_1 if self.flag_1d
                           else tf.range(m, dtype=default_float()))
    self.this_range_1_int = tf.cast(self.this_range_1, tf.int32)
    self.tf_range_dnn_out = tf.range(D)
    self.this_range_1_ln2 = np.log(2.) * self.this_range_1
    self.vander_range = tf.range(m + 1, dtype=default_float())
    self.eye_k = tf.eye(m**D, dtype=default_float())
    self.yTy = tf.reduce_sum(tf.math.square(self.data[1]))
    self.coeff_n_tf = tf.constant(
        np.load(os.path.dirname(os.path.realpath(__file__)) + '/hermite_coeff.npy')[:m, :m],
        dtype=default_float())

    eps_sq = eps_sq * np.ones(D) if D > 1 else eps_sq
    self.eps_sq = Parameter(eps_sq, transform=positive(), dtype=default_float())
    self.sigma_f_sq = Parameter(sigma_f_sq, transform=positive(), dtype=default_float())
    self.sigma_n_sq = Parameter(sigma_n_sq, transform=positive(), dtype=default_float())
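# The constructor above loads a precomputed table 'hermite_coeff.npy'. Below is
# a sketch of how such a table could be generated with NumPy; the exact layout
# of the original file is an assumption (row n = monomial coefficients of the
# physicists' Hermite polynomial H_n, zero-padded).
import numpy as np
from numpy.polynomial.hermite import herm2poly

m_max = 20
hermite_coeff = np.zeros((m_max, m_max))
for n in range(m_max):
    e_n = np.zeros(n + 1)
    e_n[n] = 1.0  # the Hermite-basis unit vector selecting H_n
    hermite_coeff[n, :n + 1] = herm2poly(e_n)  # H_n in the monomial basis
np.save('hermite_coeff.npy', hermite_coeff)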
def __init__(self, kernel, inducing_variables, q_mu_initial, q_sqrt_initial,
             mean_function, white=False, **kwargs):
    super().__init__(**kwargs)
    self.inducing_points = inducing_variables
    self.num_inducing = inducing_variables.shape[0]

    # Initialise q_mu to y^2_pi(i)
    q_mu = q_mu_initial[:, None]
    self.q_mu = Parameter(q_mu, dtype=default_float())

    # Initialise q_sqrt to near-deterministic. Store as lower-triangular matrix L.
    q_sqrt = 1e-4 * np.eye(self.num_inducing, dtype=default_float())
    # q_sqrt = np.diag(q_sqrt_initial)
    self.q_sqrt = Parameter(q_sqrt, transform=triangular())

    self.kernel = kernel
    self.mean_function = mean_function
    self.white = white
def __init__(self, kernel, inducing_variables, mean_function, white=False, **kwargs):
    super().__init__(**kwargs)
    self.inducing_points = inducing_variables
    self.num_inducing = inducing_variables.shape[0]

    # Initialise q_mu to zeros
    q_mu = np.zeros((self.num_inducing, 1))
    self.q_mu = Parameter(q_mu, dtype=default_float())

    # Initialise q_sqrt to near-deterministic. Store as lower-triangular matrix L.
    q_sqrt = 1e-4 * np.eye(self.num_inducing, dtype=default_float())
    self.q_sqrt = Parameter(q_sqrt, transform=triangular())

    self.kernel = kernel
    self.mean_function = mean_function
    self.white = white

    # In the non-whitened case, initialise q_sqrt to the prior Cholesky (Ku) + jitter.
    if not self.white:
        Ku = self.kernel(self.inducing_points)
        Ku += default_jitter() * tf.eye(self.num_inducing, dtype=Ku.dtype)
        Lu = tf.linalg.cholesky(Ku)
        self.q_sqrt = Parameter(Lu, transform=triangular())
def _create_network(self):
    self.Ws, self.bs = [], []
    for dim_in, dim_out in zip(self.dims[:-1], self.dims[1:]):
        # Xavier/Glorot initialisation: std = sqrt(2 / (fan_in + fan_out))
        init_xavier_std = (2.0 / (dim_in + dim_out))**0.5
        self.Ws.append(Parameter(np.random.randn(dim_in, dim_out) * init_xavier_std))
        self.bs.append(Parameter(np.zeros(dim_out)))
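# A minimal sketch of how the Xavier-initialised weights above might be
# consumed in a forward pass. The method name `_forward` and the tanh
# nonlinearity are assumptions for illustration, not part of the original class.
def _forward(self, X):
    for i, (W, b) in enumerate(zip(self.Ws, self.bs)):
        X = tf.linalg.matmul(X, W) + b
        if i < len(self.Ws) - 1:  # no nonlinearity on the output layer
            X = tf.math.tanh(X)
    return X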
def _init_variational_parameters(self, num_inducing, q_mu, q_sqrt, q_diag):
    """
    Constructs the mean and Cholesky of the covariance of the variational
    Gaussian posterior. If a user passes values for `q_mu` and `q_sqrt`, the
    routine checks that they have consistent and correct shapes. If a user
    does not specify any values for `q_mu` and `q_sqrt`, the routine
    initializes them; their shape depends on `num_inducing` and `q_diag`.

    Note: most often the comments refer to the number of observations
    (= output dimensions) with P, the number of latent GPs with L, and the
    number of inducing points with M. Typically P equals L, but this can
    change when certain multi-output kernels are used.

    :param num_inducing: int
        Number of inducing variables, typically referred to as M.
    :param q_mu: np.array or None
        Mean of the variational Gaussian posterior. If None, the function
        will initialise the mean with zeros. If not None, the shape of
        `q_mu` is checked.
    :param q_sqrt: np.array or None
        Cholesky of the covariance of the variational Gaussian posterior.
        If None, the function will initialise `q_sqrt` with the identity
        matrix. If not None, the shape of `q_sqrt` is checked, depending on
        `q_diag`.
    :param q_diag: bool
        Used to check if `q_mu` and `q_sqrt` have the correct shape, or to
        construct them with the correct shape. If `q_diag` is True, `q_sqrt`
        is two-dimensional and only holds the square root of the diagonal
        elements of the covariance. If False, `q_sqrt` is three-dimensional.
    """
    q_mu = np.zeros((num_inducing, self.num_latent_gps)) if q_mu is None else q_mu
    self.q_mu = Parameter(q_mu, dtype=default_float())  # [M, P]

    if q_sqrt is None:
        if self.q_diag:
            ones = np.ones((num_inducing, self.num_latent_gps), dtype=default_float())
            self.q_sqrt = Parameter(ones, transform=positive())  # [M, P]
        else:
            q_sqrt = [np.eye(num_inducing, dtype=default_float())
                      for _ in range(self.num_latent_gps)]
            q_sqrt = np.array(q_sqrt)
            self.q_sqrt = Parameter(q_sqrt, transform=triangular())  # [P, M, M]
    else:
        if q_diag:
            assert q_sqrt.ndim == 2
            self.num_latent_gps = q_sqrt.shape[1]
            self.q_sqrt = Parameter(q_sqrt, transform=positive())  # [M, L|P]
        else:
            assert q_sqrt.ndim == 3
            self.num_latent_gps = q_sqrt.shape[0]
            num_inducing = q_sqrt.shape[1]
            self.q_sqrt = Parameter(q_sqrt, transform=triangular())  # [L|P, M, M]
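# Hedged shape illustration for the default branch above: with M inducing
# points and P latent GPs, and q_mu = q_sqrt = None, the routine builds arrays
# with exactly these shapes.
import numpy as np

M, P = 50, 3
q_mu = np.zeros((M, P))                           # [M, P]
q_sqrt = np.array([np.eye(M) for _ in range(P)])  # [P, M, M], identity Choleskys
assert q_mu.shape == (M, P) and q_sqrt.shape == (P, M, M)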
def __init__(self, kern, Z, num_outputs, mean_function, white=False,
             input_prop_dim=None, **kwargs):
    r"""
    A sparse variational GP layer in whitened representation. This layer holds
    the kernel, variational parameters, inducing points and mean function.

    The underlying model at inputs X is
        f = L v + mean_function(X), where v ~ N(0, I) and L L^T = kern.K(X)

    The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)

    The layer holds D_out independent GPs with the same kernel and inducing points.

    :param kern: the kernel for the layer (input_dim = D_in)
    :param Z: inducing points (M, D_in)
    :param num_outputs: the number of GP outputs (q_mu has shape (M, num_outputs))
    :param mean_function: the mean function
    """
    super().__init__(input_prop_dim=input_prop_dim, **kwargs)

    self.num_inducing = Z.shape[0]

    # Variational mean over the inducing points
    q_mu = np.zeros((self.num_inducing, num_outputs))
    self.q_mu = Parameter(q_mu, name="q_mu")

    # Square root of the variational covariance over the inducing points
    q_sqrt = np.tile(np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
    self.q_sqrt = Parameter(q_sqrt, transform=triangular(), name="q_sqrt")

    self.feature = InducingPoints(Z)
    self.kern = kern
    self.mean_function = mean_function
    self.num_outputs = num_outputs
    self.white = white

    if not self.white:  # initialise to the prior
        Ku = self.kern.K(Z)
        Lu = np.linalg.cholesky(Ku + np.eye(Z.shape[0]) * gpflow.default_jitter())
        self.q_sqrt = Parameter(np.tile(Lu[None, :, :], [num_outputs, 1, 1]),
                                transform=triangular(), name="q_sqrt")

    self.Ku, self.Lu, self.Ku_tiled, self.Lu_tiled = None, None, None, None
    self.needs_build_cholesky = True
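# A NumPy sketch of the whitened parameterisation described in the docstring:
# sample v ~ q(v) = N(q_mu, q_sqrt q_sqrt^T), then map through the prior
# Cholesky, f = L v. All names below are local to this example.
import numpy as np

M = 5
K = np.eye(M) + 0.1                          # toy prior covariance kern.K(Z)
L = np.linalg.cholesky(K)
q_mu, q_sqrt = np.zeros((M, 1)), 1e-2 * np.eye(M)
v = q_mu + q_sqrt @ np.random.randn(M, 1)    # v ~ q(v), near-deterministic here
f = L @ v                                    # f = L v (+ mean_function(X))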
def __init__(self, variance=1.0, lengthscale=1.0, f_list=None):
    """
    :param variance: the (initial) value for the variance parameter
    :param lengthscale: the (initial) value for the lengthscale parameter(s).
        To induce ARD behaviour, this must be initialised as an array with the
        same length as the number of active dimensions, e.g. [1., 1., 1.]
    :param f_list: list with information about the functional inputs
    """
    self.variance = Parameter(variance, transform=positive())
    self.lengthscale = Parameter(lengthscale, transform=positive())
    self.f_list = f_list  # list with functional information
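# Illustration of the ARD note in the docstring above: an array-valued
# lengthscale yields one positive-constrained value per active dimension.
# (The owning kernel class is not named in this snippet, so the demo exercises
# the Parameter machinery directly.)
import numpy as np
from gpflow import Parameter
from gpflow.utilities import positive

lengthscale = Parameter(np.ones(3), transform=positive())  # ARD over 3 dims
print(lengthscale.shape)  # (3,)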
def __init__(self, kernel, inducing_variables, num_outputs, mean_function,
             input_prop_dim=None, white=False, **kwargs):
    super().__init__(input_prop_dim, **kwargs)

    self.num_inducing = inducing_variables.shape[0]
    self.mean_function = mean_function
    self.num_outputs = num_outputs
    self.white = white

    # One independent copy of the kernel per output
    self.kernels = []
    for i in range(self.num_outputs):
        self.kernels.append(copy.deepcopy(kernel))

    # Initialise q_mu to all zeros
    q_mu = np.zeros((self.num_inducing, num_outputs))
    self.q_mu = Parameter(q_mu, dtype=default_float())

    # Initialise q_sqrt to identity matrices. Store as lower-triangular matrix L.
    q_sqrt = [np.eye(self.num_inducing, dtype=default_float())
              for _ in range(num_outputs)]
    q_sqrt = np.array(q_sqrt)
    self.q_sqrt = Parameter(q_sqrt, transform=triangular())

    # In the non-whitened case, initialise q_sqrt to the prior Cholesky (Ku) + jitter.
    if not self.white:
        Kus = [self.kernels[i].K(inducing_variables)
               for i in range(self.num_outputs)]
        Lus = [np.linalg.cholesky(Kus[i] + np.eye(self.num_inducing) * default_jitter())
               for i in range(self.num_outputs)]
        q_sqrt = np.array(Lus)
        self.q_sqrt = Parameter(q_sqrt, transform=triangular())

    self.inducing_points = []
    for i in range(self.num_outputs):
        self.inducing_points.append(inducingpoint_wrapper(inducing_variables))
def __init__(self,
             data: Tuple[tf.Tensor, tf.Tensor],
             m: int = 20,
             d: int = 1,
             alpha: float = 1. / np.sqrt(2.),
             eps_sq: float = 1,
             sigma_n_sq: float = 1,
             sigma_f_sq: float = 1,
             dir_weights: str = None):
    if data[1].dtype == np.float64:
        K_bd.set_floatx('float64')
    else:
        set_default_float(np.float32)

    self.num_data = tf.cast(data[1].shape[0], default_float())
    self.data = (tf.cast(data[0], default_float()), tf.cast(data[1], default_float()))
    self.const = tf.cast(0.5 * data[1].size * np.log(2 * np.pi), default_float())
    self.flag_1d = d == 1
    self.alpha = tf.cast(alpha, default_float())
    self.alpha_sq = tf.square(self.alpha)
    self.m = tf.cast(m, default_float())
    self.this_range = tf.constant(
        np.asarray(list(product(range(1, m + 1), repeat=d))).squeeze(),
        dtype=default_float())
    self.this_range_1 = self.this_range - 1.
    self.this_range_1_2 = (self.this_range_1 if self.flag_1d
                           else tf.range(m, dtype=default_float()))
    self.this_range_1_int = tf.cast(self.this_range_1, tf.int32)
    self.tf_range_dnn_out = tf.range(d)
    self.this_range_1_ln2 = np.log(2.) * self.this_range_1
    self.vander_range = tf.range(m + 1, dtype=default_float())
    self.eye_k = tf.eye(m**d, dtype=default_float())
    self.yTy = tf.reduce_sum(tf.math.square(self.data[1]))
    self.coeff_n_tf = tf.constant(
        np.load(os.path.dirname(os.path.realpath(__file__)) + '/hermite_coeff.npy')[:m, :m],
        dtype=default_float())

    eps_sq = eps_sq * np.ones(d) if d > 1 else eps_sq
    self.eps_sq = Parameter(eps_sq, transform=positive(), dtype=default_float())
    self.sigma_f_sq = Parameter(sigma_f_sq, transform=positive(), dtype=default_float())
    self.sigma_n_sq = Parameter(sigma_n_sq, transform=positive(), dtype=default_float())

    # Deep kernel: a dense tanh network mapping the inputs to d features.
    model = models.Sequential()
    model.add(layers.Dense(512, activation='tanh', input_dim=data[0].shape[1]))
    model.add(layers.Dense(256, activation='tanh'))
    model.add(layers.Dense(64, activation='tanh'))
    model.add(layers.Dense(d))
    if dir_weights is not None:
        model.load_weights(dir_weights)
    self.neural_net = model
def __init__(
    self,
    data: OutputData,
    Xp_mean: tf.Tensor,
    Xp_var: tf.Tensor,
    pi: tf.Tensor,
    kernel_K: List[Kernel],
    Zp: tf.Tensor,
    Xs_mean=None,
    Xs_var=None,
    kernel_s=None,
    Zs=None,
    Xs_prior_mean=None,
    Xs_prior_var=None,
    Xp_prior_mean=None,
    Xp_prior_var=None,
    pi_prior=None,
):
    super().__init__(
        data=data,
        split_space=True,
        Xp_mean=Xp_mean,
        Xp_var=Xp_var,
        pi=pi,
        kernel_K=kernel_K,
        Zp=Zp,
        Xs_mean=Xs_mean,
        Xs_var=Xs_var,
        kernel_s=kernel_s,
        Zs=Zs,
        Xs_prior_mean=Xs_prior_mean,
        Xs_prior_var=Xs_prior_var,
        Xp_prior_mean=Xp_prior_mean,
        Xp_prior_var=Xp_prior_var,
        pi_prior=pi_prior,
    )

    # q(Us | Ms, Ss)
    q_mu = np.zeros((self.M, self.D))
    self.q_mu_s = Parameter(q_mu, dtype=default_float())  # [M, D]

    q_sqrt = [np.eye(self.M, dtype=default_float()) for _ in range(self.D)]
    q_sqrt = np.array(q_sqrt)
    self.q_sqrt_s = Parameter(q_sqrt, transform=triangular())  # [D, M, M]
def __init__(self, variance=1.0, lengthscale=1.0, alpha=1.0, active_dims=None):
    super().__init__(variance=variance, lengthscale=lengthscale, active_dims=active_dims)
    self.alpha = Parameter(alpha, transform=positive())
def __init__(self,
             data: Tuple[tf.Tensor, tf.Tensor],
             m: int = 100,
             lengthscales=None,
             sigma_n_sq: float = 1,
             sigma_f_sq: float = 1,
             randn=None):
    self.num_data = tf.cast(data[1].size, default_float())
    self.data = (tf.cast(data[0], default_float()), tf.cast(data[1], default_float()))
    self.const = tf.cast(0.5 * data[1].size * np.log(2 * np.pi), default_float())
    self.eye_2m = tf.eye(2 * m, dtype=default_float())
    self.yTy = tf.reduce_sum(tf.math.square(self.data[1]))
    self.m_float = tf.cast(m, default_float())
    self.randn = (tf.random.normal(shape=[m, data[0].shape[1]], dtype=default_float())
                  if randn is None else tf.cast(randn[:, None], default_float()))

    lengthscales0 = np.ones(data[0].shape[1]) if lengthscales is None else lengthscales
    self.lengthscales = Parameter(lengthscales0, transform=positive(), dtype=default_float())
    self.sigma_f_sq = Parameter(sigma_f_sq, transform=positive(), dtype=default_float())
    self.sigma_n_sq = Parameter(sigma_n_sq, transform=positive(), dtype=default_float())
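# Hedged instantiation sketch for the random-features model above; `SSGP` is a
# placeholder for the actual class name, and the data here is synthetic. Judging
# by eye_2m, the m sampled frequencies yield 2*m trigonometric basis functions.
import numpy as np

X = np.random.rand(200, 3)
Y = np.sin(3 * X[:, :1]) + 0.1 * np.random.randn(200, 1)
model = SSGP((X, Y), m=100, sigma_n_sq=0.01)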
def __init__(self,
             data: Tuple[tf.Tensor, tf.Tensor],
             m: int = 100,
             d: int = 4,
             lengthscales=None,
             sigma_n_sq: float = 1,
             sigma_f_sq: float = 1,
             dir_weights: str = None):
    if data[1].dtype == np.float64:
        K_bd.set_floatx('float64')
    else:
        set_default_float(np.float32)

    self.num_data = tf.cast(data[1].shape[0], default_float())
    self.data = (tf.cast(data[0], default_float()), tf.cast(data[1], default_float()))
    self.const = tf.cast(0.5 * data[1].size * np.log(2 * np.pi), default_float())
    self.eye_2m = tf.eye(2 * m, dtype=default_float())
    self.yTy = tf.reduce_sum(tf.math.square(self.data[1]))
    self.m_float = tf.cast(m, default_float())
    self.randn = tf.random.normal(shape=[m, d], dtype=default_float())

    lengthscales0 = np.ones(d) if lengthscales is None else lengthscales
    self.lengthscales = Parameter(lengthscales0, transform=positive(), dtype=default_float())
    self.sigma_f_sq = Parameter(sigma_f_sq, transform=positive(), dtype=default_float())
    self.sigma_n_sq = Parameter(sigma_n_sq, transform=positive(), dtype=default_float())

    # Deep kernel: a dense tanh network mapping the inputs to d features.
    model = models.Sequential()
    model.add(layers.Dense(512, activation='tanh', input_dim=data[0].shape[1]))
    model.add(layers.Dense(256, activation='tanh'))
    model.add(layers.Dense(64, activation='tanh'))
    model.add(layers.Dense(d))
    if dir_weights is not None:
        model.load_weights(dir_weights)
    self.neural_net = model
def __init__(self, images: TensorData, name: Optional[str] = None):
    """
    :param images: initial values of the inducing locations, in image form.

    The shape of the inducing variables varies by representation:
    - as Z: [M, height * width * channels_in]
    - as images: [M, height, width, channels_in]
    - as patches: [M, height * width * channels_in]
    - as filters: [height, width, channels_in, M]

    TODO:
    - Generalize to allow for inducing images with multiple patches?
    - Work on the naming convention? The term 'image' is a bit too general.
      'Patch' works, but that term usually refers to a vectorized form and
      (for now) overlaps with GPflow's own inducing class. Alternatives
      include: filter, window, glimpse.
    """
    super().__init__(name=name)
    self._images = Parameter(images, dtype=default_float())
def __init__(self,
             kernel: kernels.Kernel,
             image_shape: List,
             patch_shape: List,
             channels_in: int = 1,
             channels_out: int = 1,
             weights: TensorType = "default",
             strides: List = None,
             padding: str = "VALID",
             dilations: List = None,
             data_format: str = "NHWC"):
    strides = list((1, 1) if strides is None else strides)
    dilations = list((1, 1) if dilations is None else dilations)

    # Sanity checks
    assert len(strides) == 2
    assert len(dilations) == 2
    assert padding in ("VALID", "SAME")
    assert data_format in ("NHWC", "NCHW")

    if isinstance(weights, str) and weights == "default":
        # TODO: improve me
        spatial_out = self.get_spatial_out(spatial_in=image_shape,
                                           filter_shape=patch_shape,
                                           strides=strides,
                                           padding=padding,
                                           dilations=dilations)
        weights = tf.ones([tf.reduce_prod(spatial_out)], dtype=default_float())

    super().__init__()
    self.kernel = kernel
    self.image_shape = image_shape
    self.patch_shape = patch_shape
    self.channels_in = channels_in
    self.channels_out = channels_out
    self.strides = strides
    self.padding = padding
    self.dilations = dilations
    self.data_format = data_format
    self._weights = None if (weights is None) else Parameter(weights)
class RobustObjectiveMixin:
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.jitter_variance = Parameter(
            max(default_jitter(), 1e-20), transform=positive(0.0),
            trainable=False, name="jitter")

    def _compute_robust_maximum_log_likelihood_objective(self) -> tf.Tensor:
        raise NotImplementedError

    def robust_maximum_log_likelihood_objective(self, restore_jitter=True) -> tf.Tensor:
        initial_jitter = self.jitter_variance.numpy()
        N_orders = 20
        for i in range(N_orders):
            self.jitter_variance.assign(10**i * initial_jitter)
            logjitter = np.log10(self.jitter_variance.numpy())
            if i > 0:
                if i == 1:
                    print(f"{type(self).__name__}: Failed first computation. "
                          f"Now attempting computation with jitter ", end="")
                print(f"10**{logjitter:.2f} ", end="", flush=True)
            try:
                val = self._compute_robust_maximum_log_likelihood_objective()
                break
            except tf.errors.InvalidArgumentError as e_inner:
                e_msg = e_inner.message
                if (("Cholesky" not in e_msg) and ("not invertible" not in e_msg)) \
                        or i == (N_orders - 1):
                    print(e_msg)
                    raise e_inner
            except AssertionError as e_inner:
                e_msg = e_inner.args
                if i == (N_orders - 1):
                    print(e_msg)
                    raise e_inner

        if restore_jitter:
            self.jitter_variance.assign(initial_jitter)
        if i > 0:
            print("")
        return val
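# A minimal sketch of how the mixin above is intended to be used: mix it into a
# model class and implement the single abstract hook. `ToyModel` and its
# objective are invented here for illustration.
class ToyModel:
    def maximum_log_likelihood_objective(self) -> tf.Tensor:
        return tf.constant(0.0, dtype=default_float())

class RobustToyModel(RobustObjectiveMixin, ToyModel):
    def _compute_robust_maximum_log_likelihood_objective(self) -> tf.Tensor:
        # A Cholesky-related InvalidArgumentError raised in here triggers the
        # jitter-escalation loop in robust_maximum_log_likelihood_objective().
        return self.maximum_log_likelihood_objective()

loss = RobustToyModel().robust_maximum_log_likelihood_objective()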
def __init__(
    self,
    data: OutputData,
    kernel: Optional[Kernel] = None,
    latent_dimensions: Optional[int] = 2,
    num_inducing_variables: Optional[int] = None,
    inducing_variable=None,
    *,
    mean_function=None,
    q_diag: bool = False,
    q_mu=None,
    q_sqrt=None,
    whiten: bool = False,
):
    """
    - kernel, likelihood, inducing_variables and mean_function are appropriate
      GPflow objects
    - latent_dimensions is the number of latent processes to use; it defaults
      to 2, since the dimensionality reduction targets two dimensions
    - q_diag is a boolean. If True, the covariance is approximated by a
      diagonal matrix.
    - whiten is a boolean. If True, we use the whitened representation of the
      inducing points.
    - num_data is the total number of observations; it defaults to X.shape[0]
      (relevant when feeding in external minibatches)
    """
    self.latent_dimensions = latent_dimensions

    # grab data
    self.data = data_input_to_tensor(data)

    # latent-space initialisation
    X_data_mean = pca_reduce(data, self.latent_dimensions)
    num_data, num_latent_gps = data.shape
    self.num_data = num_data
    X_data_var = tf.ones((self.num_data, self.latent_dimensions), dtype=default_float())
    assert X_data_var.ndim == 2

    # default kernel
    if kernel is None:
        kernel = gpflow.kernels.SquaredExponential()

    # latent-space Parameters
    self.X_data_mean = Parameter(X_data_mean)
    self.X_data_var = Parameter(X_data_var, transform=positive())

    # inducing-point parameter
    if (inducing_variable is None) == (num_inducing_variables is None):
        raise ValueError(
            "BayesianGPLVM needs exactly one of `inducing_variable` and `num_inducing_variables`"
        )
    if inducing_variable is None:
        # By default we initialise with a subset of the initial latent points.
        # Note that tf.random.shuffle returns a copy; it does not shuffle in-place.
        # (k-means clustering would be an alternative.)
        Z = tf.random.shuffle(X_data_mean)[:num_inducing_variables]
        inducing_variable = InducingPoints(Z)
    self.inducing_variable = inducingpoint_wrapper(inducing_variable)

    # loss placeholder for analysis purposes
    self.loss_placeholder = defaultdict(list, {k: [] for k in ("KL_x", "ELBO", "KL_u")})

    # parameters for the prior mean and variance of X
    X_prior_mean = tf.zeros((self.num_data, self.latent_dimensions), dtype=default_float())
    X_prior_var = tf.ones((self.num_data, self.latent_dimensions), dtype=default_float())
    self.X_prior_mean = tf.convert_to_tensor(np.atleast_1d(X_prior_mean), dtype=default_float())
    self.X_prior_var = tf.convert_to_tensor(np.atleast_1d(X_prior_var), dtype=default_float())

    # sanity checks
    assert np.all(X_data_mean.shape == X_data_var.shape)
    assert X_data_mean.shape[0] == self.data.shape[0], "X mean and Y must be same size."
    assert X_data_var.shape[0] == self.data.shape[0], "X var and Y must be same size."
    assert X_data_mean.shape[1] == self.latent_dimensions
    assert self.X_prior_mean.shape[0] == self.num_data
    assert self.X_prior_mean.shape[1] == self.latent_dimensions
    assert self.X_prior_var.shape[0] == self.num_data
    assert self.X_prior_var.shape[1] == self.latent_dimensions

    # init the super class
    super().__init__(kernel, likelihoods.Gaussian(variance=0.1), mean_function, num_latent_gps)
    self.q_diag = q_diag
    self.whiten = whiten

    # init variational parameters
    num_inducing = self.inducing_variable.num_inducing
    self._init_variational_parameters(num_inducing, q_mu, q_sqrt, q_diag)
def main(args):
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')
    outname3 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    # =========================================================================
    # CROSS-VALIDATION LOOP
    # =========================================================================
    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)

    for i in range(args.splits):
        # =====================================================================
        # MODEL CONSTRUCTION
        # =====================================================================
        print('Split: {}'.format(i))
        print('Getting dataset...')

        # get dataset
        data = datasets.all_datasets[args.dataset].get_data(
            i, normalize=args.normalize_data)
        X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]

        # inducing points via k-means
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat() \
            .prefetch(X.shape[0] // 2) \
            .shuffle(buffer_size=(X.shape[0] // 2)) \
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        dims = []
        hidden_dim = X.shape[1] if X.shape[1] < args.max_dim else args.max_dim
        for l in range(args.num_layers):
            dim = X.shape[1] if l == 0 else hidden_dim
            dims.append(dim)
            if args.ard:
                # SE kernel with a lengthscale per dimension
                kernels.append(SquaredExponential(lengthscale=[1.] * dim)
                               + White(variance=1e-5))
            else:
                # SE kernel with a single lengthscale
                kernels.append(SquaredExponential(lengthscale=1.)
                               + White(variance=1e-5))

        # output dim
        dims.append(Y.shape[1])

        dgp_model = DGP(X, Y, Z, dims, kernels, Gaussian(variance=0.05),
                        num_samples=args.num_samples, num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        # =====================================================================
        # TRAINING
        # =====================================================================
        optimiser = tf.optimizers.Adam(args.learning_rate)

        print('Training DGP model...')
        t0 = time.time()
        # training loop
        monitored_training_loop(dgp_model, train_dataset, optimiser=optimiser,
                                logdir=args.log_dir, iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        # =====================================================================
        # TESTING
        # =====================================================================
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # minibatch test predictions
        means, vars = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = dgp_model.predict_y(
                    Xs[mb * test_batch_size:(mb + 1) * test_batch_size, :],
                    num_samples=args.test_samples)
                means.append(m)
                vars.append(v)
        else:
            m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
            means.append(m)
            vars.append(v)

        mean_SND = np.concatenate(means, 1)  # [S, N, D]
        var_SND = np.concatenate(vars, 1)    # [S, N, D]
        mean_ND = np.mean(mean_SND, 0)       # [N, D]

        # rmse
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        # nll
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, mean_SND * Y_std, var_SND**0.5 * Y_std),
                      0, b=1 / float(args.test_samples)))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()
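# The test NLL above averages the predictive density over S Monte Carlo samples
# per point: log p(y) ~= logsumexp_s[ log N(y; m_s, v_s) ] - log S, which is
# exactly what logsumexp(..., b=1/S) computes. A self-contained sanity check:
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

S = 4
y, m_s, v_s = 0.3, np.zeros(S), np.ones(S)
nll = logsumexp(norm.logpdf(y, m_s, v_s**0.5), 0, b=1.0 / S)
assert np.isclose(nll, norm.logpdf(y, 0.0, 1.0))  # identical components collapse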
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.jitter_variance = Parameter(
        max(default_jitter(), 1e-20), transform=positive(0.0),
        trainable=False, name="jitter")
def __init__(self, c=None):
    super().__init__()
    c = np.zeros(1) if c is None else c
    self.c = Parameter(c)
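# Hedged sketch of how a constant mean function like the one above is typically
# evaluated (this mirrors GPflow's Constant mean function; the __call__ below
# is illustrative and not necessarily part of the original class):
def __call__(self, X):
    # Broadcast c over the batch: output is [N, len(c)]
    return tf.tile(tf.reshape(self.c, (1, -1)), [tf.shape(X)[0], 1])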
def __init__(
    self,
    data: OutputData,
    split_space: bool,
    Xp_mean: tf.Tensor,
    Xp_var: tf.Tensor,
    pi: tf.Tensor,
    kernel_K: List[Kernel],
    Zp: tf.Tensor,
    Xs_mean=None,
    Xs_var=None,
    kernel_s=None,
    Zs=None,
    Xs_prior_mean=None,
    Xs_prior_var=None,
    Xp_prior_mean=None,
    Xp_prior_var=None,
    pi_prior=None,
):
    """
    Initialise the Bayesian GPLVM object. This method only works with a Gaussian likelihood.

    :param data: data matrix, size N (number of points) x D (dimensions)
    :param split_space: if True, use both a shared and a private space; if False,
        use only private spaces (note: to recover the GPLVM, set split_space=False
        and let K=1)
    :param Xp_mean: mean latent positions in the private space [N, Qp]
        (Qp is the dimension of the private space)
    :param Xp_var: variance of the latent positions in the private space [N, Qp]
    :param pi: mixture responsibility of each category for each point [N, K]
        (K is the number of categories), i.e. q(c)
    :param kernel_K: private-space kernels, one for each category
    :param Zp: inducing inputs of the private space [M, Qp]
        (M is the number of inducing points)
    :param Xs_mean: mean latent positions in the shared space [N, Qs] (Qs is the
        dimension of the shared space), i.e. mus in q(Xs) ~ N(Xs | mus, Ss)
    :param Xs_var: variance of the latent positions in the shared space [N, Qs],
        i.e. Ss, assumed diagonal
    :param kernel_s: shared-space kernel
    :param Zs: inducing inputs of the shared space [M, Qs]
    :param Xs_prior_mean: prior mean used in the KL term of the bound, [N, Qs].
        By default 0. Mean in p(Xs).
    :param Xs_prior_var: prior variance used in the KL term of the bound, [N, Qs].
        By default 1. Variance in p(Xs).
    :param Xp_prior_mean: prior mean used in the KL term of the bound, [N, Qp].
        By default 0. Mean in p(Xp).
    :param Xp_prior_var: prior variance used in the KL term of the bound, [N, Qp].
        By default 1. Variance in p(Xp).
    :param pi_prior: prior mixture weights used in the KL term of the bound, [N, K].
        By default uniform. p(c).
    """
    # If no shared space is wanted, set the shared space to None --> a mixture of GPLVMs.
    # If no private space is wanted, set the shared space to None, set K = 1 and include
    # only one kernel in `kernel_K` --> recover the original GPLVM.
    # TODO: think about how to do this with minibatching.
    # It's awkward: with minibatches the model usually doesn't store the data internally,
    # but for the GPLVM you need to keep q(xn) for all n, so you need to know which ones
    # to update for each minibatch. Probably solvable, but not pretty. Using an inference
    # network / back constraints would solve this, since we would keep a global set of
    # parameters rather than one set per q(xn).
    self.N, self.D = data.shape
    self.Qp = Xp_mean.shape[1]
    self.K = pi.shape[1]
    self.split_space = split_space

    assert Xp_var.ndim == 2
    assert len(kernel_K) == self.K
    assert np.all(Xp_mean.shape == Xp_var.shape)
    assert Xp_mean.shape[0] == self.N, "Xp_mean and Y must be of same size"
    assert pi.shape[0] == self.N, "pi and Y must be of the same size"

    super().__init__()
    self.likelihood = likelihoods.Gaussian()
    self.kernel_K = kernel_K
    self.data = data_input_to_tensor(data)

    # The covariance of q(X) as an [N, Q] matrix; the assumption is that the Sn's
    # are diagonal, i.e. the latent dimensions are uncorrelated. Otherwise this
    # would require an [N, Q, Q] matrix.
    self.Xp_mean = Parameter(Xp_mean)
    self.Xp_var = Parameter(Xp_var, transform=positive())
    self.pi = Parameter(pi, transform=tfp.bijectors.SoftmaxCentered())
    self.Zp = inducingpoint_wrapper(Zp)
    self.M = len(self.Zp)

    # Initialise the variational parameters for q(U), in the same way as in SVGP:
    # q_mu: List[K]; mean of the inducing variables U [M, D], i.e. m in
    #   q(U) ~ N(U | m, S); initialised to zeros.
    # q_sqrt: List[K]; Cholesky of the covariance matrix of the inducing variables
    #   [D, M, M]; initialised to identities. q_diag is False because natural
    #   gradients only work with a full covariance.
    # We need K sets of q(Uk), each approximating fs + fk.
    self.q_mu = []
    self.q_sqrt = []
    for k in range(self.K):
        q_mu = np.zeros((self.M, self.D))
        q_mu = Parameter(q_mu, dtype=default_float())  # [M, D]
        self.q_mu.append(q_mu)

        q_sqrt = [np.eye(self.M, dtype=default_float()) for _ in range(self.D)]
        q_sqrt = np.array(q_sqrt)
        q_sqrt = Parameter(q_sqrt, transform=triangular())  # [D, M, M]
        self.q_sqrt.append(q_sqrt)

    # deal with parameters for the prior
    if Xp_prior_mean is None:
        Xp_prior_mean = tf.zeros((self.N, self.Qp), dtype=default_float())
    if Xp_prior_var is None:
        Xp_prior_var = tf.ones((self.N, self.Qp), dtype=default_float())
    if pi_prior is None:
        pi_prior = tf.ones((self.N, self.K), dtype=default_float()) / self.K
    self.Xp_prior_mean = tf.convert_to_tensor(np.atleast_1d(Xp_prior_mean),
                                              dtype=default_float())
    self.Xp_prior_var = tf.convert_to_tensor(np.atleast_1d(Xp_prior_var),
                                             dtype=default_float())
    self.pi_prior = tf.convert_to_tensor(np.atleast_1d(pi_prior),
                                         dtype=default_float())

    # With both a shared and a private space, also initialise the shared-space
    # parameters.
    if split_space:
        assert (Xs_mean is not None and Xs_var is not None
                and kernel_s is not None and Zs is not None), \
            "Xs_mean, Xs_var, kernel_s and Zs must be initialised if `split_space=True`"
        assert Xs_var.ndim == 2
        assert np.all(Xs_mean.shape == Xs_var.shape)
        assert Xs_mean.shape[0] == self.N, "Xs_mean and Y must be of same size"

        self.Qs = Xs_mean.shape[1]
        self.kernel_s = kernel_s
        self.Xs_mean = Parameter(Xs_mean)
        self.Xs_var = Parameter(Xs_var, transform=positive())
        self.Zs = inducingpoint_wrapper(Zs)
        if len(Zs) != len(Zp):
            raise ValueError('`Zs` and `Zp` should have the same length')

        if Xs_prior_mean is None:
            Xs_prior_mean = tf.zeros((self.N, self.Qs), dtype=default_float())
        if Xs_prior_var is None:
            Xs_prior_var = tf.ones((self.N, self.Qs), dtype=default_float())
        self.Xs_prior_mean = tf.convert_to_tensor(np.atleast_1d(Xs_prior_mean),
                                                  dtype=default_float())
        self.Xs_prior_var = tf.convert_to_tensor(np.atleast_1d(Xs_prior_var),
                                                 dtype=default_float())

    self.Fq = tf.zeros((self.N, self.K), dtype=default_float())
def main(args):
    datasets = Datasets(data_path=args.data_path)

    # Prepare output files
    outname1 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.nll'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.time'
    outfile2 = open(outname2, 'w')

    running_loss = 0
    running_time = 0
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat() \
            .prefetch(X.shape[0] // 2) \
            .shuffle(buffer_size=(X.shape[0] // 2)) \
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        for l in range(args.num_layers):
            kernels.append(SquaredExponential() + White(variance=1e-5))

        dgp_model = DGP(X.shape[1], kernels, Gaussian(variance=0.05), Z,
                        num_outputs=Y.shape[1], num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        optimiser = tf.optimizers.Adam(args.learning_rate)

        def optimisation_step(model, X, Y):
            with tf.GradientTape() as tape:
                tape.watch(model.trainable_variables)
                obj = -model.elbo(X, Y, full_cov=False)
            grad = tape.gradient(obj, model.trainable_variables)
            optimiser.apply_gradients(zip(grad, model.trainable_variables))

        def monitored_training_loop(model, train_dataset, logdir, iterations,
                                    logging_iter_freq):
            # TODO: use tensorboard to log trainables and performance
            tf_optimisation_step = tf.function(optimisation_step)
            batches = iter(train_dataset)
            for i in range(iterations):
                X, Y = next(batches)
                tf_optimisation_step(model, X, Y)
                iter_id = i + 1
                if iter_id % logging_iter_freq == 0:
                    tf.print(f'Epoch {iter_id}: ELBO (batch) {model.elbo(X, Y)}')

        print('Training DGP model...')
        t0 = time.time()
        monitored_training_loop(dgp_model, train_dataset, logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()
        print('Time taken to train: {}'.format(t1 - t0))
        outfile2.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_time += t1 - t0

        m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, m * Y_std, v**0.5 * Y_std),
                      0, b=1 / float(args.test_samples)))
        print('Average test log likelihood: {}'.format(test_nll))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_loss += test_nll  # accumulate the NLL, not the wall-clock time

    outfile1.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Average: {}\n'.format(running_time / args.splits))
    outfile1.close()
    outfile2.close()
def __init__(
    self,
    data: OutputData,
    X_data_mean: Optional[tf.Tensor] = None,
    X_data_var: Optional[tf.Tensor] = None,
    kernel: Optional[Kernel] = None,
    num_inducing_variables: Optional[int] = None,
    inducing_variable=None,
    X_prior_mean=None,
    X_prior_var=None,
):
    """
    Initialise the Bayesian GPLVM object. This method only works with a Gaussian likelihood.

    :param data: data matrix, size N (number of points) x D (dimensions)
    :param X_data_mean: initial latent positions, size N (number of points) x Q
        (latent dimensions)
    :param X_data_var: variance of the latent positions ([N, Q]), for the
        initialisation of the latent space
    :param kernel: kernel specification, by default Squared Exponential
    :param num_inducing_variables: number of inducing points, M
    :param inducing_variable: matrix of inducing points, size M (inducing points) x Q
        (latent dimensions). By default a random permutation of X_data_mean.
    :param X_prior_mean: prior mean used in the KL term of the bound. By default 0.
        Same size as X_data_mean.
    :param X_prior_var: prior variance used in the KL term of the bound. By default 1.
    """
    self.latent_dimensions = 2

    # grab data
    self.data = data_input_to_tensor(data)

    # latent-space initialisation
    if X_data_mean is None:
        X_data_mean = pca_reduce(data, self.latent_dimensions)

    num_data, num_latent_gps = X_data_mean.shape
    self.num_data = num_data

    if X_data_var is None:
        X_data_var = tf.ones((self.num_data, self.latent_dimensions),
                             dtype=default_float())
    assert X_data_var.ndim == 2

    self.output_dim = self.data.shape[-1]  # possibly num_latent

    # default kernel
    if kernel is None:
        kernel = gpflow.kernels.SquaredExponential()

    # init GPModel
    super().__init__(kernel, likelihoods.Gaussian(variance=0.1),
                     num_latent_gps=num_latent_gps)

    # latent-space Parameters
    self.X_data_mean = Parameter(X_data_mean)
    self.X_data_var = Parameter(X_data_var, transform=positive())

    # inducing-point parameter
    if (inducing_variable is None) == (num_inducing_variables is None):
        raise ValueError(
            "BayesianGPLVM needs exactly one of `inducing_variable` and `num_inducing_variables`"
        )
    if inducing_variable is None:
        # By default we initialise with a subset of the initial latent points.
        # Note that tf.random.shuffle returns a copy; it does not shuffle in-place.
        # (k-means clustering would be an alternative.)
        Z = tf.random.shuffle(X_data_mean)[:num_inducing_variables]
        inducing_variable = InducingPoints(Z)
    self.inducing_variable = inducingpoint_wrapper(inducing_variable)

    # loss placeholder for analysis purposes
    self.loss_placeholder = defaultdict(list, {k: [] for k in ("KL_x", "ELBO")})

    # deal with parameters for the prior mean and variance of X
    if X_prior_mean is None:
        X_prior_mean = tf.zeros((self.num_data, self.latent_dimensions),
                                dtype=default_float())
    if X_prior_var is None:
        X_prior_var = tf.ones((self.num_data, self.latent_dimensions),
                              dtype=default_float())
    self.X_prior_mean = tf.convert_to_tensor(np.atleast_1d(X_prior_mean),
                                             dtype=default_float())
    self.X_prior_var = tf.convert_to_tensor(np.atleast_1d(X_prior_var),
                                            dtype=default_float())

    # sanity checks
    assert np.all(X_data_mean.shape == X_data_var.shape)
    assert X_data_mean.shape[0] == self.data.shape[0], "X mean and Y must be same size."
    assert X_data_var.shape[0] == self.data.shape[0], "X var and Y must be same size."
    assert X_data_mean.shape[1] == self.latent_dimensions
    assert self.X_prior_mean.shape[0] == self.num_data
    assert self.X_prior_mean.shape[1] == self.latent_dimensions
    assert self.X_prior_var.shape[0] == self.num_data
    assert self.X_prior_var.shape[1] == self.latent_dimensions
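# Hedged usage sketch for the constructor above, on synthetic data. The class
# name follows the error message in the body; since latent_dimensions is fixed
# at 2, pca_reduce gives X_data_mean the shape [N, 2].
import numpy as np

Y = np.random.randn(100, 5)
model = BayesianGPLVM(Y, num_inducing_variables=20)
print(model.X_data_mean.shape)  # (100, 2)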