def test_q_sqrt_constraints(inducing_points, kernel, mu, white):
    """
    Test that sending in an unconstrained q_sqrt returns the same prior KL
    evaluation and gradients as a triangular-constrained q_sqrt. This is
    important because the KL computation assumes q_sqrt is triangular.
    """
    tril = np.tril(rng.randn(Ln, Nn, Nn))

    q_sqrt_constrained = Parameter(tril, transform=triangular())
    q_sqrt_unconstrained = Parameter(tril)

    diff_before_gradient_step = (q_sqrt_constrained - q_sqrt_unconstrained).numpy()
    assert_allclose(diff_before_gradient_step, 0)

    kls = []
    for q_sqrt in [q_sqrt_constrained, q_sqrt_unconstrained]:
        with tf.GradientTape() as tape:
            kl = prior_kl(inducing_points, kernel, mu, q_sqrt, whiten=white)

        grad = tape.gradient(kl, q_sqrt.unconstrained_variable)
        q_sqrt.unconstrained_variable.assign_sub(grad)
        kls.append(kl)

    diff_kls_before_gradient_step = kls[0] - kls[1]
    assert_allclose(diff_kls_before_gradient_step, 0)

    diff_after_gradient_step = (q_sqrt_constrained - q_sqrt_unconstrained).numpy()
    assert_allclose(diff_after_gradient_step, 0)
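# Minimal illustration of the property the test above relies on (a sketch
# assuming GPflow 2.x, where `triangular()` from gpflow.utilities is the
# FillTriangular bijector): a lower-triangular initial value round-trips
# through a triangular-constrained Parameter unchanged.
import numpy as np
from gpflow import Parameter
from gpflow.utilities import triangular

tril_example = np.tril(np.random.randn(2, 3, 3))
p = Parameter(tril_example, transform=triangular())
np.testing.assert_allclose(p.numpy(), tril_example)  # constrained value equals the input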
def __init__(self, num_data: int, latent_dim: int, means: Optional[np.ndarray] = None):
    """
    Directly parameterise the posterior of the latent variables associated with
    each datapoint with a diagonal multivariate Normal distribution. Note that
    across latent variables we assume a mean-field approximation.

    See :cite:t:`dutordoir2018cde` for a more thorough explanation of
    latent variable models and encoders.

    :param num_data: The number of datapoints, ``N``.
    :param latent_dim: The dimensionality of the latent variable, ``W``.
    :param means: The initialisation of the mean of the latent variable posterior
        distribution (see :attr:`means`). If `None` (the default setting), set to
        ``np.random.randn(N, W) * 0.01``; otherwise, ``means`` should be an array of
        rank two with the shape ``[N, W]``.
    """
    super().__init__()

    if means is None:
        # break the symmetry in the means:
        means = 0.01 * np.random.randn(num_data, latent_dim)
    else:
        if np.any(means.shape != (num_data, latent_dim)):
            raise EncoderInitializationError(
                f"means must have shape [num_data, latent_dim] = [{num_data}, {latent_dim}]; "
                f"got {means.shape} instead."
            )

    # initialise distribution with a small standard deviation, as this has
    # been observed to help fitting:
    stds = 1e-5 * np.ones_like(means)

    # TODO: Rename to `scale` and `loc` to match tfp.distributions
    self.means = Parameter(means, dtype=default_float(), name="w_means")
    self.stds = Parameter(stds, transform=positive(), dtype=default_float(), name="w_stds")
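# Hedged sketch: the `means` and `stds` Parameters above define a mean-field
# Gaussian posterior over the per-datapoint latent variables. Something along
# these lines (the helper name is illustrative; tfd is
# tensorflow_probability.distributions):
import tensorflow_probability as tfp

tfd = tfp.distributions

def latent_posterior(means, stds):
    # independent Normal per datapoint and latent dimension, batch shape [N, W]
    return tfd.Normal(loc=means, scale=stds)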
def __init__(self, active_dims=[0], decay=0.1, max_subsequence_length=3,
             alphabet=[], maxlen=0, batch_size=100):
    super().__init__(active_dims=active_dims)
    # constrain the decay kernel parameter to between 0 and 1
    self.logistic = tfb.Chain([
        tfb.Shift(tf.cast(0, tf.float64))(tfb.Scale(tf.cast(1, tf.float64))),
        tfb.Sigmoid()
    ])
    self.decay_param = Parameter(decay, transform=self.logistic, name="decay")

    # we use copies of the kernel parameters to avoid building an expensive
    # computation graph; gradients are instead calculated efficiently via
    # dynamic programming. These copies are refreshed at every call to K and
    # K_diag (to pick up any parameter updates).
    self.decay = self.decay_param.numpy()
    self.decay_unconstrained = self.decay_param.unconstrained_variable.numpy()

    self.order_coefs = tf.ones(max_subsequence_length, dtype=tf.float64)

    # store additional kernel parameters
    self.max_subsequence_length = tf.constant(max_subsequence_length)
    self.alphabet = tf.constant(alphabet)
    self.alphabet_size = tf.shape(self.alphabet)[0]
    self.maxlen = tf.constant(maxlen)
    self.batch_size = tf.constant(batch_size)

    # build a lookup table of the alphabet to encode input strings
    self.table = tf.lookup.StaticHashTable(
        initializer=tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(["PAD"] + alphabet),
            values=tf.constant(range(0, len(alphabet) + 1)),
        ),
        default_value=0)

    # initialize helpful construction matrices, to be lazily computed when needed
    self.D = None
    self.dD_dgap = None
def __init__(self, rank=1, active_dims=[0], gap_decay=0.1, match_decay=0.9,
             max_subsequence_length=3, alphabet=[], maxlen=0):
    super().__init__(active_dims=active_dims)
    # constrain decay kernel params to between 0 and 1
    logistic_gap = tfb.Chain([
        tfb.Shift(tf.cast(0, tf.float64))(tfb.Scale(tf.cast(1, tf.float64))),
        tfb.Sigmoid()
    ])
    logistic_match = tfb.Chain([
        tfb.AffineScalar(shift=tf.cast(0, tf.float64), scale=tf.cast(1, tf.float64)),
        tfb.Sigmoid()
    ])
    self.gap_decay = Parameter(gap_decay, transform=logistic_gap, name="gap_decay")
    self.match_decay = Parameter(match_decay, transform=logistic_match, name="match_decay")

    # prepare similarity matrix parameters
    self.rank = rank
    W = 0.1 * tf.ones((len(alphabet), self.rank))
    kappa = tf.ones(len(alphabet))
    self.W = Parameter(W, name="W")
    self.kappa = Parameter(kappa, transform=positive(), name="kappa")

    # store additional kernel parameters
    self.max_subsequence_length = tf.constant(max_subsequence_length)
    self.alphabet = tf.constant(alphabet)
    self.alphabet_size = tf.shape(self.alphabet)[0]
    self.maxlen = tf.constant(maxlen)

    # build a lookup table of the alphabet to encode input strings
    self.table = tf.lookup.StaticHashTable(
        initializer=tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(["PAD"] + alphabet),
            values=tf.constant(range(0, len(alphabet) + 1)),
        ),
        default_value=0)
class LinearController(gpflow.Module):
    def __init__(self, state_dim, control_dim, max_action=1.0):
        gpflow.Module.__init__(self)
        self.W = Parameter(np.random.rand(control_dim, state_dim))
        self.b = Parameter(np.random.rand(1, control_dim))
        self.max_action = max_action

    def compute_action(self, m, s, squash=True):
        '''
        Simple affine action: M <- m W^T + b
        IN: mean (m) and variance (s) of the state
        OUT: mean (M) and variance (S) of the action
        '''
        M = m @ tf.transpose(self.W) + self.b   # mean output
        S = self.W @ s @ tf.transpose(self.W)   # output variance
        V = tf.transpose(self.W)                # input-output covariance
        if squash:
            M, S, V2 = squash_sin(M, S, self.max_action)
            V = V @ V2
        return M, S, V

    def randomize(self):
        mean = 0
        sigma = 1
        self.W.assign(mean + sigma * np.random.normal(size=self.W.shape))
        self.b.assign(mean + sigma * np.random.normal(size=self.b.shape))
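# Usage sketch (shapes are illustrative; squash=False avoids the external
# squash_sin helper): propagate a Gaussian state belief through the controller.
import numpy as np
import tensorflow as tf

controller = LinearController(state_dim=3, control_dim=1)
m = tf.constant(np.random.randn(1, 3))       # state mean, [1, state_dim]
s = tf.constant(0.1 * np.eye(3))             # state covariance, [state_dim, state_dim]
M, S, V = controller.compute_action(m, s, squash=False)  # action mean, covariance, in/out covariance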
def __init__(self, active_dims=[0], gap_decay=0.1, match_decay=0.9,
             max_subsequence_length=3, max_occurence_length=10,
             alphabet=[], maxlen=0, normalize=True, batch_size=1000):
    super().__init__(active_dims=active_dims)
    # constrain kernel params to between 0 and 1
    self.logistic_gap = tfb.Chain([
        tfb.AffineScalar(shift=tf.cast(0, tf.float64), scale=tf.cast(1, tf.float64)),
        tfb.Sigmoid()
    ])
    self.logisitc_match = tfb.Chain([
        tfb.AffineScalar(shift=tf.cast(0, tf.float64), scale=tf.cast(1, tf.float64)),
        tfb.Sigmoid()
    ])
    self.gap_decay_param = Parameter(gap_decay, transform=self.logistic_gap, name="gap_decay")
    self.match_decay_param = Parameter(match_decay, transform=self.logisitc_match, name="match_decay")
    self.max_subsequence_length = max_subsequence_length
    self.max_occurence_length = max_occurence_length
    self.alphabet = alphabet
    self.maxlen = maxlen
    self.normalize = normalize
    self.batch_size = batch_size
    self.symmetric = False

    # we use copies of the kernel parameters to avoid building an expensive
    # computation graph; gradients are instead calculated efficiently via
    # dynamic programming. These copies are refreshed at every call to K and
    # K_diag (to pick up any parameter updates).
    self.match_decay = self.match_decay_param.numpy()
    self.gap_decay = self.gap_decay_param.numpy()
    self.match_decay_unconstrained = self.match_decay_param.unconstrained_variable.numpy()
    self.gap_decay_unconstrained = self.gap_decay_param.unconstrained_variable.numpy()

    # initialize helpful construction matrices, to be lazily computed when needed
    self.D = None
    self.dD_dgap = None

    # build a lookup table of the alphabet to encode input strings
    self.table = tf.lookup.StaticHashTable(
        initializer=tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(["PAD"] + alphabet),
            values=tf.constant(range(0, len(alphabet) + 1)),
        ),
        default_value=0)
def __init__(self, state_dim, rewards=[], coefs=None):
    self.state_dim = state_dim
    self.base_rewards = rewards
    if coefs is not None:
        self.coefs = Parameter(coefs, trainable=False)
    else:
        self.coefs = Parameter(np.ones(len(rewards)), dtype=float_type, trainable=False)
def __init__(self, state_dim, W=None, t=None):
    self.state_dim = state_dim
    if W is not None:
        self.W = Parameter(np.reshape(W, (state_dim, state_dim)), trainable=False)
    else:
        self.W = Parameter(np.eye(state_dim), trainable=False)
    if t is not None:
        self.t = Parameter(np.reshape(t, (1, state_dim)), trainable=False)
    else:
        self.t = Parameter(np.zeros((1, state_dim)), trainable=False)
def __init__(self, m=1, active_dims=[0], gap_decay=0.1, match_decay=0.9,
             max_subsequence_length=3, alphabet=[], maxlen=0):
    super().__init__(active_dims=active_dims)
    # constrain decay kernel params to between 0 and 1
    logistic_gap = tfb.Chain([
        tfb.Shift(tf.cast(0, tf.float64))(tfb.Scale(tf.cast(1, tf.float64))),
        tfb.Sigmoid()
    ])
    logistic_match = tfb.Chain([
        tfb.AffineScalar(shift=tf.cast(0, tf.float64), scale=tf.cast(1, tf.float64)),
        tfb.Sigmoid()
    ])
    self.gap_decay = Parameter(gap_decay, transform=logistic_gap, name="gap_decay")
    self.match_decay = Parameter(match_decay, transform=logistic_match, name="match_decay")

    # prepare order coefs params
    order_coefs = tf.ones(max_subsequence_length)
    self.order_coefs = Parameter(order_coefs, transform=positive(), name="order_coefs")

    # get split weights
    self.m = m
    split_weights = tf.ones(2 * self.m - 1)
    self.split_weights = Parameter(split_weights, transform=positive(), name="split_weights")

    # store additional kernel parameters
    self.max_subsequence_length = tf.constant(max_subsequence_length)
    self.alphabet = tf.constant(alphabet)
    self.alphabet_size = tf.shape(self.alphabet)[0]
    self.maxlen = tf.cast(tf.math.ceil(maxlen / self.m), dtype=tf.int32)
    self.full_maxlen = tf.constant(maxlen)

    # build a lookup table of the alphabet to encode input strings
    self.table = tf.lookup.StaticHashTable(
        initializer=tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(["PAD"] + alphabet),
            values=tf.constant(range(0, len(alphabet) + 1)),
        ),
        default_value=0)
def __init__(self, state_dim, weight_matrix, target_state):
    """
    :param state_dim: dimensionality of observation.
    :param weight_matrix: numpy array giving the pre-defined weight matrix which sets
        the sensitivity of the reward to the different dimensions.
    :param target_state: numpy array giving the target state.
    """
    self.state_dim = state_dim
    self.weight_matrix = Parameter(
        np.reshape(weight_matrix, (state_dim, state_dim)), trainable=False)
    self.target_state = Parameter(
        np.reshape(target_state, (1, state_dim)), trainable=False)
def __init__(self, data, kernel, X=None, likelihood_variance=1e-4):
    gpflow.Module.__init__(self)
    if X is None:
        self.X = Parameter(data[0], name="DataX", dtype=gpflow.default_float())
    else:
        self.X = X
    self.Y = Parameter(data[1], name="DataY", dtype=gpflow.default_float())
    self.data = [self.X, self.Y]
    self.kernel = kernel
    self.likelihood = gpflow.likelihoods.Gaussian()
    self.likelihood.variance.assign(likelihood_variance)
    set_trainable(self.likelihood.variance, False)
def init_variational_params(self, num_inducing):
    q_mu = np.zeros((num_inducing, self.num_kernels, self.num_latent_gps))  # M x K x O
    self.q_mu = Parameter(q_mu, dtype=default_float())

    q_sqrt = []
    for _ in range(self.num_kernels):
        q_sqrt.append([
            np.eye(num_inducing, dtype=default_float())
            for _ in range(self.num_latent_gps)
        ])
    q_sqrt = np.array(q_sqrt)
    self.q_sqrt = Parameter(q_sqrt, transform=triangular())  # K x O x M x M
def __init__(self, data: RegressionData, kernel, noise_variance: float = 1.0, parallel=False, max_parallel=10000): self.noise_variance = Parameter(noise_variance, transform=positive()) ts, ys = data_input_to_tensor(data) super().__init__(kernel, None, None, num_latent_gps=ys.shape[-1]) self.data = ts, ys filter_spec = kernel.get_spec(ts.shape[0]) filter_ys_spec = tf.TensorSpec((ts.shape[0], 1), config.default_float()) smoother_spec = kernel.get_spec(None) smoother_ys_spec = tf.TensorSpec((None, 1), config.default_float()) if not parallel: self._kf = tf.function( partial(kf, return_loglikelihood=True, return_predicted=False), input_signature=[filter_spec, filter_ys_spec]) self._kfs = tf.function( kfs, input_signature=[smoother_spec, smoother_ys_spec]) else: self._kf = tf.function( partial(pkf, return_loglikelihood=True, max_parallel=ts.shape[0]), input_signature=[filter_spec, filter_ys_spec]) self._kfs = tf.function( partial(pkfs, max_parallel=max_parallel), input_signature=[smoother_spec, smoother_ys_spec])
def __init__(self, variance, lengthscales, name='Kernel', active_dims=None):
    """ Kernel Constructor.

    Args:
        variance: An (L,L) symmetric, positive definite matrix for the signal variance.
        lengthscales: An (L,M) matrix of positive lengthscales.
        name: The name of this kernel.
        active_dims: Which of the input dimensions are used. The default None means all of them.
    """
    super(AnisotropicStationary, self).__init__(
        active_dims=active_dims, name=name
    )  # Do not call gf.kernels.AnisotropicStationary.__init__()!
    self.variance = Variance(value=np.atleast_2d(variance), name=name + 'Variance')
    self._L = self.variance.shape[0]
    lengthscales = data_input_to_tensor(lengthscales)
    lengthscales_shape = tuple(tf.shape(lengthscales).numpy())
    self._M = (1 if lengthscales_shape in ((), (1,), (1, 1), (self._L,))
               else lengthscales_shape[-1])
    lengthscales = tf.reshape(
        tf.broadcast_to(lengthscales, (self._L, self._M)), (self._L, 1, self._M))
    self.lengthscales = Parameter(lengthscales, transform=positive(),
                                  trainable=False, name=name + 'Lengthscales')
    self._validate_ard_active_dims(self.lengthscales[0, 0])
def __init__(self, value, name: str = 'Variance',
             cholesky_diagonal_lower_bound: float = CHOLESKY_DIAGONAL_LOWER_BOUND):
    """ Construct a non-diagonal covariance matrix. Mutable only through its properties
    cholesky_diagonal and cholesky_lower_triangle.

    Args:
        value: A symmetric, positive definite matrix, expressed in tensorflow or numpy.
        name: The name of this covariance matrix.
        cholesky_diagonal_lower_bound: Lower bound on the diagonal of the Cholesky decomposition.
    """
    super().__init__(name=name)
    value = data_input_to_tensor(value)
    self._shape = (value.shape[-1], value.shape[-1])
    self._broadcast_shape = (value.shape[-1], 1, value.shape[-1], 1)
    if value.shape != self._shape:
        raise ValueError('Variance must have shape (L,L).')

    cholesky = tf.linalg.cholesky(value)
    self._cholesky_diagonal = tf.linalg.diag_part(cholesky)
    if min(self._cholesky_diagonal) <= cholesky_diagonal_lower_bound:
        raise ValueError(
            f'The Cholesky diagonal of {name} must be strictly greater than '
            f'{cholesky_diagonal_lower_bound}.'
        )
    self._cholesky_diagonal = Parameter(
        self._cholesky_diagonal,
        transform=positive(lower=cholesky_diagonal_lower_bound),
        name=name + '.cholesky_diagonal')

    mask = sum([
        list(range(i * self._shape[0], i * (self._shape[0] + 1)))
        for i in range(1, self._shape[0])
    ], start=[])
    self._cholesky_lower_triangle = Parameter(
        tf.gather(tf.reshape(cholesky, [-1]), mask),
        name=name + '.cholesky_lower_triangle')

    self._row_lengths = tuple(range(self._shape[0]))
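# Hedged sketch of the `mask` used above: it selects the strictly-lower-triangular
# entries of the flattened L x L Cholesky factor, row by row (L=3 is illustrative).
import numpy as np

L = 3
chol = np.tril(np.random.randn(L, L))
mask = sum([list(range(i * L, i * (L + 1))) for i in range(1, L)], start=[])
strictly_lower = chol.reshape(-1)[mask]       # entries left of the diagonal, rows 1..L-1
assert len(strictly_lower) == L * (L - 1) // 2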
def test_q_sqrt_constraints(Xdata, Xnew, kernel, mu, white):
    """
    Test that sending in an unconstrained q_sqrt returns the same conditional
    evaluation and gradients. This is important to match the behaviour of the
    KL, which enforces q_sqrt is triangular.
    """
    tril = np.tril(rng.randn(Ln, Nn, Nn))

    q_sqrt_constrained = Parameter(tril, transform=triangular())
    q_sqrt_unconstrained = Parameter(tril)

    diff_before_gradient_step = (q_sqrt_constrained - q_sqrt_unconstrained).numpy()
    assert_allclose(diff_before_gradient_step, 0)

    Fstars = []
    for q_sqrt in [q_sqrt_constrained, q_sqrt_unconstrained]:
        with tf.GradientTape() as tape:
            _, Fstar_var = conditional(Xnew, Xdata, kernel, mu, q_sqrt=q_sqrt, white=white)

        grad = tape.gradient(Fstar_var, q_sqrt.unconstrained_variable)
        q_sqrt.unconstrained_variable.assign_sub(grad)
        Fstars.append(Fstar_var)

    diff_Fstar_before_gradient_step = Fstars[0] - Fstars[1]
    assert_allclose(diff_Fstar_before_gradient_step, 0)

    diff_after_gradient_step = (q_sqrt_constrained - q_sqrt_unconstrained).numpy()
    assert_allclose(diff_after_gradient_step, 0)
def create_models(self, data):
    self.models = []
    for i in range(self.num_outputs):
        kernel = gpflow.kernels.SquaredExponential(
            lengthscales=tf.ones([data[0].shape[1]], dtype=float_type))
        transformed_lengthscales = Parameter(
            kernel.lengthscales, transform=positive(lower=1e-3))
        kernel.lengthscales = transformed_lengthscales
        kernel.lengthscales.prior = tfd.Gamma(f64(1.1), f64(1 / 10.0))
        if i == 0:
            self.models.append(
                FakeGPR((data[0], data[1][:, i:i + 1]), kernel))
        else:
            self.models.append(
                FakeGPR((data[0], data[1][:, i:i + 1]), kernel, self.models[-1].X))
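# The pattern above (wrapping an existing parameter in a new Parameter to
# change its transform) is the usual GPflow 2 way to add a lower bound after
# construction; a minimal standalone sketch, assuming gpflow is importable:
import tensorflow as tf
import gpflow
from gpflow import Parameter
from gpflow.utilities import positive

k = gpflow.kernels.SquaredExponential(lengthscales=tf.ones([3], dtype=tf.float64))
k.lengthscales = Parameter(k.lengthscales, transform=positive(lower=1e-3))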
class BayesianDenseLayer(TrackableLayer): """A Bayesian dense layer for variational Bayesian neural networks""" def __init__( self, input_dim: int, output_dim: int, num_data: int, w_mu: Optional[np.ndarray] = None, w_sqrt: Optional[np.ndarray] = None, activation: Optional[Callable] = None, is_mean_field: bool = True, temperature: float = 1e-4, returns_samples: bool = True, ): """ A Bayesian dense layer for variational Bayesian neural nets. This layer holds the weight mean and sqrt as well as the temperature for cooling (or heating) the posterior. :param input_dim: The layer's input dimension (excluding bias) :param output_dim: The layer's output dimension :param num_data: number of data points :param w_mu: Initial value of the variational mean (weights + bias) :param w_sqrt: Initial value of the variational Cholesky (covering weights + bias) :param activation: The type of activation function (None is linear) :param is_mean_field: Determines mean field approximation of the weight posterior :param temperature: For cooling or heating the posterior :param returns_samples: If True, return samples on calling the layer, Else return mean and variance """ super().__init__(dtype=default_float()) assert input_dim >= 1 assert output_dim >= 1 assert num_data >= 1 if w_mu is not None: # add + 1 for the bias assert w_mu.shape == ((input_dim + 1) * output_dim,) if w_sqrt is not None: if not is_mean_field: assert w_sqrt.shape == ( (input_dim + 1) * output_dim, (input_dim + 1) * output_dim, ) else: assert w_sqrt.shape == ((input_dim + 1) * output_dim,) assert temperature > 0.0 self.input_dim = input_dim self.output_dim = output_dim self.num_data = num_data self.w_mu_ini = w_mu self.w_sqrt_ini = w_sqrt self.activation = activation self.is_mean_field = is_mean_field self.temperature = temperature self.returns_samples = returns_samples self.dim = (input_dim + 1) * output_dim self.full_output_cov = False self.full_cov = False self.w_mu = Parameter(np.zeros((self.dim,)), dtype=default_float(), name="w_mu") # [dim] self.w_sqrt = Parameter( np.zeros((self.dim, self.dim)) if not self.is_mean_field else np.ones((self.dim,)), transform=triangular() if not self.is_mean_field else positive(), dtype=default_float(), name="w_sqrt", ) # [dim, dim] or [dim] def initialize_variational_distribution(self) -> None: if self.w_mu_ini is None: w = xavier_initialization_numpy(self.input_dim, self.output_dim) b = np.zeros((1, self.output_dim)) self.w_mu_ini = np.concatenate((w, b), axis=0).reshape((self.dim,)) self.w_mu.assign(self.w_mu_ini) if self.w_sqrt_ini is None: if not self.is_mean_field: self.w_sqrt_ini = 1e-5 * np.eye(self.dim) else: self.w_sqrt_ini = 1e-5 * np.ones((self.dim,)) self.w_sqrt.assign(self.w_sqrt_ini) def build(self, input_shape: ShapeType) -> None: """Build the variables necessary on first call""" super().build(input_shape) self.initialize_variational_distribution() def predict_samples( self, inputs: TensorType, *, num_samples: Optional[int] = None, full_output_cov: bool = False, full_cov: bool = False, whiten: bool = False, ) -> tf.Tensor: """ Make a sample predictions at N test inputs, with input_dim = D, output_dim = Q. Return a sample, and the conditional mean and covariance at these points. :param inputs: the inputs to predict at. shape [N, D] :param num_samples: the number of samples S, to draw. shape [S, N, Q] if S is not None else [N, Q]. 
:param full_output_cov: assert to False since not supported for now :param full_cov: assert to False since not supported for now :param whiten: assert to False since not sensible in Bayesian neural nets """ assert full_output_cov is False assert full_cov is False assert whiten is False _num_samples = num_samples or 1 z = tf.random.normal((self.dim, _num_samples), dtype=default_float()) # [dim, S] if not self.is_mean_field: w = self.w_mu[:, None] + tf.matmul(self.w_sqrt, z) # [dim, S] else: w = self.w_mu[:, None] + self.w_sqrt[:, None] * z # [dim, S] N = tf.shape(inputs)[0] inputs_concat_1 = tf.concat( (inputs, tf.ones((N, 1), dtype=default_float())), axis=-1 ) # [N, D+1] samples = tf.tensordot( inputs_concat_1, tf.reshape(tf.transpose(w), (_num_samples, self.input_dim + 1, self.output_dim)), [[-1], [1]], ) # [N, S, Q] if num_samples is None: samples = tf.squeeze(samples, axis=-2) # [N, Q] else: samples = tf.transpose(samples, perm=[1, 0, 2]) # [S, N, Q] if self.activation is not None: samples = self.activation(samples) return samples def call( self, inputs: TensorType, training: Optional[bool] = False ) -> Union[tf.Tensor, MeanAndVariance]: """The default behaviour upon calling the BayesianDenseLayer()(X)""" sample = self.predict_samples( inputs, num_samples=None, full_output_cov=self.full_output_cov, full_cov=self.full_cov, ) # TF quirk: add_loss must add a tensor to compile if training: # Wenzel et al. 2020: How good is the Bayes posterior in DNNs really? loss = self.temperature * self.prior_kl() else: loss = tf.constant(0.0, dtype=default_float()) loss_per_datapoint = loss / self.num_data self.add_loss(loss_per_datapoint) # for latent layers, return samples if self.returns_samples: return sample # [N, Q] # for output layers, return samples as mean with 0 cov return sample, tf.ones_like(sample) * 1e-10 # [N, Q], [N, Q] def prior_kl(self) -> tf.Tensor: """ The KL divergence from the variational distribution to the prior :return: KL divergence from N(w_mu, w_sqrt) to N(0, I) """ return gauss_kl( self.w_mu[:, None], self.w_sqrt[None] if not self.is_mean_field else self.w_sqrt[:, None], )
class StringKernel(Kernel): """ Code to run the SSK of Moss et al. 2020 with gpflow with hyperparameters: 1) match_decay float decrease the contribution of long subsequences 2) gap_decay float decrease the contribtuion of subsequences with large gaps (penalize non-contiguous) 3) max_subsequence_length int largest subsequence considered 4) max_occurence_length int longest non-contiguous occurences of subsequences considered (max_occurence_length > max_subsequence_length) We calculate gradients for match_decay and gap_decay w.r.t kernel hyperparameters following Beck (2017) We recommend normalize = True to allow meaningful comparrison of strings of different length """ def __init__(self, active_dims=[0], gap_decay=0.1, match_decay=0.9, max_subsequence_length=3, max_occurence_length=10, alphabet=[], maxlen=0, normalize=True, batch_size=1000): super().__init__(active_dims=active_dims) # constrain kernel params to between 0 and 1 self.logistic_gap = tfb.Chain([ tfb.AffineScalar(shift=tf.cast(0, tf.float64), scale=tf.cast(1, tf.float64)), tfb.Sigmoid() ]) self.logisitc_match = tfb.Chain([ tfb.AffineScalar(shift=tf.cast(0, tf.float64), scale=tf.cast(1, tf.float64)), tfb.Sigmoid() ]) self.gap_decay_param = Parameter(gap_decay, transform=self.logistic_gap, name="gap_decay") self.match_decay_param = Parameter(match_decay, transform=self.logisitc_match, name="match_decay") self.max_subsequence_length = max_subsequence_length self.max_occurence_length = max_occurence_length self.alphabet = alphabet self.maxlen = maxlen self.normalize = normalize self.batch_size = batch_size self.symmetric = False # use will use copies of the kernel params to stop building expensive computation graph # we instead efficientely calculate gradients using dynamic programming # These params are updated at every call to K and K_diag (to check if parameters have been updated) self.match_decay = self.match_decay_param.numpy() self.gap_decay = self.gap_decay_param.numpy() self.match_decay_unconstrained = self.match_decay_param.unconstrained_variable.numpy( ) self.gap_decay_unconstrained = self.gap_decay_param.unconstrained_variable.numpy( ) # initialize helful construction matricies to be lazily computed once needed self.D = None self.dD_dgap = None # build a lookup table of the alphabet to encode input strings self.table = tf.lookup.StaticHashTable( initializer=tf.lookup.KeyValueTensorInitializer( keys=tf.constant(["PAD"] + alphabet), values=tf.constant(range(0, len(alphabet) + 1)), ), default_value=0) def K_diag(self, X): r""" Calc just the diagonal elements of a kernel matrix """ # check if string is not longer than max length if tf.reduce_max(tf.strings.length(X)) + 1 > 2 * self.maxlen: raise ValueError( "An input string is longer that max-length so refit the kernel with a larger maxlen param" ) if self.normalize: # if normalizing then diagonal will just be ones return tf.cast(tf.fill(tf.shape(X)[:-1], 1), tf.float64) else: # otherwise have to calc kernel elements # Turn inputs into lists of integers using one-hot embedding and pad until all same length X = tf.strings.split(tf.squeeze(X, 1)).to_tensor( "PAD", shape=[None, self.maxlen]) X = self.table.lookup(X) # prep required quantities and check kernel parameters self._precalc() # Proceed with kernel matrix calculations in batches k_results = tf.TensorArray(tf.float64, size=0, dynamic_size=True, infer_shape=False) num_batches = tf.math.ceil(tf.shape(X)[0] / self.batch_size) # iterate through batches for i in tf.range( tf.cast(tf.math.ceil(tf.shape(X)[0] / self.batch_size), 
dtype=tf.int32)): X_batch = X[self.batch_size * i:self.batch_size * (i + 1)] k_results = k_results.write(k_results.size(), self._k(X_batch, X_batch)) # collect all batches return tf.reshape(k_results.concat(), (-1, )) def K(self, X, X2=None): r""" Now we calculate the kernel values and kernel gradients Efficientely calculating kernel gradients requires dynamic programming and so we 'turn off' autograd and calculate manually We currently only bother calculating the kernel gradients for gram matricies i.e (when X=X2) as required when fitting the model. For predictions (where X != X2) we do not calculate gradients """ if X2 is None: self.symmetric = True k_results = self.K_calc(X, X) else: self.symmetric = False k_results = self.K_calc(X, X2) return k_results def _precalc(self): r""" Update stored kernel params (incase they have changed) and precalc D and dD_dgap as required for kernel calcs following notation from Beck (2017) """ self.match_decay = self.match_decay_param.numpy() self.gap_decay = self.gap_decay_param.numpy() self.match_decay_unconstrained = self.match_decay_param.unconstrained_variable.numpy( ) self.gap_decay_unconstrained = self.gap_decay_param.unconstrained_variable.numpy( ) tril = tf.linalg.band_part( tf.ones((self.maxlen, self.maxlen), dtype=tf.float64), -1, 0) # get upper triangle matrix of increasing intergers values = tf.TensorArray(tf.int32, size=self.maxlen) for i in tf.range(self.maxlen): values = values.write(i, tf.range(-i - 1, self.maxlen - 1 - i)) power = tf.cast(values.stack(), tf.float64) values.close() power = tf.linalg.band_part(power, 0, -1) - tf.linalg.band_part( power, 0, 0) + tril tril = tf.transpose( tf.linalg.band_part( tf.ones((self.maxlen, self.maxlen), dtype=tf.float64), self.max_occurence_length, 0)) - tf.eye(self.maxlen, dtype=tf.float64) gaps = tf.fill([self.maxlen, self.maxlen], self.gap_decay) self.D = tf.pow(gaps * tril, power) self.dD_dgap = tf.pow((tril * gaps), (power - 1.0)) * tril * power @tf.custom_gradient def K_calc(self, X, X2): r""" Calc the elements of the kernel matrix (and gradients if symmetric) """ # check if input strings are longer than max allowed length if (tf.reduce_max(tf.strings.length(X)) + 1 > 2 * self.maxlen) or ( tf.reduce_max(tf.strings.length(X2)) + 1 > 2 * self.maxlen): raise ValueError( "An input string is longer that max-length so refit the kernel with a larger maxlen param" ) # Turn our inputs into lists of integers using one-hot embedding # first split up strings and pad to fixed length and prep for gpu # pad until all have length of self.maxlen X = tf.strings.split(tf.squeeze(X, 1)).to_tensor( "PAD", shape=[None, self.maxlen]) X = self.table.lookup(X) if self.symmetric: X2 = X else: # pad until all have length of self.maxlen X2 = tf.strings.split(tf.squeeze(X2, 1)).to_tensor( "PAD", shape=[None, self.maxlen]) X2 = self.table.lookup(X2) # get the decay tensors D and dD_dgap self._precalc() # get indicies of all possible pairings from X and X2 # this way allows maximum number of kernel calcs to be squished onto the GPU (rather than just doing individual rows of gram) indicies_2, indicies_1 = tf.meshgrid(tf.range(0, tf.shape(X2)[0]), tf.range(0, tf.shape(X)[0])) indicies = tf.concat( [tf.reshape(indicies_1, (-1, 1)), tf.reshape(indicies_2, (-1, 1))], axis=1) # if symmetric then only calc upper matrix (fill in rest later) if self.symmetric: indicies = tf.boolean_mask( indicies, tf.greater_equal(indicies[:, 1], indicies[:, 0])) # make kernel calcs in batches num_batches = tf.math.ceil(tf.shape(indicies)[0] / 
self.batch_size) # iterate through batches if self.symmetric: k_results = tf.TensorArray(tf.float64, size=0, dynamic_size=True, infer_shape=False) gap_grads = tf.TensorArray(tf.float64, size=0, dynamic_size=True, infer_shape=False) match_grads = tf.TensorArray(tf.float64, size=0, dynamic_size=True, infer_shape=False) for i in tf.range( tf.cast(tf.math.ceil( tf.shape(indicies)[0] / self.batch_size), dtype=tf.int32)): indicies_batch = indicies[self.batch_size * i:self.batch_size * (i + 1)] X_batch = tf.gather(X, indicies_batch[:, 0], axis=0) X2_batch = tf.gather(X2, indicies_batch[:, 1], axis=0) results = self._k_grads(X_batch, X2_batch) k_results = k_results.write(k_results.size(), results[0]) gap_grads = gap_grads.write(gap_grads.size(), results[1]) match_grads = match_grads.write(match_grads.size(), results[2]) # combine indivual kernel results k_results = tf.reshape(k_results.concat(), [1, -1]) gap_grads = tf.reshape(gap_grads.concat(), [1, -1]) match_grads = tf.reshape(match_grads.concat(), [1, -1]) else: k_results = tf.TensorArray(tf.float64, size=0, dynamic_size=True, infer_shape=False) for i in tf.range( tf.cast(tf.math.ceil( tf.shape(indicies)[0] / self.batch_size), dtype=tf.int32)): indicies_batch = indicies[self.batch_size * i:self.batch_size * (i + 1)] X_batch = tf.gather(X, indicies_batch[:, 0], axis=0) X2_batch = tf.gather(X2, indicies_batch[:, 1], axis=0) k_results = k_results.write(k_results.size(), self._k(X_batch, X2_batch)) # combine indivual kernel results k_results = tf.reshape(k_results.concat(), [1, -1]) # put results into the right places in the gram matrix # if symmetric then only put in top triangle (inc diag) if self.symmetric: mask = tf.linalg.band_part( tf.ones((tf.shape(X)[0], tf.shape(X)[0]), dtype=tf.int64), 0, -1) non_zero = tf.not_equal(mask, tf.constant(0, dtype=tf.int64)) indices = tf.where( non_zero) # Extracting the indices of upper triangle elements out = tf.SparseTensor(indices, tf.squeeze(k_results), dense_shape=tf.cast( (tf.shape(X)[0], tf.shape(X)[0]), dtype=tf.int64)) k_results = tf.sparse.to_dense(out) out = tf.SparseTensor(indices, tf.squeeze(gap_grads), dense_shape=tf.cast( (tf.shape(X)[0], tf.shape(X)[0]), dtype=tf.int64)) gap_grads = tf.sparse.to_dense(out) out = tf.SparseTensor(indices, tf.squeeze(match_grads), dense_shape=tf.cast( (tf.shape(X)[0], tf.shape(X)[0]), dtype=tf.int64)) match_grads = tf.sparse.to_dense(out) #add in mising elements (lower diagonal) k_results = k_results + tf.linalg.set_diag( tf.transpose(k_results), tf.zeros(tf.shape(X)[0], dtype=tf.float64)) gap_grads = gap_grads + tf.linalg.set_diag( tf.transpose(gap_grads), tf.zeros(tf.shape(X)[0], dtype=tf.float64)) match_grads = match_grads + tf.linalg.set_diag( tf.transpose(match_grads), tf.zeros(tf.shape(X)[0], dtype=tf.float64)) else: k_results = tf.reshape( k_results, [tf.shape(X)[0], tf.shape(X2)[0]]) # normalize if required if self.normalize: if self.symmetric: # if symmetric then can extract normalization terms from gram X_diag_Ks = tf.linalg.diag_part(k_results) X_diag_gap_grads = tf.linalg.diag_part(gap_grads) X_diag_match_grads = tf.linalg.diag_part(match_grads) # norm for kernel entries norm = tf.tensordot(X_diag_Ks, X_diag_Ks, axes=0) k_results = tf.divide(k_results, tf.sqrt(norm)) # norm for gap_decay and match_decay grads diff_gap = tf.divide( tf.tensordot(X_diag_gap_grads, X_diag_Ks, axes=0) + tf.tensordot(X_diag_Ks, X_diag_gap_grads, axes=0), 2 * norm) diff_match = tf.divide( tf.tensordot(X_diag_match_grads, X_diag_Ks, axes=0) + tf.tensordot(X_diag_Ks, 
X_diag_match_grads, axes=0), 2 * norm) gap_grads = tf.divide(gap_grads, tf.sqrt(norm)) - tf.multiply( k_results, diff_gap) match_grads = tf.divide(match_grads, tf.sqrt(norm)) - tf.multiply( k_results, diff_match) else: # if not symmetric then need to calculate some extra kernel calcs # get diagonal kernel calcs for X1 X_diag_Ks = tf.TensorArray(tf.float64, size=0, dynamic_size=True, infer_shape=False) num_batches = tf.math.ceil(tf.shape(X)[0] / self.batch_size) # iterate through batches for i in tf.range( tf.cast(tf.math.ceil(tf.shape(X)[0] / self.batch_size), dtype=tf.int32)): X_batch = X[self.batch_size * i:self.batch_size * (i + 1)] X_diag_Ks = X_diag_Ks.write(X_diag_Ks.size(), self._k(X_batch, X_batch)) # collect up all batches X_diag_Ks = tf.reshape(X_diag_Ks.concat(), (-1, )) # get diagonal kernel calcs for X2 X2_diag_Ks = tf.TensorArray(tf.float64, size=0, dynamic_size=True, infer_shape=False) num_batches = tf.math.ceil(tf.shape(X2)[0] / self.batch_size) # iterate through batches for i in tf.range( tf.cast(tf.math.ceil( tf.shape(X2)[0] / self.batch_size), dtype=tf.int32)): X2_batch = X2[self.batch_size * i:self.batch_size * (i + 1)] X2_diag_Ks = X2_diag_Ks.write(X2_diag_Ks.size(), self._k(X2_batch, X2_batch)) # collect up all batches X2_diag_Ks = tf.reshape(X2_diag_Ks.concat(), (-1, )) # norm for kernel entries norm = tf.tensordot(X_diag_Ks, X2_diag_Ks, axes=0) k_results = tf.divide(k_results, tf.sqrt(norm)) def grad(dy, variables=None): if self.symmetric: # get gradients of unconstrained params grads = {} grads['gap_decay:0'] = tf.reduce_sum( tf.multiply( dy, gap_grads * tf.math.exp( self.logistic_gap.forward_log_det_jacobian( self.gap_decay_unconstrained, 0)))) grads['match_decay:0'] = tf.reduce_sum( tf.multiply( dy, match_grads * tf.math.exp( self.logisitc_match.forward_log_det_jacobian( self.match_decay_unconstrained, 0)))) gradient = [grads[v.name] for v in variables] return ((None, None), gradient) else: return ((None, None), [None, None]) return k_results, grad def _k_grads(self, X1, X2): r""" Vectorized kernel calc and kernel grad calc. Following notation from Beck (2017), i.e have tensors S,D,Kpp,Kp Input is two tensors of shape (# strings , # characters) and we calc the pair-wise kernel calcs between the elements (i.e n kern calcs for two lists of length n) D is the tensor than unrolls the recursion and allows vecotrizaiton """ # turn into one-hot i.e. shape (# strings, #characters+1, alphabet size) X1 = tf.one_hot(X1, len(self.alphabet) + 1, dtype=tf.float64) X2 = tf.one_hot(X2, len(self.alphabet) + 1, dtype=tf.float64) # remove the ones in the first column that encode the padding (i.e we dont want them to count as a match) paddings = tf.constant([[0, 0], [0, 0], [0, len(self.alphabet)]]) X1 = X1 - tf.pad(tf.expand_dims(X1[:, :, 0], 2), paddings, "CONSTANT") X2 = X2 - tf.pad(tf.expand_dims(X2[:, :, 0], 2), paddings, "CONSTANT") # store squared match coef match_sq = tf.square(self.match_decay) # Make S: the similarity tensor of shape (# strings, #characters, # characters) S = tf.matmul(X1, tf.transpose(X2, perm=(0, 2, 1))) # Main loop, where Kp, Kpp values and gradients are calculated. 
Kp = tf.TensorArray(tf.float64, size=0, dynamic_size=True, clear_after_read=False) dKp_dgap = tf.TensorArray(tf.float64, size=0, dynamic_size=True, clear_after_read=False) dKp_dmatch = tf.TensorArray(tf.float64, size=0, dynamic_size=True, clear_after_read=False) Kp = Kp.write( Kp.size(), tf.ones(shape=tf.stack([tf.shape(X1)[0], self.maxlen, self.maxlen]), dtype=tf.float64)) dKp_dgap = dKp_dgap.write( dKp_dgap.size(), tf.zeros(shape=tf.stack( [tf.shape(X1)[0], self.maxlen, self.maxlen]), dtype=tf.float64)) dKp_dmatch = dKp_dmatch.write( dKp_dmatch.size(), tf.zeros(shape=tf.stack( [tf.shape(X1)[0], self.maxlen, self.maxlen]), dtype=tf.float64)) # calc subkernels for each subsequence length for i in tf.range(0, self.max_subsequence_length - 1): Kp_temp = tf.multiply(S, Kp.read(i)) Kp_temp0 = match_sq * Kp_temp Kp_temp1 = tf.matmul(Kp_temp0, self.D) Kp_temp2 = tf.matmul(self.D, Kp_temp1, transpose_a=True) Kp = Kp.write(Kp.size(), Kp_temp2) dKp_dgap_temp_1 = tf.matmul(self.dD_dgap, Kp_temp1, transpose_a=True) dKp_dgap_temp_2 = tf.multiply(S, dKp_dgap.read(i)) dKp_dgap_temp_2 = dKp_dgap_temp_2 * match_sq dKp_dgap_temp_2 = tf.matmul(dKp_dgap_temp_2, self.D) dKp_dgap_temp_2 = dKp_dgap_temp_2 + tf.matmul( Kp_temp0, self.dD_dgap) dKp_dgap_temp_2 = tf.matmul(self.D, dKp_dgap_temp_2, transpose_a=True) dKp_dgap = dKp_dgap.write(dKp_dgap.size(), dKp_dgap_temp_1 + dKp_dgap_temp_2) dKp_dmatch_temp_1 = 2 * tf.divide(Kp_temp2, self.match_decay) dKp_dmatch_temp_2 = tf.multiply(S, dKp_dmatch.read(i)) dKp_dmatch_temp_2 = dKp_dmatch_temp_2 * match_sq dKp_dmatch_temp_2 = tf.matmul(dKp_dmatch_temp_2, self.D) dKp_dmatch_temp_2 = tf.matmul(self.D, dKp_dmatch_temp_2, transpose_a=True) dKp_dmatch = dKp_dmatch.write( dKp_dmatch.size(), dKp_dmatch_temp_1 + dKp_dmatch_temp_2) # Final calculation. We gather all Kps Kp_stacked = Kp.stack() Kp.close() dKp_dgap_stacked = dKp_dgap.stack() dKp_dgap.close() dKp_dmatch_stacked = dKp_dmatch.stack() dKp_dmatch.close() # get k temp = tf.multiply(S, Kp_stacked) temp = tf.reduce_sum(temp, -1) sum2 = tf.reduce_sum(temp, -1) Ki = sum2 * match_sq k = tf.reduce_sum(Ki, 0) k = tf.expand_dims(k, 1) # get gap decay grads temp = tf.multiply(S, dKp_dgap_stacked) temp = tf.reduce_sum(temp, -1) temp = tf.reduce_sum(temp, -1) temp = temp * match_sq dk_dgap = tf.reduce_sum(temp, 0) dk_dgap = tf.expand_dims(dk_dgap, 1) # get match decay grads temp = tf.multiply(S, dKp_dmatch_stacked) temp = tf.reduce_sum(temp, -1) temp = tf.reduce_sum(temp, -1) temp = temp * match_sq temp = temp + 2 * self.match_decay * sum2 dk_dmatch = tf.reduce_sum(temp, 0) dk_dmatch = tf.expand_dims(dk_dmatch, 1) return k, dk_dgap, dk_dmatch def _k(self, X1, X2): r""" Vectorized kernel calc. Following notation from Beck (2017), i.e have tensors S,D,Kpp,Kp Input is two tensors of shape (# strings , # characters) and we calc the pair-wise kernel calcs between the elements (i.e n kern calcs for two lists of length n) D is the tensor than unrolls the recursion and allows vecotrizaiton """ # turn into one-hot i.e. 
shape (# strings, #characters+1, alphabet size) X1 = tf.one_hot(X1, len(self.alphabet) + 1, dtype=tf.float64) X2 = tf.one_hot(X2, len(self.alphabet) + 1, dtype=tf.float64) # remove the ones in the first column that encode the padding (i.e we dont want them to count as a match) paddings = tf.constant([[0, 0], [0, 0], [0, len(self.alphabet)]]) X1 = X1 - tf.pad(tf.expand_dims(X1[:, :, 0], 2), paddings, "CONSTANT") X2 = X2 - tf.pad(tf.expand_dims(X2[:, :, 0], 2), paddings, "CONSTANT") # store squared match coef match_sq = tf.square(self.match_decay) # Make S: the similarity tensor of shape (# strings, #characters, # characters) S = tf.matmul(X1, tf.transpose(X2, perm=(0, 2, 1))) # Main loop, where Kp, Kpp values and gradients are calculated. Kp = tf.TensorArray(tf.float64, size=0, dynamic_size=True, clear_after_read=False) Kp = Kp.write( Kp.size(), tf.ones(shape=tf.stack([tf.shape(X1)[0], self.maxlen, self.maxlen]), dtype=tf.float64)) # calc subkernels for each subsequence length for i in tf.range(0, self.max_subsequence_length - 1): temp = tf.multiply(S, Kp.read(i)) temp = tf.matmul(temp, self.D) temp = tf.matmul(self.D, temp, transpose_a=True) temp = match_sq * temp Kp = Kp.write(Kp.size(), temp) # Final calculation. We gather all Kps Kp_stacked = Kp.stack() Kp.close() # Get k aux = tf.multiply(S, Kp_stacked) aux = tf.reduce_sum(aux, -1) sum2 = tf.reduce_sum(aux, -1) Ki = tf.multiply(sum2, match_sq) k = tf.reduce_sum(Ki, 0) k = tf.expand_dims(k, 1) return k
def __init__(self, state_dim, W):
    self.state_dim = state_dim
    self.W = Parameter(np.reshape(W, (state_dim, 1)), trainable=False)
class Batch_simple_SSK(Kernel): """ with hyperparameters: 1) match_decay float decrease the contribution of long subsequences 3) max_subsequence_length int largest subsequence considered """ def __init__(self,active_dims=[0],decay=0.1,max_subsequence_length=3, alphabet = [], maxlen=0, batch_size=100): super().__init__(active_dims=active_dims) # constrain decay kernel params to between 0 and 1 self.logistic = tfb.Chain([tfb.Shift(tf.cast(0,tf.float64))(tfb.Scale(tf.cast(1,tf.float64))),tfb.Sigmoid()]) self.decay_param= Parameter(decay, transform=self.logistic ,name="decay") # use will use copies of the kernel params to stop building expensive computation graph # we instead efficientely calculate gradients using dynamic programming # These params are updated at every call to K and K_diag (to check if parameters have been updated) self.decay = self.decay_param.numpy() self.decay_unconstrained = self.decay_param.unconstrained_variable.numpy() self.order_coefs=tf.ones(max_subsequence_length,dtype=tf.float64) # store additional kernel parameters self.max_subsequence_length = tf.constant(max_subsequence_length) self.alphabet = tf.constant(alphabet) self.alphabet_size=tf.shape(self.alphabet)[0] self.maxlen = tf.constant(maxlen) self.batch_size = tf.constant(batch_size) # build a lookup table of the alphabet to encode input strings self.table = tf.lookup.StaticHashTable( initializer=tf.lookup.KeyValueTensorInitializer( keys=tf.constant(["PAD"]+alphabet), values=tf.constant(range(0,len(alphabet)+1)),),default_value=0) # initialize helful construction matricies to be lazily computed once needed self.D = None self.dD_dgap = None def K_diag(self, X): r""" The diagonal elements of the string kernel are always unity (due to normalisation) """ return tf.ones(tf.shape(X)[:-1],dtype=tf.float64) def K(self, X1, X2=None): r""" Vectorized kernel calc. Following notation from Beck (2017), i.e have tensors S,D,Kpp,Kp Input is two tensors of shape (# strings , # characters) and we calc the pair-wise kernel calcs between the elements (i.e n kern calcs for two lists of length n) D is the tensor than unrolls the recursion and allows vecotrizaiton """ # Turn our inputs into lists of integers using one-hot embedding # first split up strings and pad to fixed length and prep for gpu # pad until all have length of self.maxlen # turn into one-hot i.e. 
shape (# strings, #characters+1, alphabet size) X1 = tf.strings.split(tf.squeeze(X1,1)).to_tensor("PAD",shape=[None,self.maxlen]) X1 = self.table.lookup(X1) # keep track of original input sizes X1_shape = tf.shape(X1)[0] X1 = tf.one_hot(X1,self.alphabet_size+1,dtype=tf.float64) if X2 is None: X2 = X1 X2_shape = X1_shape self.symmetric = True else: self.symmetric = False X2 = tf.strings.split(tf.squeeze(X2,1)).to_tensor("PAD",shape=[None,self.maxlen]) X2 = self.table.lookup(X2) X2_shape = tf.shape(X2)[0] X2 = tf.one_hot(X2,self.alphabet_size+1,dtype=tf.float64) # prep the decay tensors self._precalc() # combine all target strings and remove the ones in the first column that encode the padding (i.e we dont want them to count as a match) X_full = tf.concat([X1,X2],0)[:,:,1:] # get indicies of all possible pairings from X and X2 # this way allows maximum number of kernel calcs to be squished onto the GPU (rather than just doing individual rows of gram) indicies_2, indicies_1 = tf.meshgrid(tf.range(0, X1_shape ),tf.range(X1_shape , tf.shape(X_full)[0])) indicies = tf.concat([tf.reshape(indicies_1,(-1,1)),tf.reshape(indicies_2,(-1,1))],axis=1) if self.symmetric: # if symmetric then only calc upper matrix (fill in rest later) indicies = tf.boolean_mask(indicies,tf.greater_equal(indicies[:,1]+ X1_shape ,indicies[:,0])) else: # if not symmetric need to calculate some extra kernel evals for the normalization later on indicies = tf.concat([indicies,tf.tile(tf.expand_dims(tf.range(tf.shape(X_full)[0]),1),(1,2))],0) # make kernel calcs in batches num_batches = tf.cast(tf.math.ceil(tf.shape(indicies)[0]/self.batch_size),dtype=tf.int32) k_split = tf.TensorArray(tf.float64, size=num_batches,clear_after_read=False,infer_shape=False) # iterate through batches for j in tf.range(num_batches): # collect strings for this batch indicies_batch = indicies[self.batch_size*j:self.batch_size*(j+1)] X_batch = tf.gather(X_full,indicies_batch[:,0],axis=0) X2_batch = tf.gather(X_full,indicies_batch[:,1],axis=0) # Make S: the similarity tensor of shape (# strings, #characters, # characters) #S = tf.matmul( tf.matmul(X_batch,self.sim),tf.transpose(X2_batch,perm=(0,2,1))) S = tf.matmul(X_batch,tf.transpose(X2_batch,perm=(0,2,1))) # collect results for the batch result = self.kernel_calc(S) k_split = k_split.write(j,result) # combine batch results k = tf.expand_dims(k_split.concat(),1) k_split.close() # put results into the right places in the gram matrix and normalize if self.symmetric: # if symmetric then only put in top triangle (inc diag) mask = tf.linalg.band_part(tf.ones((X1_shape,X2_shape),dtype=tf.int64), 0, -1) non_zero = tf.not_equal(mask, tf.constant(0, dtype=tf.int64)) # Extracting the indices of upper triangle elements indices = tf.where(non_zero) out = tf.SparseTensor(indices,tf.squeeze(k),dense_shape=tf.cast((X1_shape,X2_shape),dtype=tf.int64)) k_results = tf.sparse.to_dense(out) # add in mising elements (lower diagonal) k_results = k_results + tf.linalg.set_diag(tf.transpose(k_results),tf.zeros(X1_shape,dtype=tf.float64)) # normalise X_diag_Ks = tf.linalg.diag_part(k_results) norm = tf.tensordot(X_diag_Ks, X_diag_Ks,axes=0) k_results = tf.divide(k_results, tf.sqrt(norm)) else: # otherwise can just reshape into gram matrix # but first take extra kernel calcs off end of k and use them to normalise X_diag_Ks = tf.reshape(k[X1_shape*X2_shape:X1_shape*X2_shape+X1_shape],(-1,)) X2_diag_Ks = tf.reshape(k[-X2_shape:],(-1,)) k = k[0:X1_shape*X2_shape] k_results = tf.transpose(tf.reshape(k,[X2_shape,X1_shape])) # 
normalise norm = tf.tensordot(X_diag_Ks, X2_diag_Ks,axes=0) k_results = tf.divide(k_results, tf.sqrt(norm)) return k_results def _precalc(self): r""" Update stored kernel params (incase they have changed) and precalc D and dD_dgap as required for kernel calcs following notation from Beck (2017) """ self.decay = self.decay_param.numpy() self.decay_unconstrained = self.decay_param.unconstrained_variable.numpy() tril = tf.linalg.band_part(tf.ones((self.maxlen,self.maxlen),dtype=tf.float64), -1, 0) # get upper triangle matrix of increasing intergers values = tf.TensorArray(tf.int32, size= self.maxlen) for i in tf.range(self.maxlen): values = values.write(i,tf.range(-i-1,self.maxlen-1-i)) power = tf.cast(values.stack(),tf.float64) values.close() power = tf.linalg.band_part(power, 0, -1) - tf.linalg.band_part(power, 0, 0) + tril tril = tf.transpose(tf.linalg.band_part(tf.ones((self.maxlen,self.maxlen),dtype=tf.float64), -1, 0))-tf.eye(self.maxlen,dtype=tf.float64) gaps = tf.fill([self.maxlen, self.maxlen],self.decay) self.D = tf.pow(gaps*tril, power) self.dD_dgap = tf.pow((tril * gaps), (power - 1.0)) * tril * power @tf.custom_gradient def kernel_calc(self,S): # fake computations to ensure we take the custom gradients for these two params a = tf.square(self.decay_param) if self.symmetric: k, dk_dgap = tf.stop_gradient(self.kernel_calc_with_grads(S)) else: k = tf.stop_gradient(self.kernel_calc_without_grads(S)) def grad(dy, variables=None): # get gradients of unconstrained params grads= {} if self.symmetric: grads['decay:0'] = tf.reduce_sum(tf.multiply(dy,dk_dgap*tf.math.exp(self.logistic.forward_log_det_jacobian(self.decay_unconstrained,0)))) gradient = [grads[v.name] for v in variables] else: gradient = [None for v in variables] return ((None),gradient) return k, grad def kernel_calc_without_grads(self,S): # store squared match coef for easier calc later match_sq = tf.square(self.decay) # calc subkernels for each subsequence length (See Moss et al. 2020 for notation) Kp = tf.TensorArray(tf.float64,size=self.max_subsequence_length,clear_after_read=False) # fill in first entries Kp = Kp.write(0, tf.ones(shape=tf.stack([tf.shape(S)[0], self.maxlen,self.maxlen]), dtype=tf.float64)) # calculate dynamic programs for i in tf.range(self.max_subsequence_length-1): Kp_temp = tf.multiply(S, Kp.read(i)) Kp_temp0 = match_sq * Kp_temp Kp_temp1 = tf.matmul(Kp_temp0,self.D) Kp_temp2 = tf.matmul(self.D,Kp_temp1,transpose_a=True) Kp = Kp.write(i+1,Kp_temp2) # Final calculation. We gather all Kps Kp_stacked = Kp.stack() Kp.close() # combine and get overall kernel aux = tf.multiply(S, Kp_stacked) aux = tf.reduce_sum(aux, -1) sum2 = tf.reduce_sum(aux, -1) Ki = sum2 * match_sq k = tf.linalg.matvec(tf.transpose(Ki),self.order_coefs) return k def kernel_calc_with_grads(self,S): # store squared match coef for easier calc later match_sq = tf.square(self.decay) # calc subkernels for each subsequence length (See Moss et al. 
2020 for notation) Kp = tf.TensorArray(tf.float64,size=self.max_subsequence_length,clear_after_read=False) dKp_dgap = tf.TensorArray(tf.float64, size=self.max_subsequence_length, clear_after_read=False) # fill in first entries Kp = Kp.write(0, tf.ones(shape=tf.stack([tf.shape(S)[0], self.maxlen,self.maxlen]), dtype=tf.float64)) dKp_dgap = dKp_dgap.write(0, tf.zeros(shape=tf.stack([tf.shape(S)[0], self.maxlen,self.maxlen]), dtype=tf.float64)) # calculate dynamic programs for i in tf.range(self.max_subsequence_length-1): Kp_temp = tf.multiply(S, Kp.read(i)) Kp_temp0 = match_sq * Kp_temp Kp_temp1 = tf.matmul(Kp_temp0,self.D) Kp_temp2 = tf.matmul(self.D,Kp_temp1,transpose_a=True) Kp = Kp.write(i+1,Kp_temp2) dKp_dgap_temp_1 = tf.matmul(self.dD_dgap,Kp_temp1,transpose_a=True) dKp_dgap_temp_2 = tf.multiply(S, dKp_dgap.read(i)) dKp_dgap_temp_2 = dKp_dgap_temp_2 * match_sq dKp_dgap_temp_2 = tf.matmul(dKp_dgap_temp_2,self.D) dKp_dgap_temp_2 = dKp_dgap_temp_2 + tf.matmul(Kp_temp0,self.dD_dgap) dKp_dgap_temp_2 = tf.matmul(self.D,dKp_dgap_temp_2,transpose_a=True) dKp_dgap = dKp_dgap.write(i+1,dKp_dgap_temp_1 + dKp_dgap_temp_2) # Final calculation. We gather all Kps Kp_stacked = Kp.stack() Kp.close() dKp_dgap_stacked = dKp_dgap.stack() dKp_dgap.close() # combine and get overall kernel # get k aux = tf.multiply(S, Kp_stacked) aux = tf.reduce_sum(aux, -1) sum2 = tf.reduce_sum(aux, -1) Ki = sum2 * match_sq k = tf.linalg.matvec(tf.transpose(Ki),self.order_coefs) # get gap decay grads temp = tf.multiply(S, dKp_dgap_stacked) temp = tf.reduce_sum(temp, -1) temp = tf.reduce_sum(temp, -1) temp = temp * match_sq dk_dgap = tf.linalg.matvec(tf.transpose(temp),self.order_coefs) return k, dk_dgap
def __init__( self, kernel: MultioutputKernel, inducing_variable: MultioutputInducingVariables, num_data: int, mean_function: Optional[MeanFunction] = None, *, num_samples: Optional[int] = None, full_cov: bool = False, full_output_cov: bool = False, num_latent_gps: int = None, whiten: bool = True, name: Optional[str] = None, verbose: bool = False, ): """ :param kernel: The multioutput kernel for this layer. :param inducing_variable: The inducing features for this layer. :param num_data: The number of points in the training dataset (see :attr:`num_data`). :param mean_function: The mean function that will be applied to the inputs. Default: :class:`~gpflow.mean_functions.Identity`. .. note:: The Identity mean function requires the input and output dimensionality of this layer to be the same. If you want to change the dimensionality in a layer, you may want to provide a :class:`~gpflow.mean_functions.Linear` mean function instead. :param num_samples: The number of samples to draw when converting the :class:`~tfp.layers.DistributionLambda` into a `tf.Tensor`, see :meth:`_convert_to_tensor_fn`. Will be stored in the :attr:`num_samples` attribute. If `None` (the default), draw a single sample without prefixing the sample shape (see :class:`tfp.distributions.Distribution`'s `sample() <https://www.tensorflow.org/probability/api_docs/python/tfp/distributions/Distribution#sample>`_ method). :param full_cov: Sets default behaviour of calling this layer (:attr:`full_cov` attribute): If `False` (the default), only predict marginals (diagonal of covariance) with respect to inputs. If `True`, predict full covariance over inputs. :param full_output_cov: Sets default behaviour of calling this layer (:attr:`full_output_cov` attribute): If `False` (the default), only predict marginals (diagonal of covariance) with respect to outputs. If `True`, predict full covariance over outputs. :param num_latent_gps: The number of (latent) GPs in the layer (which can be different from the number of outputs, e.g. with a :class:`~gpflow.kernels.LinearCoregionalization` kernel). This is used to determine the size of the variational parameters :attr:`q_mu` and :attr:`q_sqrt`. If possible, it is inferred from the *kernel* and *inducing_variable*. :param whiten: If `True` (the default), uses the whitened parameterisation of the inducing variables; see :attr:`whiten`. :param name: The name of this layer. :param verbose: The verbosity mode. Set this parameter to `True` to show debug information. """ super().__init__( make_distribution_fn=self._make_distribution_fn, convert_to_tensor_fn=self._convert_to_tensor_fn, dtype=default_float(), name=name, ) self.kernel = kernel self.inducing_variable = inducing_variable self.num_data = num_data if mean_function is None: mean_function = Identity() self.mean_function = mean_function self.full_output_cov = full_output_cov self.full_cov = full_cov self.whiten = whiten self.verbose = verbose try: num_inducing, self.num_latent_gps = verify_compatibility( kernel, mean_function, inducing_variable) # TODO: if num_latent_gps is not None, verify it is equal to self.num_latent_gps except GPLayerIncompatibilityException as e: if num_latent_gps is None: raise e if self.verbose: warnings.warn( "Could not verify the compatibility of the `kernel`, `inducing_variable` " "and `mean_function`. We advise using `gpflux.helpers.construct_*` to create " "compatible kernels and inducing variables. 
As " f"`num_latent_gps={num_latent_gps}` has been specified explicitly, this will " "be used to create the `q_mu` and `q_sqrt` parameters.") num_inducing, self.num_latent_gps = ( len(inducing_variable), num_latent_gps, ) self.q_mu = Parameter( np.zeros((num_inducing, self.num_latent_gps)), dtype=default_float(), name=f"{self.name}_q_mu" if self.name else "q_mu", ) # [num_inducing, num_latent_gps] self.q_sqrt = Parameter( np.stack( [np.eye(num_inducing) for _ in range(self.num_latent_gps)]), transform=triangular(), dtype=default_float(), name=f"{self.name}_q_sqrt" if self.name else "q_sqrt", ) # [num_latent_gps, num_inducing, num_inducing] self.num_samples = num_samples
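# Shape sketch for the variational parameters initialised above (sizes are
# illustrative; M inducing points, L latent GPs): q_mu is [M, L] and q_sqrt
# stacks one identity-initialised lower-triangular [M, M] factor per latent GP.
import numpy as np

num_inducing, num_latent_gps = 7, 2
q_mu_init = np.zeros((num_inducing, num_latent_gps))                           # [M, L]
q_sqrt_init = np.stack([np.eye(num_inducing) for _ in range(num_latent_gps)])  # [L, M, M]
assert q_sqrt_init.shape == (num_latent_gps, num_inducing, num_inducing)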
def __init__(
    self,
    inducing_variable: gpflow.inducing_variables.InducingVariables,
    kernel: gpflow.kernels.Kernel,
    domain: np.ndarray,
    q_mu: np.ndarray,
    q_S: np.ndarray,
    *,
    beta0: float = 1e-6,
    num_observations: int = 1,
    num_events: Optional[int] = None,
):
    """
    D = number of dimensions
    M = size of inducing variables (number of inducing points)

    :param inducing_variable: inducing variables (here only implemented for a
        gpflow.inducing_variables.InducingPoints instance, with Z of shape M x D)
    :param kernel: the kernel (here only implemented for a
        gpflow.kernels.SquaredExponential instance)
    :param domain: lower and upper bounds of (hyper-rectangular) domain (D x 2)
    :param q_mu: initial mean vector of the variational distribution q(u) (length M)
    :param q_S: how to initialise the covariance matrix of the variational distribution q(u) (M x M)
    :param beta0: a constant offset, corresponding to initial value of the prior mean
        of the GP (but trainable); should be sufficiently large so that the GP does not
        go negative...
    :param num_observations: number of observations of sets of events under the
        distribution
    :param num_events: total number of events, defaults to events.shape[0] (relevant
        when feeding in minibatches)
    """
    super().__init__(kernel, likelihood=None)  # custom likelihood

    # observation domain (D x 2)
    self.domain = domain
    if domain.ndim != 2 or domain.shape[1] != 2:
        raise ValueError("domain must be of shape D x 2")

    self.num_observations = num_observations
    self.num_events = num_events

    if not (isinstance(kernel, gpflow.kernels.SquaredExponential)
            and isinstance(inducing_variable, gpflow.inducing_variables.InducingPoints)):
        raise NotImplementedError(
            "This VBPP implementation can only handle real-space "
            "inducing points together with the SquaredExponential "
            "kernel.")
    self.kernel = kernel
    self.inducing_variable = inducing_variable

    self.beta0 = Parameter(beta0, transform=positive(), name="beta0")  # constant mean offset

    # variational approximate Gaussian posterior q(u) = N(u; m, S)
    self.q_mu = Parameter(q_mu, name="q_mu")  # mean vector (length M)

    # covariance:
    L = np.linalg.cholesky(q_S)  # S = L L^T, with L lower-triangular (M x M)
    self.q_sqrt = Parameter(L, transform=triangular(), name="q_sqrt")

    self.psi_jitter = 0.0
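# Construction sketch for the q(u) parameters expected above (M is illustrative;
# any positive-definite q_S works): the model wraps the lower Cholesky factor of
# q_S in a triangular-constrained Parameter.
import numpy as np

M = 5
q_mu_init = np.zeros(M)
q_S_init = np.eye(M)                    # positive-definite initial covariance
L_init = np.linalg.cholesky(q_S_init)   # lower triangular, as used for q_sqrt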
class BayesianDenseLayer(TrackableLayer): """ A dense (fully-connected) layer for variational Bayesian neural networks. This layer holds the mean and square-root of the variance of the distribution over the weights. This layer also has a temperature for cooling (or heating) the posterior. """ def __init__( self, input_dim: int, output_dim: int, num_data: int, w_mu: Optional[np.ndarray] = None, w_sqrt: Optional[np.ndarray] = None, activation: Optional[Callable] = None, is_mean_field: bool = True, temperature: float = 1e-4, # TODO is this intentional? ): """ :param input_dim: The input dimension (excluding bias) of this layer. :param output_dim: The output dimension of this layer. :param num_data: The number of points in the training dataset (used for scaling the KL regulariser). :param w_mu: Initial value of the variational mean for weights + bias. If not specified, this defaults to `xavier_initialization_numpy` for the weights and zero for the bias. :param w_sqrt: Initial value of the variational Cholesky of the (co)variance for weights + bias. If not specified, this defaults to 1e-5 * Identity. :param activation: The activation function. If not specified, this defaults to the identity. :param is_mean_field: Determines whether the approximation to the weight posterior is mean field. Must be consistent with the shape of ``w_sqrt``, if specified. :param temperature: For cooling (< 1.0) or heating (> 1.0) the posterior. """ super().__init__(dtype=default_float()) assert input_dim >= 1 assert output_dim >= 1 assert num_data >= 1 if w_mu is not None: # add + 1 for the bias assert w_mu.shape == ((input_dim + 1) * output_dim, ) if w_sqrt is not None: if not is_mean_field: assert w_sqrt.shape == ( (input_dim + 1) * output_dim, (input_dim + 1) * output_dim, ) else: assert w_sqrt.shape == ((input_dim + 1) * output_dim, ) assert temperature > 0.0 self.input_dim = input_dim self.output_dim = output_dim self.num_data = num_data self.w_mu_ini = w_mu self.w_sqrt_ini = w_sqrt self.activation = activation self.is_mean_field = is_mean_field self.temperature = temperature self.dim = (input_dim + 1) * output_dim self.full_output_cov = False self.full_cov = False self.w_mu = Parameter(np.zeros((self.dim, )), dtype=default_float(), name="w_mu") # [dim] self.w_sqrt = Parameter( np.zeros( (self.dim, self.dim)) if not self.is_mean_field else np.ones( (self.dim, )), transform=triangular() if not self.is_mean_field else positive(), dtype=default_float(), name="w_sqrt", ) # [dim, dim] or [dim] def initialize_variational_distribution(self) -> None: if self.w_mu_ini is None: w = xavier_initialization_numpy(self.input_dim, self.output_dim) b = np.zeros((1, self.output_dim)) self.w_mu_ini = np.concatenate((w, b), axis=0).reshape( (self.dim, )) self.w_mu.assign(self.w_mu_ini) if self.w_sqrt_ini is None: if not self.is_mean_field: self.w_sqrt_ini = 1e-5 * np.eye(self.dim) else: self.w_sqrt_ini = 1e-5 * np.ones((self.dim, )) self.w_sqrt.assign(self.w_sqrt_ini) def build(self, input_shape: ShapeType) -> None: """Build the variables necessary on first call""" super().build(input_shape) self.initialize_variational_distribution() def predict_samples( self, inputs: TensorType, *, num_samples: Optional[int] = None, ) -> tf.Tensor: """ Samples from the approximate posterior at N test inputs, with input_dim = D, output_dim = Q. :param inputs: The inputs to predict at; shape ``[N, D]``. :param num_samples: The number of samples S, to draw. :returns: Samples, shape ``[S, N, Q]`` if S is not None else ``[N, Q]``. 
""" _num_samples = num_samples or 1 z = tf.random.normal((self.dim, _num_samples), dtype=default_float()) # [dim, S] if not self.is_mean_field: w = self.w_mu[:, None] + tf.matmul(self.w_sqrt, z) # [dim, S] else: w = self.w_mu[:, None] + self.w_sqrt[:, None] * z # [dim, S] N = tf.shape(inputs)[0] inputs_concat_1 = tf.concat( (inputs, tf.ones( (N, 1), dtype=default_float())), axis=-1) # [N, D+1] samples = tf.tensordot( inputs_concat_1, tf.reshape(tf.transpose(w), (_num_samples, self.input_dim + 1, self.output_dim)), [[-1], [1]], ) # [N, S, Q] if num_samples is None: samples = tf.squeeze(samples, axis=-2) # [N, Q] else: samples = tf.transpose(samples, perm=[1, 0, 2]) # [S, N, Q] if self.activation is not None: samples = self.activation(samples) return samples def call( self, inputs: TensorType, training: Optional[bool] = False ) -> Union[tf.Tensor, MeanAndVariance]: """ The default behaviour upon calling this layer. """ sample = self.predict_samples( inputs, num_samples=None, ) # TF quirk: add_loss must add a tensor to compile if training: # Wenzel et al. 2020: How good is the Bayes posterior in DNNs really? loss = self.temperature * self.prior_kl() else: loss = tf.constant(0.0, dtype=default_float()) loss_per_datapoint = loss / self.num_data self.add_loss(loss_per_datapoint) return sample # [N, Q] def prior_kl(self) -> tf.Tensor: """ Returns the KL divergence ``KL[q(u)∥p(u)]`` from the prior ``p(u) = N(0, I)`` to the variational distribution ``q(u) = N(w_mu, w_sqrt²)``. """ return gauss_kl( self.w_mu[:, None], self.w_sqrt[None] if not self.is_mean_field else self.w_sqrt[:, None], )
def mu(): return Parameter(rng.randn(Nn, Ln))
def __init__(self): super().__init__() self.var = tf.Variable(0.0) self.param = Parameter(0.0)
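Assuming this __init__ belongs to a gpflow.Module subclass, both the raw tf.Variable and the Parameter's underlying unconstrained variable are picked up as trainable variables. A minimal sketch (the Dummy class name is hypothetical, used only to make the snippet runnable):

import tensorflow as tf
import gpflow
from gpflow import Parameter


class Dummy(gpflow.Module):  # hypothetical container, purely for illustration
    def __init__(self):
        super().__init__()
        self.var = tf.Variable(0.0)
        self.param = Parameter(0.0)


m = Dummy()
# gpflow.Module is a tf.Module, so both the plain variable and the Parameter's
# underlying unconstrained variable are tracked as trainable variables.
print(len(m.trainable_variables))  # 2
print(m.var.numpy(), m.param.numpy())  # 0.0 0.0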
def __init__(self, state_dim, control_dim, max_action=1.0): gpflow.Module.__init__(self) self.W = Parameter(np.random.rand(control_dim, state_dim)) self.b = Parameter(np.random.rand(1, control_dim)) self.max_action = max_action
def __init__( self, input_dim: int, output_dim: int, num_data: int, w_mu: Optional[np.ndarray] = None, w_sqrt: Optional[np.ndarray] = None, activation: Optional[Callable] = None, is_mean_field: bool = True, temperature: float = 1e-4, returns_samples: bool = True, ): """ A Bayesian dense layer for variational Bayesian neural networks. This layer holds the mean and square-root of the variance of the distribution over the weights, as well as a temperature for cooling (or heating) the posterior. :param input_dim: The layer's input dimension (excluding bias) :param output_dim: The layer's output dimension :param num_data: The number of points in the training dataset (used for scaling the KL regulariser) :param w_mu: Initial value of the variational mean (weights + bias) :param w_sqrt: Initial value of the variational Cholesky (covering weights + bias) :param activation: The activation function; `None` means a linear (identity) activation :param is_mean_field: Determines whether the approximation to the weight posterior is mean field; must be consistent with the shape of ``w_sqrt``, if specified :param temperature: For cooling (< 1.0) or heating (> 1.0) the posterior :param returns_samples: If `True`, calling the layer returns samples; otherwise it returns the mean and variance """ super().__init__(dtype=default_float()) assert input_dim >= 1 assert output_dim >= 1 assert num_data >= 1 if w_mu is not None: # add + 1 for the bias assert w_mu.shape == ((input_dim + 1) * output_dim,) if w_sqrt is not None: if not is_mean_field: assert w_sqrt.shape == ( (input_dim + 1) * output_dim, (input_dim + 1) * output_dim, ) else: assert w_sqrt.shape == ((input_dim + 1) * output_dim,) assert temperature > 0.0 self.input_dim = input_dim self.output_dim = output_dim self.num_data = num_data self.w_mu_ini = w_mu self.w_sqrt_ini = w_sqrt self.activation = activation self.is_mean_field = is_mean_field self.temperature = temperature self.returns_samples = returns_samples self.dim = (input_dim + 1) * output_dim self.full_output_cov = False self.full_cov = False self.w_mu = Parameter(np.zeros((self.dim,)), dtype=default_float(), name="w_mu") # [dim] self.w_sqrt = Parameter( np.zeros((self.dim, self.dim)) if not self.is_mean_field else np.ones((self.dim,)), transform=triangular() if not self.is_mean_field else positive(), dtype=default_float(), name="w_sqrt", ) # [dim, dim] or [dim]
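As a side-by-side comparison of the two w_sqrt parameterisations used in this constructor, the sketch below (with an illustrative dim = 6) shows the full-covariance variant constrained by triangular() next to the mean-field variant constrained by positive():

import numpy as np
from gpflow import Parameter, default_float
from gpflow.utilities import positive, triangular

dim = 6  # illustrative (input_dim + 1) * output_dim

# Full-covariance posterior: w_sqrt is a lower-triangular Cholesky factor.
w_sqrt_full = Parameter(
    np.zeros((dim, dim)),
    transform=triangular(),
    dtype=default_float(),
    name="w_sqrt",
)  # [dim, dim]

# Mean-field posterior: w_sqrt is a vector of per-weight scales, kept positive.
w_sqrt_mean_field = Parameter(
    np.ones((dim,)),
    transform=positive(),
    dtype=default_float(),
    name="w_sqrt",
)  # [dim]

print(w_sqrt_full.shape, w_sqrt_mean_field.shape)  # (6, 6) and (6,)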