def test_q_sqrt_constraints(inducing_points, kernel, mu, white):
    """Check that ``prior_kl`` treats a constrained and an unconstrained q_sqrt alike.

    Two parameters are built from the same lower-triangular matrix, one wrapped in
    the ``triangular()`` transform and one without a transform. The test asserts
    that both yield the same KL value and that, after one gradient step on their
    unconstrained variables, the two parameters still hold equal values.
    """

    lower_tri = np.tril(rng.randn(Ln, Nn, Nn))

    q_sqrt_constrained = Parameter(lower_tri, transform=triangular())
    q_sqrt_unconstrained = Parameter(lower_tri)

    # both parameters start from the same value
    assert_allclose((q_sqrt_constrained - q_sqrt_unconstrained).numpy(), 0)

    def kl_then_gradient_step(q_sqrt):
        # evaluate the KL, then move the unconstrained variable against its gradient
        with tf.GradientTape() as tape:
            kl_value = prior_kl(inducing_points, kernel, mu, q_sqrt, whiten=white)
        step = tape.gradient(kl_value, q_sqrt.unconstrained_variable)
        q_sqrt.unconstrained_variable.assign_sub(step)
        return kl_value

    kl_constrained = kl_then_gradient_step(q_sqrt_constrained)
    kl_unconstrained = kl_then_gradient_step(q_sqrt_unconstrained)

    # the KL values were recorded before each step was applied
    assert_allclose(kl_constrained - kl_unconstrained, 0)

    # the gradient step must have moved both parameters identically
    assert_allclose((q_sqrt_constrained - q_sqrt_unconstrained).numpy(), 0)
    def __init__(self, num_data: int, latent_dim: int, means: Optional[np.ndarray] = None):
        """
        Parameterise the latent-variable posterior of every datapoint directly, using
        a diagonal multivariate Normal distribution (a mean-field approximation across
        latent variables).

        See :cite:t:`dutordoir2018cde` for a more thorough explanation of
        latent variable models and encoders.

        :param num_data: The number of datapoints, ``N``.
        :param latent_dim: The dimensionality of the latent variable, ``W``.
        :param means: Initial means of the latent variable posterior. When `None`
            (the default) they are drawn as ``0.01 * np.random.randn(N, W)``;
            otherwise the array must have shape ``[N, W]``.
        :raises EncoderInitializationError: if ``means`` is given with a shape other
            than ``[num_data, latent_dim]``.
        """
        super().__init__()

        expected_shape = (num_data, latent_dim)
        if means is None:
            # small random values break the symmetry in the means
            means = 0.01 * np.random.randn(num_data, latent_dim)
        elif np.any(means.shape != expected_shape):
            raise EncoderInitializationError(
                f"means must have shape [num_data, latent_dim] = [{num_data}, {latent_dim}]; "
                f"got {means.shape} instead."
            )

        # a small initial standard deviation has been observed to help fitting
        initial_stds = 1e-5 * np.ones_like(means)

        # TODO: Rename to `scale` and `loc` to match tfp.distributions
        self.means = Parameter(means, dtype=default_float(), name="w_means")
        self.stds = Parameter(
            initial_stds, transform=positive(), dtype=default_float(), name="w_stds"
        )
Beispiel #3
0
    def __init__(self, active_dims=[0], decay=0.1, max_subsequence_length=3,
                 alphabet=[], maxlen=0, batch_size=100):
        """Create the decay parameter, static settings and the alphabet lookup table.

        :param active_dims: forwarded unchanged to the parent constructor.
        :param decay: initial value of the ``decay`` parameter.
        :param max_subsequence_length: stored as a tensor constant; also the length
            of the all-ones ``order_coefs`` vector.
        :param alphabet: list of symbols; the lookup table maps "PAD" followed by
            these symbols to the integers ``0 .. len(alphabet)``.
        :param maxlen: stored as a tensor constant.
        :param batch_size: stored as a tensor constant.
        """
        super().__init__(active_dims=active_dims)

        # constrain the decay parameter to lie between 0 and 1 (sigmoid after an
        # affine map with shift 0 and scale 1)
        zero = tf.cast(0, tf.float64)
        one = tf.cast(1, tf.float64)
        identity_affine = tfb.Shift(zero)(tfb.Scale(one))
        self.logistic = tfb.Chain([identity_affine, tfb.Sigmoid()])
        self.decay_param = Parameter(decay, transform=self.logistic, name="decay")

        # Plain numpy copies of the parameter values. Per the original author's
        # note, these avoid building an expensive computation graph (gradients are
        # instead calculated efficiently with dynamic programming) and are
        # refreshed on every call to K and K_diag.
        self.decay = self.decay_param.numpy()
        self.decay_unconstrained = self.decay_param.unconstrained_variable.numpy()

        self.order_coefs = tf.ones(max_subsequence_length, dtype=tf.float64)

        # store additional kernel parameters
        self.max_subsequence_length = tf.constant(max_subsequence_length)
        self.alphabet = tf.constant(alphabet)
        self.alphabet_size = tf.shape(self.alphabet)[0]
        self.maxlen = tf.constant(maxlen)
        self.batch_size = tf.constant(batch_size)

        # lookup table used to encode input strings; unknown keys map to 0,
        # the same id as "PAD"
        symbol_keys = tf.constant(["PAD"] + alphabet)
        symbol_ids = tf.constant(range(0, len(alphabet) + 1))
        self.table = tf.lookup.StaticHashTable(
            initializer=tf.lookup.KeyValueTensorInitializer(
                keys=symbol_keys, values=symbol_ids),
            default_value=0)

        # helper construction matrices, computed lazily once needed
        self.D = None
        self.dD_dgap = None
Beispiel #4
0
    def __init__(self, rank=1, active_dims=[0], gap_decay=0.1, match_decay=0.9, max_subsequence_length=3,
                 alphabet=[], maxlen=0):
        """Create the decay and similarity-matrix parameters and the alphabet lookup table.

        :param rank: number of columns of the ``W`` parameter.
        :param active_dims: forwarded unchanged to the parent constructor.
        :param gap_decay: initial value of the ``gap_decay`` parameter.
        :param match_decay: initial value of the ``match_decay`` parameter.
        :param max_subsequence_length: stored as a tensor constant.
        :param alphabet: list of symbols; sets the number of rows of ``W``, the
            length of ``kappa`` and the keys of the lookup table.
        :param maxlen: stored as a tensor constant.
        """
        super().__init__(active_dims=active_dims)

        def unit_interval_transform():
            # Sigmoid after an affine map with shift 0 and scale 1, i.e. values are
            # constrained to lie between 0 and 1. Both decay parameters use the
            # Shift/Scale composition; the deprecated tfb.AffineScalar that the
            # match transform used before is no longer needed.
            zero = tf.cast(0, tf.float64)
            one = tf.cast(1, tf.float64)
            return tfb.Chain([tfb.Shift(zero)(tfb.Scale(one)), tfb.Sigmoid()])

        # constrain decay kernel params to between 0 and 1
        logistic_gap = unit_interval_transform()
        logistic_match = unit_interval_transform()
        self.gap_decay = Parameter(gap_decay, transform=logistic_gap, name="gap_decay")
        self.match_decay = Parameter(match_decay, transform=logistic_match, name="match_decay")

        # prepare similarity matrix parameters
        self.rank = rank
        W = 0.1 * tf.ones((len(alphabet), self.rank))
        kappa = tf.ones(len(alphabet))

        self.W = Parameter(W, name="W")
        self.kappa = Parameter(kappa, transform=positive(), name="kappa")

        # store additional kernel parameters
        self.max_subsequence_length = tf.constant(max_subsequence_length)
        self.alphabet = tf.constant(alphabet)
        self.alphabet_size = tf.shape(self.alphabet)[0]
        self.maxlen = tf.constant(maxlen)

        # lookup table used to encode input strings; unknown keys map to 0,
        # the same id as "PAD"
        self.table = tf.lookup.StaticHashTable(
            initializer=tf.lookup.KeyValueTensorInitializer(
                keys=tf.constant(["PAD"] + alphabet),
                values=tf.constant(range(0, len(alphabet) + 1)),),
            default_value=0)
class LinearController(gpflow.Module):
    """Affine controller with a weight matrix ``W`` and a bias row ``b``."""

    def __init__(self, state_dim, control_dim, max_action=1.0):
        """
        :param state_dim: number of columns of ``W``.
        :param control_dim: number of rows of ``W`` and number of columns of ``b``.
        :param max_action: passed to ``squash_sin`` when the action is squashed.
        """
        gpflow.Module.__init__(self)
        # both parameters start from uniform samples in [0, 1)
        self.W = Parameter(np.random.rand(control_dim, state_dim))
        self.b = Parameter(np.random.rand(1, control_dim))
        self.max_action = max_action

    def compute_action(self, m, s, squash=True):
        '''
        Affine action:  M = m W^T + b,  S = W s W^T,  V = W^T
        IN: mean (m) and variance (s) of the state
        OUT: mean (M) and variance (S) of the action, and the input-output
             covariance term (V); when `squash` is true all three are passed
             through `squash_sin` (V is right-multiplied by its third output)
        '''
        weights_t = tf.transpose(self.W)
        action_mean = m @ weights_t + self.b  # mean output
        action_var = self.W @ s @ weights_t  # output variance
        in_out_cov = weights_t  # input output covariance
        if not squash:
            return action_mean, action_var, in_out_cov

        action_mean, action_var, squash_cov = squash_sin(
            action_mean, action_var, self.max_action)
        return action_mean, action_var, in_out_cov @ squash_cov

    def randomize(self):
        """Overwrite ``W`` and ``b`` with fresh normal samples (mean 0, sigma 1)."""
        loc = 0
        scale = 1
        self.W.assign(loc + scale * np.random.normal(size=self.W.shape))
        self.b.assign(loc + scale * np.random.normal(size=self.b.shape))
Beispiel #6
0
    def __init__(self,
                 active_dims=[0],
                 gap_decay=0.1,
                 match_decay=0.9,
                 max_subsequence_length=3,
                 max_occurence_length=10,
                 alphabet=[],
                 maxlen=0,
                 normalize=True,
                 batch_size=1000):
        """Create the decay parameters, plain settings and the alphabet lookup table.

        :param active_dims: forwarded unchanged to the parent constructor.
        :param gap_decay: initial value of the ``gap_decay`` parameter.
        :param match_decay: initial value of the ``match_decay`` parameter.
        :param max_subsequence_length: stored as given.
        :param max_occurence_length: stored as given.
        :param alphabet: list of symbols; stored as given and used as the keys
            (after "PAD") of the lookup table.
        :param maxlen: stored as given.
        :param normalize: stored as given.
        :param batch_size: stored as given.
        """
        super().__init__(active_dims=active_dims)

        # constrain kernel params to between 0 and 1 (sigmoid after an affine map
        # with shift 0 and scale 1)
        # NOTE(review): tfb.AffineScalar is deprecated in newer TensorFlow
        # Probability releases -- confirm the pinned version still provides it.
        zero = tf.cast(0, tf.float64)
        one = tf.cast(1, tf.float64)
        self.logistic_gap = tfb.Chain(
            [tfb.AffineScalar(shift=zero, scale=one), tfb.Sigmoid()])
        self.logisitc_match = tfb.Chain(
            [tfb.AffineScalar(shift=zero, scale=one), tfb.Sigmoid()])

        self.gap_decay_param = Parameter(
            gap_decay, transform=self.logistic_gap, name="gap_decay")
        self.match_decay_param = Parameter(
            match_decay, transform=self.logisitc_match, name="match_decay")

        # plain (non-tensor) settings
        self.max_subsequence_length = max_subsequence_length
        self.max_occurence_length = max_occurence_length
        self.alphabet = alphabet
        self.maxlen = maxlen
        self.normalize = normalize
        self.batch_size = batch_size
        self.symmetric = False

        # Plain numpy copies of the parameter values. Per the original author's
        # note, these avoid building an expensive computation graph (gradients are
        # instead calculated efficiently with dynamic programming) and are
        # refreshed on every call to K and K_diag.
        match_param = self.match_decay_param
        gap_param = self.gap_decay_param
        self.match_decay = match_param.numpy()
        self.gap_decay = gap_param.numpy()
        self.match_decay_unconstrained = match_param.unconstrained_variable.numpy()
        self.gap_decay_unconstrained = gap_param.unconstrained_variable.numpy()

        # helper construction matrices, computed lazily once needed
        self.D = None
        self.dD_dgap = None

        # lookup table used to encode input strings; unknown keys map to 0,
        # the same id as "PAD"
        symbol_keys = tf.constant(["PAD"] + alphabet)
        symbol_ids = tf.constant(range(0, len(alphabet) + 1))
        self.table = tf.lookup.StaticHashTable(
            initializer=tf.lookup.KeyValueTensorInitializer(
                keys=symbol_keys,
                values=symbol_ids,
            ),
            default_value=0)
 def __init__(self, state_dim, rewards=None, coefs=None):
     """Store the base rewards and their (non-trainable) combination coefficients.

     :param state_dim: stored as ``self.state_dim``.
     :param rewards: collection of base rewards; defaults to a new empty list.
     :param coefs: initial value for the ``coefs`` parameter. When omitted, a
         vector of ones with one entry per base reward is used.
     """
     self.state_dim = state_dim
     # Use a None sentinel instead of a mutable ``[]`` default: the list is kept
     # on the instance, so a shared default would leak between instances.
     self.base_rewards = [] if rewards is None else rewards
     if coefs is not None:
         self.coefs = Parameter(coefs, trainable=False)
     else:
         self.coefs = Parameter(np.ones(len(self.base_rewards)),
                                dtype=float_type,
                                trainable=False)
 def __init__(self, state_dim, W=None, t=None):
     """Store ``W`` and ``t`` as non-trainable parameters.

     :param state_dim: stored as ``self.state_dim``; fixes the parameter shapes.
     :param W: optional values reshaped to ``(state_dim, state_dim)``; the
         identity matrix is used when omitted.
     :param t: optional values reshaped to ``(1, state_dim)``; a row of zeros
         is used when omitted.
     """
     self.state_dim = state_dim

     weight_init = (np.eye(state_dim) if W is None
                    else np.reshape(W, (state_dim, state_dim)))
     self.W = Parameter(weight_init, trainable=False)

     target_init = (np.zeros((1, state_dim)) if t is None
                    else np.reshape(t, (1, state_dim)))
     self.t = Parameter(target_init, trainable=False)
Beispiel #9
0
    def __init__(self,
                 m=1,
                 active_dims=[0],
                 gap_decay=0.1,
                 match_decay=0.9,
                 max_subsequence_length=3,
                 alphabet=[],
                 maxlen=0):
        """Create the decay, order-coefficient and split-weight parameters and the lookup table.

        :param m: stored as ``self.m``; ``2 * m - 1`` split weights are created and
            ``self.maxlen`` is set to ``ceil(maxlen / m)``.
        :param active_dims: forwarded unchanged to the parent constructor.
        :param gap_decay: initial value of the ``gap_decay`` parameter.
        :param match_decay: initial value of the ``match_decay`` parameter.
        :param max_subsequence_length: stored as a tensor constant; also the number
            of order coefficients.
        :param alphabet: list of symbols used as keys (after "PAD") of the lookup
            table.
        :param maxlen: stored unchanged as ``self.full_maxlen``.
        """
        super().__init__(active_dims=active_dims)

        def unit_interval_transform():
            # Sigmoid after an affine map with shift 0 and scale 1, i.e. values are
            # constrained to lie between 0 and 1. Both decay parameters use the
            # Shift/Scale composition; the deprecated tfb.AffineScalar that the
            # match transform used before is no longer needed.
            zero = tf.cast(0, tf.float64)
            one = tf.cast(1, tf.float64)
            return tfb.Chain([tfb.Shift(zero)(tfb.Scale(one)), tfb.Sigmoid()])

        # constrain decay kernel params to between 0 and 1
        logistic_gap = unit_interval_transform()
        logistic_match = unit_interval_transform()
        self.gap_decay = Parameter(gap_decay,
                                   transform=logistic_gap,
                                   name="gap_decay")
        self.match_decay = Parameter(match_decay,
                                     transform=logistic_match,
                                     name="match_decay")

        # prepare order coefs params
        order_coefs = tf.ones(max_subsequence_length)
        self.order_coefs = Parameter(order_coefs,
                                     transform=positive(),
                                     name="order_coefs")

        # get split weights; the parameter carries its own name rather than the
        # copy-pasted "order_coefs", so the two parameters can be told apart
        self.m = m
        split_weights = tf.ones(2 * self.m - 1)
        self.split_weights = Parameter(split_weights,
                                       transform=positive(),
                                       name="split_weights")

        # store additional kernel parameters
        self.max_subsequence_length = tf.constant(max_subsequence_length)
        self.alphabet = tf.constant(alphabet)
        self.alphabet_size = tf.shape(self.alphabet)[0]
        self.maxlen = tf.cast(tf.math.ceil(maxlen / self.m), dtype=tf.int32)
        self.full_maxlen = tf.constant(maxlen)

        # lookup table used to encode input strings; unknown keys map to 0,
        # the same id as "PAD"
        self.table = tf.lookup.StaticHashTable(
            initializer=tf.lookup.KeyValueTensorInitializer(
                keys=tf.constant(["PAD"] + alphabet),
                values=tf.constant(range(0,
                                         len(alphabet) + 1)),
            ),
            default_value=0)
Beispiel #10
0
 def __init__(self, state_dim, weight_matrix, target_state):
     """
     Store the weight matrix and target state as non-trainable parameters.

     :param state_dim: dimensionality of observation.
     :param weight_matrix: numpy array giving the pre-defined weight matrix which sets the sensitivity of the reward
                           to the different dimensions; reshaped to (state_dim, state_dim).
     :param target_state: numpy array giving the target state; reshaped to (1, state_dim).
     """
     self.state_dim = state_dim

     square_weights = np.reshape(weight_matrix, (state_dim, state_dim))
     self.weight_matrix = Parameter(square_weights, trainable=False)

     target_row = np.reshape(target_state, (1, state_dim))
     self.target_state = Parameter(target_row, trainable=False)
 def __init__(self, data, kernel, X=None, likelihood_variance=1e-4):
     """Hold the data as parameters together with a fixed-variance Gaussian likelihood.

     :param data: indexable pair; ``data[0]`` holds the inputs and ``data[1]``
         the outputs.
     :param kernel: stored as ``self.kernel``.
     :param X: optional existing object to use as ``self.X`` in place of a new
         parameter built from ``data[0]``.
     :param likelihood_variance: value assigned to the likelihood variance,
         which is then marked non-trainable.
     """
     gpflow.Module.__init__(self)

     float_dtype = gpflow.default_float
     # only build a new input parameter when none was handed in
     self.X = (Parameter(data[0], name="DataX", dtype=float_dtype())
               if X is None else X)
     self.Y = Parameter(data[1], name="DataY", dtype=float_dtype())
     self.data = [self.X, self.Y]
     self.kernel = kernel

     gaussian = gpflow.likelihoods.Gaussian()
     self.likelihood = gaussian
     gaussian.variance.assign(likelihood_variance)
     set_trainable(gaussian.variance, False)
Beispiel #12
0
    def init_variational_params(self, num_inducing):
        """Create the variational parameters ``q_mu`` and ``q_sqrt``.

        ``q_mu`` starts at zero with shape ``[M, K, O]`` and ``q_sqrt`` starts as a
        stack of identity matrices with shape ``[K, O, M, M]``, where
        ``M = num_inducing``, ``K = self.num_kernels`` and
        ``O = self.num_latent_gps``.

        :param num_inducing: the size ``M`` used in both parameter shapes.
        """
        mean_init = np.zeros(
            (num_inducing, self.num_kernels, self.num_latent_gps))  # M x K x O
        self.q_mu = Parameter(mean_init, dtype=default_float())

        # one identity matrix per (kernel, latent GP) pair
        identity_stack = np.array([
            [
                np.eye(num_inducing, dtype=default_float())
                for _ in range(self.num_latent_gps)
            ]
            for _ in range(self.num_kernels)
        ])  # K x O x M x M
        self.q_sqrt = Parameter(identity_stack, transform=triangular())
Beispiel #13
0
    def __init__(self,
                 data: RegressionData,
                 kernel,
                 noise_variance: float = 1.0,
                 parallel=False,
                 max_parallel=10000):
        """Store the data and compile the filtering and smoothing routines.

        :param data: regression data, converted to tensors ``(ts, ys)``.
        :param kernel: forwarded to the parent constructor; must provide
            ``get_spec``, which supplies the first entry of each input signature.
        :param noise_variance: initial value of the positive ``noise_variance``
            parameter.
        :param parallel: when true the ``pkf`` / ``pkfs`` routines are compiled,
            otherwise ``kf`` / ``kfs``.
        :param max_parallel: passed to ``pkfs`` (used only when ``parallel``).
        """

        self.noise_variance = Parameter(noise_variance, transform=positive())
        ts, ys = data_input_to_tensor(data)
        super().__init__(kernel, None, None, num_latent_gps=ys.shape[-1])
        self.data = ts, ys

        float_dtype = config.default_float
        num_points = ts.shape[0]

        # the filter signature is fixed to the training length, while the
        # smoother signature leaves the leading dimension open
        filter_signature = [
            kernel.get_spec(num_points),
            tf.TensorSpec((num_points, 1), float_dtype()),
        ]
        smoother_signature = [
            kernel.get_spec(None),
            tf.TensorSpec((None, 1), float_dtype()),
        ]

        if parallel:
            # NOTE(review): the parallel filter is given ts.shape[0] as
            # max_parallel rather than the `max_parallel` argument -- confirm
            # this is intentional.
            filter_fn = partial(pkf,
                                return_loglikelihood=True,
                                max_parallel=ts.shape[0])
            smoother_fn = partial(pkfs, max_parallel=max_parallel)
        else:
            filter_fn = partial(kf,
                                return_loglikelihood=True,
                                return_predicted=False)
            smoother_fn = kfs

        self._kf = tf.function(filter_fn, input_signature=filter_signature)
        self._kfs = tf.function(smoother_fn, input_signature=smoother_signature)
Beispiel #14
0
    def __init__(self,
                 variance,
                 lengthscales,
                 name='Kernel',
                 active_dims=None):
        """ Kernel Constructor.

        Args:
            variance: An (L,L) symmetric, positive definite matrix for the signal variance.
            lengthscales: Positive lengthscales, broadcast to an (L,M) matrix and stored
                as a non-trainable (L,1,M) parameter.
            name: The name of this kernel.
            active_dims: Which of the input dimensions are used. The default None means all of them.
        """
        # Do not call gf.kernels.AnisotropicStationary.__init__()! The super() call
        # below deliberately starts the lookup after AnisotropicStationary.
        super(AnisotropicStationary, self).__init__(
            active_dims=active_dims, name=name
        )

        self.variance = Variance(value=np.atleast_2d(variance),
                                 name=name + 'Variance')
        self._L = self.variance.shape[0]

        lengthscales = data_input_to_tensor(lengthscales)
        raw_shape = tuple(tf.shape(lengthscales).numpy())
        # shapes treated as "one lengthscale per output": M collapses to 1
        # NOTE(review): a shape-(L,) input with L > 1 lands here too, yet
        # broadcasting (L,) to (L, 1) below looks like it would fail -- confirm.
        single_column_shapes = ((), (1, ), (1, 1), (self._L, ))
        if raw_shape in single_column_shapes:
            self._M = 1
        else:
            self._M = raw_shape[-1]

        as_matrix = tf.broadcast_to(lengthscales, (self._L, self._M))
        lengthscales = tf.reshape(as_matrix, (self._L, 1, self._M))
        self.lengthscales = Parameter(lengthscales,
                                      transform=positive(),
                                      trainable=False,
                                      name=name + 'Lengthscales')
        self._validate_ard_active_dims(self.lengthscales[0, 0])
Beispiel #15
0
    def __init__(
            self,
            value,
            name: str = 'Variance',
            cholesky_diagonal_lower_bound: float = CHOLESKY_DIAGONAL_LOWER_BOUND
    ):
        """ Construct a non-diagonal covariance matrix, stored through its Cholesky factor.

        The diagonal and the strictly lower triangle of the Cholesky factor are kept
        as two separate parameters.

        Args:
            value: A symmetric, positive definite matrix, expressed in tensorflow or numpy.
            name: Name passed to the parent constructor and used as the prefix of both parameter names.
            cholesky_diagonal_lower_bound: Lower bound on the diagonal of the Cholesky decomposition.

        Raises:
            ValueError: If value is not of shape (L,L), or if any entry of the Cholesky
                diagonal is not strictly greater than cholesky_diagonal_lower_bound.
        """
        super().__init__(name=name)
        value = data_input_to_tensor(value)

        size = value.shape[-1]
        self._shape = (size, size)
        self._broadcast_shape = (size, 1, size, 1)
        if value.shape != self._shape:
            raise ValueError('Variance must have shape (L,L).')

        cholesky = tf.linalg.cholesky(value)

        diagonal = tf.linalg.diag_part(cholesky)
        if min(diagonal) <= cholesky_diagonal_lower_bound:
            raise ValueError(
                f'The Cholesky diagonal of {name} must be strictly greater than {cholesky_diagonal_lower_bound}.'
            )
        self._cholesky_diagonal = Parameter(
            diagonal,
            transform=positive(lower=cholesky_diagonal_lower_bound),
            name=name + '.cholesky_diagonal')

        # flat (row-major) indices of the strictly lower triangle: row r
        # contributes its first r entries
        strictly_lower = [
            flat_index
            for row in range(1, size)
            for flat_index in range(row * size, row * (size + 1))
        ]
        self._cholesky_lower_triangle = Parameter(
            tf.gather(tf.reshape(cholesky, [-1]), strictly_lower),
            name=name + '.cholesky_lower_triangle')

        self._row_lengths = tuple(range(size))
Beispiel #16
0
def test_q_sqrt_constraints(Xdata, Xnew, kernel, mu, white):
    """Check that ``conditional`` treats a constrained and an unconstrained q_sqrt alike.

    Two parameters are built from the same lower-triangular matrix, one wrapped in
    the ``triangular()`` transform and one without a transform. The test asserts
    that both give the same conditional variance and that, after one gradient step
    on their unconstrained variables, the two parameters still hold equal values.
    This is important to match the behaviour of the KL, which enforces q_sqrt is
    triangular.
    """

    lower_tri = np.tril(rng.randn(Ln, Nn, Nn))

    q_sqrt_constrained = Parameter(lower_tri, transform=triangular())
    q_sqrt_unconstrained = Parameter(lower_tri)

    # both parameters start from the same value
    assert_allclose((q_sqrt_constrained - q_sqrt_unconstrained).numpy(), 0)

    def variance_then_gradient_step(q_sqrt):
        # evaluate the conditional variance, then move the unconstrained
        # variable against its gradient
        with tf.GradientTape() as tape:
            _, variance = conditional(Xnew,
                                      Xdata,
                                      kernel,
                                      mu,
                                      q_sqrt=q_sqrt,
                                      white=white)
        step = tape.gradient(variance, q_sqrt.unconstrained_variable)
        q_sqrt.unconstrained_variable.assign_sub(step)
        return variance

    var_constrained = variance_then_gradient_step(q_sqrt_constrained)
    var_unconstrained = variance_then_gradient_step(q_sqrt_unconstrained)

    # the variances were recorded before each step was applied
    assert_allclose(var_constrained - var_unconstrained, 0)

    # the gradient step must have moved both parameters identically
    assert_allclose((q_sqrt_constrained - q_sqrt_unconstrained).numpy(), 0)
 def create_models(self, data):
     """Build one ``FakeGPR`` model per output and store them in ``self.models``.

     Each model gets its own SquaredExponential kernel whose lengthscales are
     re-wrapped with a positive transform (lower bound 1e-3) and given a Gamma
     prior. The first model is built without an ``X`` argument; every later
     model is passed the ``X`` attribute of the model built just before it.

     :param data: indexable pair; ``data[0]`` holds the inputs and ``data[1]``
         the outputs, one column per output.
     """
     self.models = []
     inputs, outputs = data[0], data[1]
     for output_index in range(self.num_outputs):
         initial_lengthscales = tf.ones([
             inputs.shape[1],
         ], dtype=float_type)
         kernel = gpflow.kernels.SquaredExponential(
             lengthscales=initial_lengthscales)

         # swap in a lengthscale parameter with a lower bound and a prior
         kernel.lengthscales = Parameter(
             kernel.lengthscales, transform=positive(lower=1e-3))
         kernel.lengthscales.prior = tfd.Gamma(f64(1.1), f64(1 / 10.0))

         output_column = outputs[:, output_index:output_index + 1]
         if output_index == 0:
             model = FakeGPR((inputs, output_column), kernel)
         else:
             model = FakeGPR((inputs, output_column), kernel,
                             self.models[-1].X)
         self.models.append(model)
Beispiel #18
0
class BayesianDenseLayer(TrackableLayer):
    """A Bayesian dense layer for variational Bayesian neural networks"""

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        num_data: int,
        w_mu: Optional[np.ndarray] = None,
        w_sqrt: Optional[np.ndarray] = None,
        activation: Optional[Callable] = None,
        is_mean_field: bool = True,
        temperature: float = 1e-4,
        returns_samples: bool = True,
    ):
        """
        Create the layer, which holds the variational mean and sqrt of the weights as
        well as the temperature used for cooling (or heating) the posterior.

        :param input_dim: The layer's input dimension (excluding bias)
        :param output_dim: The layer's output dimension
        :param num_data: number of data points
        :param w_mu: Initial value of the variational mean (weights + bias)
        :param w_sqrt: Initial value of the variational Cholesky (covering weights + bias)
        :param activation: The type of activation function (None is linear)
        :param is_mean_field: Determines mean field approximation of the weight posterior
        :param temperature: For cooling or heating the posterior
        :param returns_samples: If True, return samples on calling the layer,
             Else return mean and variance
        """

        super().__init__(dtype=default_float())

        assert input_dim >= 1
        assert output_dim >= 1
        assert num_data >= 1

        flat_dim = (input_dim + 1) * output_dim  # the + 1 accounts for the bias
        if w_mu is not None:
            assert w_mu.shape == (flat_dim,)
        if w_sqrt is not None:
            if is_mean_field:
                assert w_sqrt.shape == (flat_dim,)
            else:
                assert w_sqrt.shape == (flat_dim, flat_dim)
        assert temperature > 0.0

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_data = num_data

        # user-supplied initial values, applied in `build`
        self.w_mu_ini = w_mu
        self.w_sqrt_ini = w_sqrt

        self.activation = activation
        self.is_mean_field = is_mean_field
        self.temperature = temperature
        self.returns_samples = returns_samples

        self.dim = flat_dim
        self.full_output_cov = False
        self.full_cov = False

        self.w_mu = Parameter(np.zeros((self.dim,)), dtype=default_float(), name="w_mu")  # [dim]

        # placeholder value and transform depend on the posterior structure
        if self.is_mean_field:
            sqrt_placeholder = np.ones((self.dim,))
            sqrt_transform = positive()
        else:
            sqrt_placeholder = np.zeros((self.dim, self.dim))
            sqrt_transform = triangular()
        self.w_sqrt = Parameter(
            sqrt_placeholder,
            transform=sqrt_transform,
            dtype=default_float(),
            name="w_sqrt",
        )  # [dim, dim] or [dim]

    def initialize_variational_distribution(self) -> None:
        """
        Assign the initial values to ``w_mu`` and ``w_sqrt``. Where no initial value
        was supplied, the mean uses Xavier-initialised weights with a zero bias and
        the sqrt uses ``1e-5`` times the identity (or a vector of ones when mean field).
        """
        if self.w_mu_ini is None:
            weights = xavier_initialization_numpy(self.input_dim, self.output_dim)
            bias = np.zeros((1, self.output_dim))
            stacked = np.concatenate((weights, bias), axis=0)
            self.w_mu_ini = stacked.reshape((self.dim,))
        self.w_mu.assign(self.w_mu_ini)

        if self.w_sqrt_ini is None:
            if self.is_mean_field:
                self.w_sqrt_ini = 1e-5 * np.ones((self.dim,))
            else:
                self.w_sqrt_ini = 1e-5 * np.eye(self.dim)
        self.w_sqrt.assign(self.w_sqrt_ini)

    def build(self, input_shape: ShapeType) -> None:
        """Build the variables necessary on first call"""
        super().build(input_shape)
        self.initialize_variational_distribution()

    def predict_samples(
        self,
        inputs: TensorType,
        *,
        num_samples: Optional[int] = None,
        full_output_cov: bool = False,
        full_cov: bool = False,
        whiten: bool = False,
    ) -> tf.Tensor:
        """
        Draw sample predictions at N test inputs, with input_dim = D, output_dim = Q.
        Weights are sampled from the variational distribution and applied to the
        inputs (with a column of ones appended for the bias).

        :param inputs: the inputs to predict at. shape [N, D]
        :param num_samples: the number of samples S, to draw.
            The result has shape [S, N, Q] if S is not None else [N, Q].
        :param full_output_cov: assert to False since not supported for now
        :param full_cov: assert to False since not supported for now
        :param whiten: assert to False since not sensible in Bayesian neural nets
        :return: the samples, passed through the activation if one is set
        """
        assert full_output_cov is False
        assert full_cov is False
        assert whiten is False

        sample_count = num_samples or 1
        noise = tf.random.normal((self.dim, sample_count), dtype=default_float())  # [dim, S]
        if self.is_mean_field:
            deviation = self.w_sqrt[:, None] * noise  # [dim, S]
        else:
            deviation = tf.matmul(self.w_sqrt, noise)  # [dim, S]
        weights = self.w_mu[:, None] + deviation  # [dim, S]

        num_inputs = tf.shape(inputs)[0]
        bias_column = tf.ones((num_inputs, 1), dtype=default_float())
        inputs_with_bias = tf.concat((inputs, bias_column), axis=-1)  # [N, D+1]

        weights_per_sample = tf.reshape(
            tf.transpose(weights), (sample_count, self.input_dim + 1, self.output_dim)
        )  # [S, D+1, Q]
        samples = tf.tensordot(inputs_with_bias, weights_per_sample, [[-1], [1]])  # [N, S, Q]

        if num_samples is None:
            samples = tf.squeeze(samples, axis=-2)  # [N, Q]
        else:
            samples = tf.transpose(samples, perm=[1, 0, 2])  # [S, N, Q]

        if self.activation is not None:
            samples = self.activation(samples)

        return samples

    def call(
        self, inputs: TensorType, training: Optional[bool] = False
    ) -> Union[tf.Tensor, MeanAndVariance]:
        """
        The default behaviour upon calling the BayesianDenseLayer()(X): draw a single
        sample and register the temperature-scaled prior KL, divided by the number
        of data points, as a layer loss (a zero loss when not training).
        """
        sample = self.predict_samples(
            inputs,
            num_samples=None,
            full_output_cov=self.full_output_cov,
            full_cov=self.full_cov,
        )

        # TF quirk: add_loss must add a tensor to compile, hence the explicit
        # zero constant outside of training.
        # Wenzel et al. 2020: How good is the Bayes posterior in DNNs really?
        loss = (
            self.temperature * self.prior_kl()
            if training
            else tf.constant(0.0, dtype=default_float())
        )
        self.add_loss(loss / self.num_data)

        if not self.returns_samples:
            # for output layers, return samples as mean with (almost) 0 cov
            return sample, tf.ones_like(sample) * 1e-10  # [N, Q], [N, Q]
        # for latent layers, return samples
        return sample  # [N, Q]

    def prior_kl(self) -> tf.Tensor:
        """
        The KL divergence from the variational distribution to the prior
        :return: KL divergence from N(w_mu, w_sqrt) to N(0, I)
        """
        if self.is_mean_field:
            q_sqrt = self.w_sqrt[:, None]
        else:
            q_sqrt = self.w_sqrt[None]
        return gauss_kl(self.w_mu[:, None], q_sqrt)
Beispiel #19
0
class StringKernel(Kernel):
    """
    Code to run the SSK of Moss et al. 2020 with gpflow
    
   with hyperparameters:
    1) match_decay float
        decrease the contribution of long subsequences
    2) gap_decay float
        decrease the contribtuion of subsequences with large gaps (penalize non-contiguous)
    3) max_subsequence_length int 
        largest subsequence considered
    4) max_occurence_length int
        longest non-contiguous occurences of subsequences considered (max_occurence_length > max_subsequence_length)
    We calculate gradients for match_decay and gap_decay w.r.t kernel hyperparameters following Beck (2017)
    We recommend normalize = True to allow meaningful comparrison of strings of different length
    """
    def __init__(self,
                 active_dims=[0],
                 gap_decay=0.1,
                 match_decay=0.9,
                 max_subsequence_length=3,
                 max_occurence_length=10,
                 alphabet=[],
                 maxlen=0,
                 normalize=True,
                 batch_size=1000):
        """
        Set up the trainable decay parameters, the cached copies used by the
        hand-written gradient code, and the alphabet lookup table.

        :param active_dims: forwarded to the base ``Kernel`` constructor.
        :param gap_decay: initial gap decay, constrained to (0, 1) by a sigmoid.
        :param match_decay: initial match decay, constrained to (0, 1) by a sigmoid.
        :param max_subsequence_length: largest subsequence length considered.
        :param max_occurence_length: number of super-diagonals kept when the decay
            matrix D is built in ``_precalc``.
        :param alphabet: sequence of string tokens; token ``i`` is encoded as the
            integer ``i + 1`` and ``"PAD"`` (and any unknown token) as ``0``.
        :param maxlen: number of tokens every input string is padded to.
        :param normalize: whether kernel values are normalised.
        :param batch_size: number of kernel evaluations done per batch.
        """
        super().__init__(active_dims=active_dims)
        # constrain kernel params to between 0 and 1.
        # tfb.AffineScalar is not available in current TensorFlow Probability, so
        # the (identity: shift 0, scale 1) affine step is written with
        # tfb.Shift / tfb.Scale, which build the same transform.
        self.logistic_gap = tfb.Chain([
            tfb.Shift(tf.cast(0, tf.float64))(
                tfb.Scale(tf.cast(1, tf.float64))),
            tfb.Sigmoid()
        ])
        # NOTE: the attribute name (including its spelling) is read by K_calc,
        # so it is kept as-is.
        self.logisitc_match = tfb.Chain([
            tfb.Shift(tf.cast(0, tf.float64))(
                tfb.Scale(tf.cast(1, tf.float64))),
            tfb.Sigmoid()
        ])
        self.gap_decay_param = Parameter(gap_decay,
                                         transform=self.logistic_gap,
                                         name="gap_decay")
        self.match_decay_param = Parameter(match_decay,
                                           transform=self.logisitc_match,
                                           name="match_decay")
        self.max_subsequence_length = max_subsequence_length
        self.max_occurence_length = max_occurence_length
        # copy so the instance never aliases the shared default list
        self.alphabet = list(alphabet)
        self.maxlen = maxlen
        self.normalize = normalize
        self.batch_size = batch_size
        self.symmetric = False

        # we use copies of the kernel params to stop building an expensive
        # computation graph; gradients are instead calculated efficiently using
        # dynamic programming. These copies are refreshed by _precalc on every
        # kernel evaluation (in case the parameters have been updated).
        self.match_decay = self.match_decay_param.numpy()
        self.gap_decay = self.gap_decay_param.numpy()
        self.match_decay_unconstrained = (
            self.match_decay_param.unconstrained_variable.numpy())
        self.gap_decay_unconstrained = (
            self.gap_decay_param.unconstrained_variable.numpy())

        # helpful construction matrices, lazily computed by _precalc once needed
        self.D = None
        self.dD_dgap = None

        # build a lookup table of the alphabet to encode input strings
        self.table = tf.lookup.StaticHashTable(
            initializer=tf.lookup.KeyValueTensorInitializer(
                keys=tf.constant(["PAD"] + self.alphabet),
                values=tf.constant(range(0,
                                         len(self.alphabet) + 1)),
            ),
            default_value=0)

    def K_diag(self, X):
        r"""
        Calc just the diagonal elements of a kernel matrix

        :param X: tensor of space-separated strings; axis 1 is squeezed away
            before the strings are split.
        :return: float64 tensor of diagonal kernel values. When ``normalize`` is
            set this is all ones with shape ``tf.shape(X)[:-1]``; otherwise the
            batched results of ``_k`` flattened to one dimension.
        :raises ValueError: if the longest string length plus one exceeds
            ``2 * maxlen``.
        """

        # check if string is not longer than max length
        if tf.reduce_max(tf.strings.length(X)) + 1 > 2 * self.maxlen:
            raise ValueError(
                "An input string is longer that max-length so refit the kernel with a larger maxlen param"
            )

        if self.normalize:
            # if normalizing then diagonal will just be ones
            return tf.cast(tf.fill(tf.shape(X)[:-1], 1), tf.float64)

        # otherwise have to calc kernel elements
        # Split the strings into tokens, pad until all have length maxlen, then
        # map every token to its integer code
        X = tf.strings.split(tf.squeeze(X, 1)).to_tensor(
            "PAD", shape=[None, self.maxlen])
        X = self.table.lookup(X)

        # prep required quantities and refresh the cached kernel parameters
        self._precalc()

        # Proceed with kernel matrix calculations in batches
        k_results = tf.TensorArray(tf.float64,
                                   size=0,
                                   dynamic_size=True,
                                   infer_shape=False)

        # number of batches, computed once and used as the loop bound
        num_batches = tf.cast(tf.math.ceil(tf.shape(X)[0] / self.batch_size),
                              dtype=tf.int32)
        for i in tf.range(num_batches):
            X_batch = X[self.batch_size * i:self.batch_size * (i + 1)]
            # each string is paired with itself to get its diagonal entry
            k_results = k_results.write(k_results.size(),
                                        self._k(X_batch, X_batch))

        # collect all batches
        return tf.reshape(k_results.concat(), (-1, ))

    def K(self, X, X2=None):
        r"""
        Evaluate the kernel matrix between X and X2 (or X and itself).

        Kernel gradients are obtained by dynamic programming inside K_calc
        rather than by autograd.

        Gradients are only calculated for gram matrices, i.e. when X2 is None,
        as required when fitting the model. For predictions (X2 given) no
        gradients are calculated. The choice is recorded in self.symmetric,
        which K_calc reads.
        """
        self.symmetric = X2 is None
        second = X if self.symmetric else X2
        return self.K_calc(X, second)

    def _precalc(self):
        r"""
        Refresh the cached kernel parameters and rebuild the decay matrices.

        The constrained and unconstrained values of match_decay and gap_decay are
        re-read from their Parameters (in case they have changed), then D and
        dD_dgap are stored on self, following the notation of Beck (2017).
        """
        match_param = self.match_decay_param
        gap_param = self.gap_decay_param
        self.match_decay = match_param.numpy()
        self.gap_decay = gap_param.numpy()
        self.match_decay_unconstrained = match_param.unconstrained_variable.numpy()
        self.gap_decay_unconstrained = gap_param.unconstrained_variable.numpy()

        size = self.maxlen
        ones = tf.ones((size, size), dtype=tf.float64)
        lower_ones = tf.linalg.band_part(ones, -1, 0)

        # row i holds the consecutive integers -i-1, ..., size-2-i
        rows = tf.TensorArray(tf.int32, size=size)
        for i in tf.range(size):
            rows = rows.write(i, tf.range(-i - 1, size - 1 - i))
        exponents = tf.cast(rows.stack(), tf.float64)
        rows.close()

        # keep the exponents strictly above the diagonal; the diagonal and
        # everything below it become one
        upper_part = tf.linalg.band_part(exponents, 0, -1)
        diagonal_part = tf.linalg.band_part(exponents, 0, 0)
        exponents = upper_part - diagonal_part + lower_ones

        # 0/1 mask of the first max_occurence_length super-diagonals
        band = tf.transpose(
            tf.linalg.band_part(ones, self.max_occurence_length, 0))
        band = band - tf.eye(size, dtype=tf.float64)

        masked_gaps = tf.fill([size, size], self.gap_decay) * band

        self.D = tf.pow(masked_gaps, exponents)
        self.dD_dgap = tf.pow(masked_gaps, exponents - 1.0) * band * exponents

    @tf.custom_gradient
    def K_calc(self, X, X2):
        r"""
        Calc the elements of the kernel matrix (and gradients if symmetric)

        Reads ``self.symmetric`` (set by ``K`` before this is called) to decide
        between the gram-matrix path, which also produces gradients w.r.t. the
        gap and match decay, and the cross-matrix path, which produces values
        only.

        :param X: tensor of space-separated strings; axis 1 is squeezed away
            before splitting.
        :param X2: second tensor of strings in the same format. It is only
            length-checked, not encoded, when ``self.symmetric`` is True.
        :return: ``(k_results, grad)`` as expected by ``tf.custom_gradient``.
            ``k_results`` has one row per string in X and one column per string
            in X2; ``grad`` is the gradient function defined at the end.
        :raises ValueError: if the longest string length plus one in X or X2
            exceeds ``2 * maxlen``.
        """

        # check if input strings are longer than max allowed length
        if (tf.reduce_max(tf.strings.length(X)) + 1 > 2 * self.maxlen) or (
                tf.reduce_max(tf.strings.length(X2)) + 1 > 2 * self.maxlen):
            raise ValueError(
                "An input string is longer that max-length so refit the kernel with a larger maxlen param"
            )

        # Turn our inputs into lists of integers via the alphabet lookup table:
        # first split up strings and pad to fixed length and prep for gpu
        # pad until all have length of self.maxlen
        X = tf.strings.split(tf.squeeze(X, 1)).to_tensor(
            "PAD", shape=[None, self.maxlen])
        X = self.table.lookup(X)
        if self.symmetric:
            # gram matrix: reuse the already-encoded X
            X2 = X
        else:
            # pad until all have length of self.maxlen
            X2 = tf.strings.split(tf.squeeze(X2, 1)).to_tensor(
                "PAD", shape=[None, self.maxlen])
            X2 = self.table.lookup(X2)

        # refresh cached parameters and get the decay tensors D and dD_dgap
        self._precalc()

        # get indices of all possible pairings from X and X2 (row-major order)
        # this way allows maximum number of kernel calcs to be squished onto the GPU (rather than just doing individual rows of gram)
        indicies_2, indicies_1 = tf.meshgrid(tf.range(0,
                                                      tf.shape(X2)[0]),
                                             tf.range(0,
                                                      tf.shape(X)[0]))
        indicies = tf.concat(
            [tf.reshape(indicies_1, (-1, 1)),
             tf.reshape(indicies_2, (-1, 1))],
            axis=1)
        # if symmetric then only calc upper matrix (fill in rest later)
        if self.symmetric:
            indicies = tf.boolean_mask(
                indicies, tf.greater_equal(indicies[:, 1], indicies[:, 0]))
        # make kernel calcs in batches
        # NOTE: num_batches is not read below; each loop recomputes its own bound
        num_batches = tf.math.ceil(tf.shape(indicies)[0] / self.batch_size)
        # iterate through batches

        if self.symmetric:
            # gram matrix: collect kernel values and both parameter gradients
            k_results = tf.TensorArray(tf.float64,
                                       size=0,
                                       dynamic_size=True,
                                       infer_shape=False)
            gap_grads = tf.TensorArray(tf.float64,
                                       size=0,
                                       dynamic_size=True,
                                       infer_shape=False)
            match_grads = tf.TensorArray(tf.float64,
                                         size=0,
                                         dynamic_size=True,
                                         infer_shape=False)
            for i in tf.range(
                    tf.cast(tf.math.ceil(
                        tf.shape(indicies)[0] / self.batch_size),
                            dtype=tf.int32)):
                indicies_batch = indicies[self.batch_size * i:self.batch_size *
                                          (i + 1)]
                X_batch = tf.gather(X, indicies_batch[:, 0], axis=0)
                X2_batch = tf.gather(X2, indicies_batch[:, 1], axis=0)
                # results = (k, dk_dgap, dk_dmatch) for this batch of pairs
                results = self._k_grads(X_batch, X2_batch)
                k_results = k_results.write(k_results.size(), results[0])
                gap_grads = gap_grads.write(gap_grads.size(), results[1])
                match_grads = match_grads.write(match_grads.size(), results[2])
            # combine individual kernel results
            k_results = tf.reshape(k_results.concat(), [1, -1])
            gap_grads = tf.reshape(gap_grads.concat(), [1, -1])
            match_grads = tf.reshape(match_grads.concat(), [1, -1])
        else:
            # cross matrix: kernel values only
            k_results = tf.TensorArray(tf.float64,
                                       size=0,
                                       dynamic_size=True,
                                       infer_shape=False)
            for i in tf.range(
                    tf.cast(tf.math.ceil(
                        tf.shape(indicies)[0] / self.batch_size),
                            dtype=tf.int32)):
                indicies_batch = indicies[self.batch_size * i:self.batch_size *
                                          (i + 1)]
                X_batch = tf.gather(X, indicies_batch[:, 0], axis=0)
                X2_batch = tf.gather(X2, indicies_batch[:, 1], axis=0)
                k_results = k_results.write(k_results.size(),
                                            self._k(X_batch, X2_batch))
            # combine individual kernel results
            k_results = tf.reshape(k_results.concat(), [1, -1])

        # put results into the right places in the gram matrix
        # if symmetric then only put in top triangle (inc diag)
        if self.symmetric:
            mask = tf.linalg.band_part(
                tf.ones((tf.shape(X)[0], tf.shape(X)[0]), dtype=tf.int64), 0,
                -1)
            non_zero = tf.not_equal(mask, tf.constant(0, dtype=tf.int64))
            # Extracting the indices of upper triangle elements; these must come
            # out in the same order as the pairs that were evaluated above
            indices = tf.where(
                non_zero)  # Extracting the indices of upper triangle elements
            out = tf.SparseTensor(indices,
                                  tf.squeeze(k_results),
                                  dense_shape=tf.cast(
                                      (tf.shape(X)[0], tf.shape(X)[0]),
                                      dtype=tf.int64))
            k_results = tf.sparse.to_dense(out)
            out = tf.SparseTensor(indices,
                                  tf.squeeze(gap_grads),
                                  dense_shape=tf.cast(
                                      (tf.shape(X)[0], tf.shape(X)[0]),
                                      dtype=tf.int64))
            gap_grads = tf.sparse.to_dense(out)
            out = tf.SparseTensor(indices,
                                  tf.squeeze(match_grads),
                                  dense_shape=tf.cast(
                                      (tf.shape(X)[0], tf.shape(X)[0]),
                                      dtype=tf.int64))
            match_grads = tf.sparse.to_dense(out)

            # add in missing elements (lower triangle): add the transpose with
            # its diagonal zeroed so the diagonal is not counted twice
            k_results = k_results + tf.linalg.set_diag(
                tf.transpose(k_results),
                tf.zeros(tf.shape(X)[0], dtype=tf.float64))
            gap_grads = gap_grads + tf.linalg.set_diag(
                tf.transpose(gap_grads),
                tf.zeros(tf.shape(X)[0], dtype=tf.float64))
            match_grads = match_grads + tf.linalg.set_diag(
                tf.transpose(match_grads),
                tf.zeros(tf.shape(X)[0], dtype=tf.float64))
        else:
            k_results = tf.reshape(
                k_results, [tf.shape(X)[0], tf.shape(X2)[0]])

        # normalize if required
        if self.normalize:
            if self.symmetric:
                # if symmetric then can extract normalization terms from gram
                X_diag_Ks = tf.linalg.diag_part(k_results)
                X_diag_gap_grads = tf.linalg.diag_part(gap_grads)
                X_diag_match_grads = tf.linalg.diag_part(match_grads)

                # norm for kernel entries: norm[i, j] = k_ii * k_jj
                norm = tf.tensordot(X_diag_Ks, X_diag_Ks, axes=0)
                k_results = tf.divide(k_results, tf.sqrt(norm))
                # norm for gap_decay and match_decay grads (quotient rule);
                # note k_results is already the normalized kernel at this point
                diff_gap = tf.divide(
                    tf.tensordot(X_diag_gap_grads, X_diag_Ks, axes=0) +
                    tf.tensordot(X_diag_Ks, X_diag_gap_grads, axes=0),
                    2 * norm)
                diff_match = tf.divide(
                    tf.tensordot(X_diag_match_grads, X_diag_Ks, axes=0) +
                    tf.tensordot(X_diag_Ks, X_diag_match_grads, axes=0),
                    2 * norm)
                gap_grads = tf.divide(gap_grads, tf.sqrt(norm)) - tf.multiply(
                    k_results, diff_gap)
                match_grads = tf.divide(match_grads,
                                        tf.sqrt(norm)) - tf.multiply(
                                            k_results, diff_match)

            else:
                # if not symmetric then need to calculate some extra kernel calcs
                # get diagonal kernel calcs for X1
                X_diag_Ks = tf.TensorArray(tf.float64,
                                           size=0,
                                           dynamic_size=True,
                                           infer_shape=False)
                # NOTE: num_batches is not read below
                num_batches = tf.math.ceil(tf.shape(X)[0] / self.batch_size)
                # iterate through batches
                for i in tf.range(
                        tf.cast(tf.math.ceil(tf.shape(X)[0] / self.batch_size),
                                dtype=tf.int32)):
                    X_batch = X[self.batch_size * i:self.batch_size * (i + 1)]
                    X_diag_Ks = X_diag_Ks.write(X_diag_Ks.size(),
                                                self._k(X_batch, X_batch))
                # collect up all batches
                X_diag_Ks = tf.reshape(X_diag_Ks.concat(), (-1, ))

                # get diagonal kernel calcs for X2
                X2_diag_Ks = tf.TensorArray(tf.float64,
                                            size=0,
                                            dynamic_size=True,
                                            infer_shape=False)
                # NOTE: num_batches is not read below
                num_batches = tf.math.ceil(tf.shape(X2)[0] / self.batch_size)
                # iterate through batches
                for i in tf.range(
                        tf.cast(tf.math.ceil(
                            tf.shape(X2)[0] / self.batch_size),
                                dtype=tf.int32)):
                    X2_batch = X2[self.batch_size * i:self.batch_size *
                                  (i + 1)]
                    X2_diag_Ks = X2_diag_Ks.write(X2_diag_Ks.size(),
                                                  self._k(X2_batch, X2_batch))
                # collect up all batches
                X2_diag_Ks = tf.reshape(X2_diag_Ks.concat(), (-1, ))

                # norm for kernel entries: norm[i, j] = k(X_i, X_i) * k(X2_j, X2_j)
                norm = tf.tensordot(X_diag_Ks, X2_diag_Ks, axes=0)
                k_results = tf.divide(k_results, tf.sqrt(norm))

        def grad(dy, variables=None):
            # Gradient function handed back to tf.custom_gradient. No gradient
            # is returned for the inputs; in the symmetric case one gradient per
            # entry of `variables` is returned, looked up by variable name.
            if self.symmetric:
                # get gradients of unconstrained params: the gradients computed
                # above are w.r.t. the constrained values, so each is multiplied
                # by exp(forward_log_det_jacobian) of its transform evaluated at
                # the cached unconstrained value
                grads = {}
                grads['gap_decay:0'] = tf.reduce_sum(
                    tf.multiply(
                        dy,
                        gap_grads * tf.math.exp(
                            self.logistic_gap.forward_log_det_jacobian(
                                self.gap_decay_unconstrained, 0))))
                grads['match_decay:0'] = tf.reduce_sum(
                    tf.multiply(
                        dy,
                        match_grads * tf.math.exp(
                            self.logisitc_match.forward_log_det_jacobian(
                                self.match_decay_unconstrained, 0))))
                gradient = [grads[v.name] for v in variables]
                return ((None, None), gradient)
            else:
                # no parameter gradients are provided for cross matrices
                return ((None, None), [None, None])

        return k_results, grad

    def _k_grads(self, X1, X2):
        r"""
        Vectorized kernel calc and kernel grad calc.
        Following notation from Beck (2017), i.e have tensors S,D,Kpp,Kp
        Input is two tensors of shape (# strings , # characters)
        and we calc the pair-wise kernel calcs between the elements (i.e n kern calcs for two lists of length n)
        D is the tensor that unrolls the recursion and allows vectorization

        Relies on self.D, self.dD_dgap and self.match_decay having been set by
        _precalc beforehand.

        :return: tuple ``(k, dk_dgap, dk_dmatch)``: the kernel value for each
            pair and its derivatives w.r.t. the gap decay and the match decay,
            each with a trailing axis of size one added.
        """

        # turn into one-hot  i.e. shape (# strings, # characters, alphabet size + 1)
        X1 = tf.one_hot(X1, len(self.alphabet) + 1, dtype=tf.float64)
        X2 = tf.one_hot(X2, len(self.alphabet) + 1, dtype=tf.float64)
        # remove the ones in the first column that encode the padding (i.e we dont want them to count as a match)
        paddings = tf.constant([[0, 0], [0, 0], [0, len(self.alphabet)]])
        X1 = X1 - tf.pad(tf.expand_dims(X1[:, :, 0], 2), paddings, "CONSTANT")
        X2 = X2 - tf.pad(tf.expand_dims(X2[:, :, 0], 2), paddings, "CONSTANT")
        # store squared match coef
        match_sq = tf.square(self.match_decay)
        # Make S: the similarity tensor of shape (# strings, #characters, # characters)
        S = tf.matmul(X1, tf.transpose(X2, perm=(0, 2, 1)))
        # Main loop, where Kp values and their gradients are calculated.
        # Entry i of each TensorArray belongs to recursion level i.
        Kp = tf.TensorArray(tf.float64,
                            size=0,
                            dynamic_size=True,
                            clear_after_read=False)
        dKp_dgap = tf.TensorArray(tf.float64,
                                  size=0,
                                  dynamic_size=True,
                                  clear_after_read=False)
        dKp_dmatch = tf.TensorArray(tf.float64,
                                    size=0,
                                    dynamic_size=True,
                                    clear_after_read=False)
        # level 0: Kp is all ones, so both of its derivatives are zero
        Kp = Kp.write(
            Kp.size(),
            tf.ones(shape=tf.stack([tf.shape(X1)[0], self.maxlen,
                                    self.maxlen]),
                    dtype=tf.float64))
        dKp_dgap = dKp_dgap.write(
            dKp_dgap.size(),
            tf.zeros(shape=tf.stack(
                [tf.shape(X1)[0], self.maxlen, self.maxlen]),
                     dtype=tf.float64))
        dKp_dmatch = dKp_dmatch.write(
            dKp_dmatch.size(),
            tf.zeros(shape=tf.stack(
                [tf.shape(X1)[0], self.maxlen, self.maxlen]),
                     dtype=tf.float64))

        # calc subkernels for each subsequence length
        for i in tf.range(0, self.max_subsequence_length - 1):

            # next level: Kp_{i+1} = D^T (match_sq * S * Kp_i) D
            Kp_temp = tf.multiply(S, Kp.read(i))
            Kp_temp0 = match_sq * Kp_temp
            Kp_temp1 = tf.matmul(Kp_temp0, self.D)
            Kp_temp2 = tf.matmul(self.D, Kp_temp1, transpose_a=True)
            Kp = Kp.write(Kp.size(), Kp_temp2)

            # gap derivative by the product rule:
            # term 1 differentiates the left D^T factor
            dKp_dgap_temp_1 = tf.matmul(self.dD_dgap,
                                        Kp_temp1,
                                        transpose_a=True)
            # term 2 differentiates the previous level and the right D factor
            dKp_dgap_temp_2 = tf.multiply(S, dKp_dgap.read(i))
            dKp_dgap_temp_2 = dKp_dgap_temp_2 * match_sq
            dKp_dgap_temp_2 = tf.matmul(dKp_dgap_temp_2, self.D)
            dKp_dgap_temp_2 = dKp_dgap_temp_2 + tf.matmul(
                Kp_temp0, self.dD_dgap)
            dKp_dgap_temp_2 = tf.matmul(self.D,
                                        dKp_dgap_temp_2,
                                        transpose_a=True)
            dKp_dgap = dKp_dgap.write(dKp_dgap.size(),
                                      dKp_dgap_temp_1 + dKp_dgap_temp_2)

            # match derivative: term 1 comes from the explicit match_sq factor,
            # term 2 propagates the derivative of the previous level
            dKp_dmatch_temp_1 = 2 * tf.divide(Kp_temp2, self.match_decay)
            dKp_dmatch_temp_2 = tf.multiply(S, dKp_dmatch.read(i))
            dKp_dmatch_temp_2 = dKp_dmatch_temp_2 * match_sq
            dKp_dmatch_temp_2 = tf.matmul(dKp_dmatch_temp_2, self.D)
            dKp_dmatch_temp_2 = tf.matmul(self.D,
                                          dKp_dmatch_temp_2,
                                          transpose_a=True)
            dKp_dmatch = dKp_dmatch.write(
                dKp_dmatch.size(), dKp_dmatch_temp_1 + dKp_dmatch_temp_2)

        # Final calculation. We gather all Kps (one leading entry per level)
        Kp_stacked = Kp.stack()
        Kp.close()
        dKp_dgap_stacked = dKp_dgap.stack()
        dKp_dgap.close()
        dKp_dmatch_stacked = dKp_dmatch.stack()
        dKp_dmatch.close()

        # get k: sum S * Kp over both character axes, scale by match_sq,
        # then sum over the levels
        temp = tf.multiply(S, Kp_stacked)
        temp = tf.reduce_sum(temp, -1)
        sum2 = tf.reduce_sum(temp, -1)
        Ki = sum2 * match_sq
        k = tf.reduce_sum(Ki, 0)
        k = tf.expand_dims(k, 1)

        # get gap decay grads (same reduction applied to dKp_dgap)
        temp = tf.multiply(S, dKp_dgap_stacked)
        temp = tf.reduce_sum(temp, -1)
        temp = tf.reduce_sum(temp, -1)
        temp = temp * match_sq
        dk_dgap = tf.reduce_sum(temp, 0)
        dk_dgap = tf.expand_dims(dk_dgap, 1)

        # get match decay grads
        temp = tf.multiply(S, dKp_dmatch_stacked)
        temp = tf.reduce_sum(temp, -1)
        temp = tf.reduce_sum(temp, -1)
        temp = temp * match_sq
        # extra term from differentiating the match_sq factor in Ki
        temp = temp + 2 * self.match_decay * sum2
        dk_dmatch = tf.reduce_sum(temp, 0)
        dk_dmatch = tf.expand_dims(dk_dmatch, 1)

        return k, dk_dgap, dk_dmatch

    def _k(self, X1, X2):
        r"""
        Batched string-kernel evaluation without gradients.

        Uses the S, D, Kp notation of Beck (2017). X1 and X2 are integer-encoded
        strings of shape (# strings, # characters); row n of X1 is compared with
        row n of X2 only, so n kernel values come out for two inputs of n rows.
        D (set on self by _precalc) unrolls the recursion so that it can be
        vectorized.

        Returns the kernel values with a trailing axis of size one added.
        """
        n_symbols = len(self.alphabet)

        # one-hot encode, giving shape (# strings, # characters, alphabet size + 1)
        hot_1 = tf.one_hot(X1, n_symbols + 1, dtype=tf.float64)
        hot_2 = tf.one_hot(X2, n_symbols + 1, dtype=tf.float64)
        # blank out channel 0, which encodes the padding, so that padding never
        # counts as a match
        widen = tf.constant([[0, 0], [0, 0], [0, n_symbols]])
        hot_1 = hot_1 - tf.pad(
            tf.expand_dims(hot_1[:, :, 0], 2), widen, "CONSTANT")
        hot_2 = hot_2 - tf.pad(
            tf.expand_dims(hot_2[:, :, 0], 2), widen, "CONSTANT")

        decay_sq = tf.square(self.match_decay)

        # S: similarity tensor of shape (# strings, # characters, # characters)
        S = tf.matmul(hot_1, tf.transpose(hot_2, perm=(0, 2, 1)))

        # levels[i] is Kp at recursion level i; level 0 is all ones
        levels = tf.TensorArray(tf.float64,
                                size=0,
                                dynamic_size=True,
                                clear_after_read=False)
        level_shape = tf.stack(
            [tf.shape(hot_1)[0], self.maxlen, self.maxlen])
        levels = levels.write(levels.size(),
                              tf.ones(shape=level_shape, dtype=tf.float64))

        # one further level per additional subsequence length
        for depth in tf.range(0, self.max_subsequence_length - 1):
            matched = tf.multiply(S, levels.read(depth))
            spread = tf.matmul(self.D,
                               tf.matmul(matched, self.D),
                               transpose_a=True)
            levels = levels.write(levels.size(), decay_sq * spread)

        # gather every level
        stacked = levels.stack()
        levels.close()

        # sum over both character axes, scale, then sum over the levels
        per_level = tf.reduce_sum(
            tf.reduce_sum(tf.multiply(S, stacked), -1), -1)
        totals = tf.reduce_sum(tf.multiply(per_level, decay_sq), 0)
        return tf.expand_dims(totals, 1)
 def __init__(self, state_dim, W):
     """Store the state dimension and keep ``W`` as a non-trainable column parameter."""
     self.state_dim = state_dim
     # W is reshaped to (state_dim, 1) before being wrapped
     column = np.reshape(W, (state_dim, 1))
     self.W = Parameter(column, trainable=False)
Beispiel #21
0
class Batch_simple_SSK(Kernel):
    """
    String kernel over whitespace-separated token sequences, evaluated in
    batches of string pairs. The notation follows Beck (2017) and
    Moss et al. (2020), as cited in the method comments.

    Hyperparameters:
    1) decay float
        decrease the contribution of long subsequences
        (named ``match_sq`` once squared inside the recursions)
    2) max_subsequence_length int
        largest subsequence considered
    """

    def __init__(self, active_dims=(0,), decay=0.1, max_subsequence_length=3,
                 alphabet=(), maxlen=0, batch_size=100):
        """
        :param active_dims: forwarded to the ``Kernel`` base class. A tuple
            (including the default) is converted to a list first, so the
            base class receives ``[0]`` by default.
        :param decay: initial value of the decay parameter. It is stored
            behind a sigmoid transform, i.e. constrained to lie in (0, 1).
        :param max_subsequence_length: largest subsequence length considered.
        :param alphabet: the tokens that may appear in the input strings.
            Tokens that are not listed are encoded with index 0, the same
            index as the ``"PAD"`` padding token.
        :param maxlen: the length every tokenised string is padded to.
        :param batch_size: number of string pairs evaluated per batch in
            :meth:`K`.
        """
        # The signature uses immutable defaults so that no default object is
        # shared between instances; convert back to the list forms that the
        # rest of the constructor (and the base class) have always received.
        if isinstance(active_dims, tuple):
            active_dims = list(active_dims)
        alphabet = list(alphabet)

        super().__init__(active_dims=active_dims)
        # constrain decay kernel params to between 0 and 1
        self.logistic = tfb.Chain([tfb.Shift(tf.cast(0, tf.float64))(tfb.Scale(tf.cast(1, tf.float64))), tfb.Sigmoid()])
        self.decay_param = Parameter(decay, transform=self.logistic, name="decay")

        # We use copies of the kernel params to stop building an expensive computation graph;
        # gradients are instead calculated efficiently using dynamic programming.
        # These copies are refreshed by _precalc, which is run at every call to K.
        self.decay = self.decay_param.numpy()

        self.decay_unconstrained = self.decay_param.unconstrained_variable.numpy()

        # weight given to each subsequence length when summing the sub-kernels (all ones)
        self.order_coefs = tf.ones(max_subsequence_length, dtype=tf.float64)

        # store additional kernel parameters
        self.max_subsequence_length = tf.constant(max_subsequence_length)
        self.alphabet = tf.constant(alphabet)
        self.alphabet_size = tf.shape(self.alphabet)[0]
        self.maxlen = tf.constant(maxlen)
        self.batch_size = tf.constant(batch_size)

        # build a lookup table of the alphabet to encode input strings
        # ("PAD" -> 0, alphabet tokens -> 1..len(alphabet), anything else -> 0)
        self.table = tf.lookup.StaticHashTable(
            initializer=tf.lookup.KeyValueTensorInitializer(
                keys=tf.constant(["PAD"] + alphabet),
                values=tf.constant(range(0, len(alphabet) + 1)),), default_value=0)

        # helpful construction matrices, filled in by _precalc
        self.D = None
        self.dD_dgap = None

    def K_diag(self, X):
        r"""
        The diagonal elements of the string kernel are always unity (due to normalisation).

        :param X: input tensor; only its shape is used.
        :returns: float64 tensor of ones with the shape of ``X`` minus its last axis.
        """
        return tf.ones(tf.shape(X)[:-1], dtype=tf.float64)

    def K(self, X1, X2=None):
        r"""
        Vectorized kernel calculation.
        Following notation from Beck (2017), i.e. we have tensors S, D, Kpp, Kp.
        D is the tensor that unrolls the recursion and allows vectorization.

        :param X1: string tensor whose axis 1 has size one (it is squeezed
            away); every entry is split on whitespace into tokens.
        :param X2: optional second string tensor with the same layout. When
            omitted, the symmetric gram matrix of ``X1`` is computed.
        :returns: the normalised gram matrix, with one row per string in
            ``X1`` and one column per string in ``X2`` (or ``X1``). Each
            entry is divided by the square root of the product of the two
            strings' self-similarities.

        Side effects: sets ``self.symmetric`` and refreshes ``self.decay``,
        ``self.decay_unconstrained``, ``self.D`` and ``self.dD_dgap``.
        """

        # Turn our inputs into lists of integers using one-hot embedding:
        # first split up strings and pad until all have length self.maxlen,
        # then turn into one-hot, i.e. shape (# strings, maxlen, alphabet size + 1)
        X1 = tf.strings.split(tf.squeeze(X1, 1)).to_tensor("PAD", shape=[None, self.maxlen])
        X1 = self.table.lookup(X1)
        # keep track of original input sizes
        X1_shape = tf.shape(X1)[0]
        X1 = tf.one_hot(X1, self.alphabet_size + 1, dtype=tf.float64)
        if X2 is None:
            X2 = X1
            X2_shape = X1_shape
            self.symmetric = True
        else:
            self.symmetric = False
            X2 = tf.strings.split(tf.squeeze(X2, 1)).to_tensor("PAD", shape=[None, self.maxlen])
            X2 = self.table.lookup(X2)
            X2_shape = tf.shape(X2)[0]
            X2 = tf.one_hot(X2, self.alphabet_size + 1, dtype=tf.float64)

        # prep the decay tensors
        self._precalc()

        # combine all target strings and remove the ones in the first column that encode
        # the padding (i.e. we don't want them to count as a match)
        X_full = tf.concat([X1, X2], 0)[:, :, 1:]

        # get indices of all possible pairings from X and X2
        # this way allows the maximum number of kernel calcs to be squished onto the GPU
        # (rather than just doing individual rows of the gram matrix)
        indicies_2, indicies_1 = tf.meshgrid(tf.range(0, X1_shape), tf.range(X1_shape, tf.shape(X_full)[0]))
        indicies = tf.concat([tf.reshape(indicies_1, (-1, 1)), tf.reshape(indicies_2, (-1, 1))], axis=1)
        if self.symmetric:
            # if symmetric then only calc upper matrix (fill in rest later)
            indicies = tf.boolean_mask(indicies, tf.greater_equal(indicies[:, 1] + X1_shape, indicies[:, 0]))
        else:
            # if not symmetric we need some extra kernel evals (every string paired
            # with itself) for the normalization later on
            indicies = tf.concat([indicies, tf.tile(tf.expand_dims(tf.range(tf.shape(X_full)[0]), 1), (1, 2))], 0)

        # make kernel calcs in batches
        num_batches = tf.cast(tf.math.ceil(tf.shape(indicies)[0] / self.batch_size), dtype=tf.int32)
        k_split = tf.TensorArray(tf.float64, size=num_batches, clear_after_read=False, infer_shape=False)

        # iterate through batches
        for j in tf.range(num_batches):
            # collect strings for this batch
            indicies_batch = indicies[self.batch_size * j:self.batch_size * (j + 1)]
            X_batch = tf.gather(X_full, indicies_batch[:, 0], axis=0)
            X2_batch = tf.gather(X_full, indicies_batch[:, 1], axis=0)

            # Make S: the similarity tensor of shape (# strings, # characters, # characters)
            S = tf.matmul(X_batch, tf.transpose(X2_batch, perm=(0, 2, 1)))
            # collect results for the batch
            result = self.kernel_calc(S)
            k_split = k_split.write(j, result)

        # combine batch results
        k = tf.expand_dims(k_split.concat(), 1)
        k_split.close()

        # put results into the right places in the gram matrix and normalize
        if self.symmetric:
            # if symmetric then only put in top triangle (inc diag)
            mask = tf.linalg.band_part(tf.ones((X1_shape, X2_shape), dtype=tf.int64), 0, -1)
            non_zero = tf.not_equal(mask, tf.constant(0, dtype=tf.int64))

            # Extracting the indices of upper triangle elements
            indices = tf.where(non_zero)
            out = tf.SparseTensor(indices, tf.squeeze(k), dense_shape=tf.cast((X1_shape, X2_shape), dtype=tf.int64))
            k_results = tf.sparse.to_dense(out)

            # add in missing elements (lower triangle); the diagonal of the
            # transpose is zeroed so it is not counted twice
            k_results = k_results + tf.linalg.set_diag(tf.transpose(k_results), tf.zeros(X1_shape, dtype=tf.float64))

            # normalise
            X_diag_Ks = tf.linalg.diag_part(k_results)
            norm = tf.tensordot(X_diag_Ks, X_diag_Ks, axes=0)
            k_results = tf.divide(k_results, tf.sqrt(norm))
        else:

            # otherwise can just reshape into gram matrix
            # but first take extra kernel calcs off end of k and use them to normalise
            X_diag_Ks = tf.reshape(k[X1_shape * X2_shape:X1_shape * X2_shape + X1_shape], (-1,))
            X2_diag_Ks = tf.reshape(k[-X2_shape:], (-1,))
            k = k[0:X1_shape * X2_shape]
            k_results = tf.transpose(tf.reshape(k, [X2_shape, X1_shape]))
            # normalise
            norm = tf.tensordot(X_diag_Ks, X2_diag_Ks, axes=0)
            k_results = tf.divide(k_results, tf.sqrt(norm))

        return k_results

    def _precalc(self):
        r"""
        Update stored kernel params (in case they have changed)
        and precalc D and dD_dgap as required for kernel calcs,
        following notation from Beck (2017).

        Both matrices are ``maxlen x maxlen`` and are rebuilt from the
        current value of the decay parameter on every call.
        """
        self.decay = self.decay_param.numpy()
        self.decay_unconstrained = self.decay_param.unconstrained_variable.numpy()

        tril = tf.linalg.band_part(tf.ones((self.maxlen, self.maxlen), dtype=tf.float64), -1, 0)
        # get upper triangle matrix of increasing integers
        values = tf.TensorArray(tf.int32, size=self.maxlen)
        for i in tf.range(self.maxlen):
            values = values.write(i, tf.range(-i - 1, self.maxlen - 1 - i))
        power = tf.cast(values.stack(), tf.float64)
        values.close()
        power = tf.linalg.band_part(power, 0, -1) - tf.linalg.band_part(power, 0, 0) + tril
        # from here on `tril` holds the strictly-upper-triangular mask of ones
        tril = tf.transpose(tf.linalg.band_part(tf.ones((self.maxlen, self.maxlen), dtype=tf.float64), -1, 0)) - tf.eye(self.maxlen, dtype=tf.float64)
        gaps = tf.fill([self.maxlen, self.maxlen], self.decay)

        self.D = tf.pow(gaps * tril, power)
        self.dD_dgap = tf.pow((tril * gaps), (power - 1.0)) * tril * power

    @tf.custom_gradient
    def kernel_calc(self, S):
        """
        Evaluate the un-normalised kernel for one batch of similarity tensors,
        supplying a hand-written gradient instead of the automatic one.

        Requires ``self.symmetric`` to have been set (done by :meth:`K`).

        :param S: similarity tensor for the batch, as built in :meth:`K`.
        :returns: ``(k, grad)`` as required by ``tf.custom_gradient``. In the
            symmetric case ``grad`` yields the gradient for the variable
            named ``'decay:0'``; otherwise it yields ``None`` for every
            variable. No gradient is propagated to ``S``.
        """

        # fake computation touching the parameter, so that the custom gradient
        # below is the one taken for it (the value itself is never used)
        a = tf.square(self.decay_param)

        if self.symmetric:
            k, dk_dgap = tf.stop_gradient(self.kernel_calc_with_grads(S))
        else:
            k = tf.stop_gradient(self.kernel_calc_without_grads(S))

        def grad(dy, variables=None):
            # get gradients of unconstrained params
            grads = {}
            if self.symmetric:
                # chain rule through the sigmoid transform: scale by exp(log|d constrained / d unconstrained|)
                grads['decay:0'] = tf.reduce_sum(tf.multiply(dy, dk_dgap * tf.math.exp(self.logistic.forward_log_det_jacobian(self.decay_unconstrained, 0))))
                gradient = [grads[v.name] for v in variables]
            else:
                gradient = [None for v in variables]
            return ((None), gradient)

        return k, grad

    def kernel_calc_without_grads(self, S):
        """
        Dynamic program computing the un-normalised kernel values only.

        :param S: similarity tensor for the batch, as built in :meth:`K`.
        :returns: one kernel value per entry along the first axis of ``S``.
        """

        # store squared match coef for easier calc later
        match_sq = tf.square(self.decay)

        # calc subkernels for each subsequence length (See Moss et al. 2020 for notation)
        Kp = tf.TensorArray(tf.float64, size=self.max_subsequence_length, clear_after_read=False)

        # fill in first entries
        Kp = Kp.write(0, tf.ones(shape=tf.stack([tf.shape(S)[0], self.maxlen, self.maxlen]), dtype=tf.float64))

        # calculate dynamic programs
        for i in tf.range(self.max_subsequence_length - 1):
            Kp_temp = tf.multiply(S, Kp.read(i))
            Kp_temp0 = match_sq * Kp_temp
            Kp_temp1 = tf.matmul(Kp_temp0, self.D)
            Kp_temp2 = tf.matmul(self.D, Kp_temp1, transpose_a=True)
            Kp = Kp.write(i + 1, Kp_temp2)

        # Final calculation. We gather all Kps
        Kp_stacked = Kp.stack()
        Kp.close()

        # combine and get overall kernel: sum over both character axes, then
        # take the order_coefs-weighted sum over subsequence lengths
        aux = tf.multiply(S, Kp_stacked)
        aux = tf.reduce_sum(aux, -1)
        sum2 = tf.reduce_sum(aux, -1)
        Ki = sum2 * match_sq
        k = tf.linalg.matvec(tf.transpose(Ki), self.order_coefs)

        return k

    def kernel_calc_with_grads(self, S):
        """
        Dynamic program computing the un-normalised kernel values together
        with a second recursion, built from ``self.dD_dgap``, that
        :meth:`kernel_calc` uses as the gradient with respect to the decay.

        :param S: similarity tensor for the batch, as built in :meth:`K`.
        :returns: ``(k, dk_dgap)``, each with one value per entry along the
            first axis of ``S``.
        """
        # store squared match coef for easier calc later
        match_sq = tf.square(self.decay)
        # calc subkernels for each subsequence length (See Moss et al. 2020 for notation)
        Kp = tf.TensorArray(tf.float64, size=self.max_subsequence_length, clear_after_read=False)
        dKp_dgap = tf.TensorArray(tf.float64, size=self.max_subsequence_length, clear_after_read=False)

        # fill in first entries
        Kp = Kp.write(0, tf.ones(shape=tf.stack([tf.shape(S)[0], self.maxlen, self.maxlen]), dtype=tf.float64))
        dKp_dgap = dKp_dgap.write(0, tf.zeros(shape=tf.stack([tf.shape(S)[0], self.maxlen, self.maxlen]), dtype=tf.float64))

        # calculate dynamic programs
        for i in tf.range(self.max_subsequence_length - 1):
            Kp_temp = tf.multiply(S, Kp.read(i))
            Kp_temp0 = match_sq * Kp_temp
            Kp_temp1 = tf.matmul(Kp_temp0, self.D)
            Kp_temp2 = tf.matmul(self.D, Kp_temp1, transpose_a=True)
            Kp = Kp.write(i + 1, Kp_temp2)

            # product rule applied to D^T (match_sq * S * Kp) D
            dKp_dgap_temp_1 = tf.matmul(self.dD_dgap, Kp_temp1, transpose_a=True)
            dKp_dgap_temp_2 = tf.multiply(S, dKp_dgap.read(i))
            dKp_dgap_temp_2 = dKp_dgap_temp_2 * match_sq
            dKp_dgap_temp_2 = tf.matmul(dKp_dgap_temp_2, self.D)
            dKp_dgap_temp_2 = dKp_dgap_temp_2 + tf.matmul(Kp_temp0, self.dD_dgap)
            dKp_dgap_temp_2 = tf.matmul(self.D, dKp_dgap_temp_2, transpose_a=True)
            dKp_dgap = dKp_dgap.write(i + 1, dKp_dgap_temp_1 + dKp_dgap_temp_2)

        # Final calculation. We gather all Kps
        Kp_stacked = Kp.stack()
        Kp.close()
        dKp_dgap_stacked = dKp_dgap.stack()
        dKp_dgap.close()

        # combine and get overall kernel

        # get k
        aux = tf.multiply(S, Kp_stacked)
        aux = tf.reduce_sum(aux, -1)
        sum2 = tf.reduce_sum(aux, -1)
        Ki = sum2 * match_sq
        k = tf.linalg.matvec(tf.transpose(Ki), self.order_coefs)

        # get gap decay grads
        temp = tf.multiply(S, dKp_dgap_stacked)
        temp = tf.reduce_sum(temp, -1)
        temp = tf.reduce_sum(temp, -1)
        temp = temp * match_sq
        dk_dgap = tf.linalg.matvec(tf.transpose(temp), self.order_coefs)

        return k, dk_dgap
Beispiel #22
0
    def __init__(
        self,
        kernel: MultioutputKernel,
        inducing_variable: MultioutputInducingVariables,
        num_data: int,
        mean_function: Optional[MeanFunction] = None,
        *,
        num_samples: Optional[int] = None,
        full_cov: bool = False,
        full_output_cov: bool = False,
        num_latent_gps: int = None,
        whiten: bool = True,
        name: Optional[str] = None,
        verbose: bool = False,
    ):
        """
        Set up the layer and create its variational parameters
        :attr:`q_mu` (zeros) and :attr:`q_sqrt` (one identity matrix per
        latent GP, constrained to be triangular).

        :param kernel: The multioutput kernel for this layer.
        :param inducing_variable: The inducing features for this layer.
        :param num_data: The number of points in the training dataset
            (see :attr:`num_data`).
        :param mean_function: The mean function that will be applied to the
            inputs. Default: :class:`~gpflow.mean_functions.Identity`.

            .. note:: With the Identity mean function the input and output
                dimensionality of this layer must be equal. To change the
                dimensionality in a layer, consider passing a
                :class:`~gpflow.mean_functions.Linear` mean function instead.

        :param num_samples: How many samples to draw when the
            :class:`~tfp.layers.DistributionLambda` is converted into a
            `tf.Tensor` (see :meth:`_convert_to_tensor_fn`); stored in
            :attr:`num_samples`. With `None` (the default) a single sample
            is drawn without prefixing the sample shape (see
            :class:`tfp.distributions.Distribution`'s `sample()
            <https://www.tensorflow.org/probability/api_docs/python/tfp/distributions/Distribution#sample>`_
            method).
        :param full_cov: Default behaviour when calling this layer
            (:attr:`full_cov` attribute):
            `False` (the default) predicts only marginals (diagonal of the
            covariance) with respect to inputs;
            `True` predicts the full covariance over inputs.
        :param full_output_cov: Default behaviour when calling this layer
            (:attr:`full_output_cov` attribute):
            `False` (the default) predicts only marginals (diagonal of the
            covariance) with respect to outputs;
            `True` predicts the full covariance over outputs.
        :param num_latent_gps: The number of (latent) GPs in the layer
            (which can differ from the number of outputs, e.g. with a
            :class:`~gpflow.kernels.LinearCoregionalization` kernel).
            It determines the size of the variational parameters
            :attr:`q_mu` and :attr:`q_sqrt`. Where possible it is inferred
            from the *kernel* and *inducing_variable*; the explicit value is
            only used when that inference fails.
        :param whiten: If `True` (the default), uses the whitened
            parameterisation of the inducing variables; see :attr:`whiten`.
        :param name: The name of this layer.
        :param verbose: The verbosity mode. Set this parameter to `True`
            to show debug information.
        :raises GPLayerIncompatibilityException: if compatibility cannot be
            verified and no ``num_latent_gps`` was given.
        """

        super().__init__(
            make_distribution_fn=self._make_distribution_fn,
            convert_to_tensor_fn=self._convert_to_tensor_fn,
            dtype=default_float(),
            name=name,
        )

        self.kernel = kernel
        self.inducing_variable = inducing_variable

        self.num_data = num_data

        # Fall back to the identity mean when none is supplied.
        resolved_mean = Identity() if mean_function is None else mean_function
        self.mean_function = resolved_mean

        self.full_output_cov = full_output_cov
        self.full_cov = full_cov
        self.whiten = whiten
        self.verbose = verbose

        try:
            num_inducing, self.num_latent_gps = verify_compatibility(
                kernel, resolved_mean, inducing_variable
            )
            # TODO: if num_latent_gps is not None, verify it is equal to self.num_latent_gps
        except GPLayerIncompatibilityException as e:
            # Without an explicit number of latent GPs there is nothing to
            # fall back on, so the incompatibility is fatal.
            if num_latent_gps is None:
                raise e

            if self.verbose:
                message = (
                    "Could not verify the compatibility of the `kernel`, `inducing_variable` "
                    "and `mean_function`. We advise using `gpflux.helpers.construct_*` to create "
                    "compatible kernels and inducing variables. As "
                    f"`num_latent_gps={num_latent_gps}` has been specified explicitly, this will "
                    "be used to create the `q_mu` and `q_sqrt` parameters."
                )
                warnings.warn(message)

            num_inducing = len(inducing_variable)
            self.num_latent_gps = num_latent_gps

        q_mu_initial = np.zeros((num_inducing, self.num_latent_gps))
        q_mu_name = f"{self.name}_q_mu" if self.name else "q_mu"
        self.q_mu = Parameter(
            q_mu_initial,
            dtype=default_float(),
            name=q_mu_name,
        )  # [num_inducing, num_latent_gps]

        identity_per_gp = [np.eye(num_inducing) for _ in range(self.num_latent_gps)]
        q_sqrt_name = f"{self.name}_q_sqrt" if self.name else "q_sqrt"
        self.q_sqrt = Parameter(
            np.stack(identity_per_gp),
            transform=triangular(),
            dtype=default_float(),
            name=q_sqrt_name,
        )  # [num_latent_gps, num_inducing, num_inducing]

        self.num_samples = num_samples
Beispiel #23
0
    def __init__(
        self,
        inducing_variable: gpflow.inducing_variables.InducingVariables,
        kernel: gpflow.kernels.Kernel,
        domain: np.ndarray,
        q_mu: np.ndarray,
        q_S: np.ndarray,
        *,
        beta0: float = 1e-6,
        num_observations: int = 1,
        num_events: Optional[int] = None,
    ):
        """
        Notation: D = number of dimensions, M = size of the inducing
        variables (number of inducing points).

        :param inducing_variable: inducing variables (only implemented for a
            gpflow.inducing_variables.InducingPoints instance, with Z of
            shape M x D)
        :param kernel: the kernel (only implemented for a
            gpflow.kernels.SquaredExponential instance)
        :param domain: lower and upper bounds of the (hyper-rectangular)
            domain (D x 2)

        :param q_mu: initial mean vector of the variational distribution
            q(u) (length M)
        :param q_S: initial covariance matrix of the variational
            distribution q(u) (M x M); its Cholesky factor is what gets
            stored, in ``q_sqrt``

        :param beta0: a constant offset, corresponding to the initial value
            of the prior mean of the GP (but trainable); should be
            sufficiently large so that the GP does not go negative...

        :param num_observations: number of observations of sets of events
            under the distribution

        :param num_events: total number of events, defaults to
            events.shape[0] (relevant when feeding in minibatches)

        :raises ValueError: if ``domain`` is not of shape D x 2.
        :raises NotImplementedError: for any combination other than a
            SquaredExponential kernel with InducingPoints.
        """
        super().__init__(kernel, likelihood=None)  # custom likelihood

        # observation domain  (D x 2)
        self.domain = domain
        domain_is_d_by_2 = domain.ndim == 2 and domain.shape[1] == 2
        if not domain_is_d_by_2:
            raise ValueError("domain must be of shape D x 2")

        self.num_observations = num_observations
        self.num_events = num_events

        supported = isinstance(
            kernel, gpflow.kernels.SquaredExponential
        ) and isinstance(inducing_variable, gpflow.inducing_variables.InducingPoints)
        if not supported:
            raise NotImplementedError(
                "This VBPP implementation can only handle real-space "
                "inducing points together with the SquaredExponential "
                "kernel.")
        self.kernel = kernel
        self.inducing_variable = inducing_variable

        # constant mean offset, kept positive
        self.beta0 = Parameter(beta0, transform=positive(), name="beta0")

        # variational approximate Gaussian posterior q(u) = N(u; m, S)
        # mean vector  (length M)
        self.q_mu = Parameter(q_mu, name="q_mu")

        # covariance: S = L L^T, with L lower-triangular  (M x M)
        cholesky_factor = np.linalg.cholesky(q_S)
        self.q_sqrt = Parameter(
            cholesky_factor, transform=triangular(), name="q_sqrt"
        )

        self.psi_jitter = 0.0
Beispiel #24
0
    def __init__(
            self,
            input_dim: int,
            output_dim: int,
            num_data: int,
            w_mu: Optional[np.ndarray] = None,
            w_sqrt: Optional[np.ndarray] = None,
            activation: Optional[Callable] = None,
            is_mean_field: bool = True,
            temperature: float = 1e-4,  # TODO is this intentional?
    ):
        """
        :param input_dim: The input dimension of this layer, not counting
            the bias.
        :param output_dim: The output dimension of this layer.
        :param num_data: The number of points in the training dataset (used
            for scaling the KL regulariser).
        :param w_mu: Initial value of the variational mean for weights +
            bias. When omitted, `xavier_initialization_numpy` is used for
            the weights and zero for the bias.
        :param w_sqrt: Initial value of the variational Cholesky of the
            (co)variance for weights + bias. When omitted, this defaults to
            1e-5 * Identity.
        :param activation: The activation function; the identity when
            omitted.
        :param is_mean_field: Whether the approximation to the weight
            posterior is mean field. Must agree with the shape of
            ``w_sqrt``, if that is given.
        :param temperature: For cooling (< 1.0) or heating (> 1.0) the
            posterior.
        """

        super().__init__(dtype=default_float())

        assert input_dim >= 1
        assert output_dim >= 1
        assert num_data >= 1

        # weights plus one bias row, flattened
        flat_dim = (input_dim + 1) * output_dim
        if w_mu is not None:
            assert w_mu.shape == (flat_dim, )
        if w_sqrt is not None:
            if is_mean_field:
                assert w_sqrt.shape == (flat_dim, )
            else:
                assert w_sqrt.shape == (flat_dim, flat_dim)
        assert temperature > 0.0

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_data = num_data

        # The user-supplied initial values are only applied later; the
        # parameters below start from placeholder values.
        self.w_mu_ini = w_mu
        self.w_sqrt_ini = w_sqrt

        self.activation = activation
        self.is_mean_field = is_mean_field
        self.temperature = temperature

        self.dim = (input_dim + 1) * output_dim
        self.full_output_cov = False
        self.full_cov = False

        mean_placeholder = np.zeros((self.dim, ))
        self.w_mu = Parameter(
            mean_placeholder, dtype=default_float(), name="w_mu"
        )  # [dim]

        if self.is_mean_field:
            sqrt_placeholder = np.ones((self.dim, ))
            sqrt_transform = positive()
        else:
            sqrt_placeholder = np.zeros((self.dim, self.dim))
            sqrt_transform = triangular()
        self.w_sqrt = Parameter(
            sqrt_placeholder,
            transform=sqrt_transform,
            dtype=default_float(),
            name="w_sqrt",
        )  # [dim, dim] or [dim]
Beispiel #25
0
class BayesianDenseLayer(TrackableLayer):
    """
    Fully-connected (dense) layer for variational Bayesian neural networks.

    The layer stores the mean and the square root of the variance of the
    distribution over its weights, together with a temperature used for
    cooling (or heating) the posterior.
    """
    def __init__(
            self,
            input_dim: int,
            output_dim: int,
            num_data: int,
            w_mu: Optional[np.ndarray] = None,
            w_sqrt: Optional[np.ndarray] = None,
            activation: Optional[Callable] = None,
            is_mean_field: bool = True,
            temperature: float = 1e-4,  # TODO is this intentional?
    ):
        """
        :param input_dim: The input dimension of this layer, not counting
            the bias.
        :param output_dim: The output dimension of this layer.
        :param num_data: The number of points in the training dataset (used
            for scaling the KL regulariser).
        :param w_mu: Initial value of the variational mean for weights +
            bias. When omitted, `xavier_initialization_numpy` is used for
            the weights and zero for the bias.
        :param w_sqrt: Initial value of the variational Cholesky of the
            (co)variance for weights + bias. When omitted, this defaults to
            1e-5 * Identity.
        :param activation: The activation function; the identity when
            omitted.
        :param is_mean_field: Whether the approximation to the weight
            posterior is mean field. Must agree with the shape of
            ``w_sqrt``, if that is given.
        :param temperature: For cooling (< 1.0) or heating (> 1.0) the
            posterior.
        """

        super().__init__(dtype=default_float())

        assert input_dim >= 1
        assert output_dim >= 1
        assert num_data >= 1

        # weights plus one bias row, flattened
        flat_dim = (input_dim + 1) * output_dim
        if w_mu is not None:
            assert w_mu.shape == (flat_dim, )
        if w_sqrt is not None:
            if is_mean_field:
                assert w_sqrt.shape == (flat_dim, )
            else:
                assert w_sqrt.shape == (flat_dim, flat_dim)
        assert temperature > 0.0

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_data = num_data

        # The user-supplied initial values are only applied in
        # initialize_variational_distribution; the parameters below start
        # from placeholder values.
        self.w_mu_ini = w_mu
        self.w_sqrt_ini = w_sqrt

        self.activation = activation
        self.is_mean_field = is_mean_field
        self.temperature = temperature

        self.dim = (input_dim + 1) * output_dim
        self.full_output_cov = False
        self.full_cov = False

        mean_placeholder = np.zeros((self.dim, ))
        self.w_mu = Parameter(
            mean_placeholder, dtype=default_float(), name="w_mu"
        )  # [dim]

        if self.is_mean_field:
            sqrt_placeholder = np.ones((self.dim, ))
            sqrt_transform = positive()
        else:
            sqrt_placeholder = np.zeros((self.dim, self.dim))
            sqrt_transform = triangular()
        self.w_sqrt = Parameter(
            sqrt_placeholder,
            transform=sqrt_transform,
            dtype=default_float(),
            name="w_sqrt",
        )  # [dim, dim] or [dim]

    def initialize_variational_distribution(self) -> None:
        """
        Assign the initial values to ``w_mu`` and ``w_sqrt``.

        Values passed to the constructor are used when available; otherwise
        defaults are generated, cached on the ``*_ini`` attributes and then
        assigned.
        """
        if self.w_mu_ini is None:
            weights = xavier_initialization_numpy(self.input_dim, self.output_dim)
            bias = np.zeros((1, self.output_dim))
            stacked = np.concatenate((weights, bias), axis=0)
            self.w_mu_ini = stacked.reshape((self.dim, ))
        self.w_mu.assign(self.w_mu_ini)

        if self.w_sqrt_ini is None:
            if self.is_mean_field:
                self.w_sqrt_ini = 1e-5 * np.ones((self.dim, ))
            else:
                self.w_sqrt_ini = 1e-5 * np.eye(self.dim)
        self.w_sqrt.assign(self.w_sqrt_ini)

    def build(self, input_shape: ShapeType) -> None:
        """Build the parent layer, then initialise the variational parameters."""
        super().build(input_shape)
        self.initialize_variational_distribution()

    def predict_samples(
        self,
        inputs: TensorType,
        *,
        num_samples: Optional[int] = None,
    ) -> tf.Tensor:
        """
        Draw samples from the approximate posterior at N test inputs, where
        input_dim = D and output_dim = Q.

        :param inputs: The inputs to predict at; shape ``[N, D]``.
        :param num_samples: The number of samples S, to draw.
        :returns: Samples, shape ``[S, N, Q]`` if S is not None else ``[N, Q]``.
        """
        sample_count = num_samples or 1
        noise = tf.random.normal(
            (self.dim, sample_count), dtype=default_float()
        )  # [dim, S]
        mean_column = self.w_mu[:, None]
        if self.is_mean_field:
            weights = mean_column + self.w_sqrt[:, None] * noise  # [dim, S]
        else:
            weights = mean_column + tf.matmul(self.w_sqrt, noise)  # [dim, S]

        # Append a column of ones so the bias row of the weights is applied.
        num_inputs = tf.shape(inputs)[0]
        ones_column = tf.ones((num_inputs, 1), dtype=default_float())
        inputs_with_bias = tf.concat((inputs, ones_column), axis=-1)  # [N, D+1]

        weights_per_sample = tf.reshape(
            tf.transpose(weights),
            (sample_count, self.input_dim + 1, self.output_dim),
        )
        samples = tf.tensordot(
            inputs_with_bias, weights_per_sample, [[-1], [1]]
        )  # [N, S, Q]

        if num_samples is not None:
            samples = tf.transpose(samples, perm=[1, 0, 2])  # [S, N, Q]
        else:
            samples = tf.squeeze(samples, axis=-2)  # [N, Q]

        if self.activation is None:
            return samples
        return self.activation(samples)

    def call(
            self,
            inputs: TensorType,
            training: Optional[bool] = False
    ) -> Union[tf.Tensor, MeanAndVariance]:
        """
        The default behaviour upon calling this layer: return one sample
        and register the per-datapoint KL loss (zero unless training).
        """
        sample = self.predict_samples(inputs, num_samples=None)

        # TF quirk: add_loss must add a tensor to compile
        if not training:
            loss = tf.constant(0.0, dtype=default_float())
        else:
            # Wenzel et al. 2020: How good is the Bayes posterior in DNNs really?
            loss = self.temperature * self.prior_kl()

        self.add_loss(loss / self.num_data)

        return sample  # [N, Q]

    def prior_kl(self) -> tf.Tensor:
        """
        Returns the KL divergence ``KL[q(u)∥p(u)]`` from the prior ``p(u) = N(0, I)`` to
        the variational distribution ``q(u) = N(w_mu, w_sqrt²)``.
        """
        q_mu = self.w_mu[:, None]
        if self.is_mean_field:
            q_sqrt = self.w_sqrt[:, None]
        else:
            q_sqrt = self.w_sqrt[None]
        return gauss_kl(q_mu, q_sqrt)
def mu():
    """Return a ``Parameter`` initialised from an ``rng.randn(Nn, Ln)`` draw."""
    initial_value = rng.randn(Nn, Ln)
    return Parameter(initial_value)
Beispiel #27
0
 def __init__(self):
     """Initialise the parent, then attach a ``tf.Variable`` and a ``Parameter``, both 0.0."""
     super().__init__()
     initial_value = 0.0
     self.var = tf.Variable(initial_value)
     self.param = Parameter(initial_value)
 def __init__(self, state_dim, control_dim, max_action=1.0):
     """
     Create the parameters ``W`` and ``b`` from ``np.random.rand`` draws.

     :param state_dim: Number of columns of ``W``.
     :param control_dim: Number of rows of ``W`` and number of columns of ``b``.
     :param max_action: Stored unchanged on ``self.max_action``.
     """
     gpflow.Module.__init__(self)

     # Draw W before b so the sequence of random numbers consumed stays the same.
     weight_shape = (control_dim, state_dim)
     initial_weights = np.random.rand(*weight_shape)
     self.W = Parameter(initial_weights)

     bias_shape = (1, control_dim)
     initial_bias = np.random.rand(*bias_shape)
     self.b = Parameter(initial_bias)

     self.max_action = max_action
Beispiel #29
0
    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        num_data: int,
        w_mu: Optional[np.ndarray] = None,
        w_sqrt: Optional[np.ndarray] = None,
        activation: Optional[Callable] = None,
        is_mean_field: bool = True,
        temperature: float = 1e-4,
        returns_samples: bool = True,
    ):
        """
        Set up a Bayesian dense layer for variational Bayesian neural networks.

        The layer owns a variational mean ``w_mu`` and square root ``w_sqrt`` over the
        flattened weights and bias, plus a temperature for cooling (or heating) the
        posterior. The two parameters are created here with zeros / ones; the optional
        ``w_mu`` and ``w_sqrt`` arguments are only shape-checked and kept on
        ``w_mu_ini`` / ``w_sqrt_ini``.

        :param input_dim: The layer's input dimension (excluding bias); must be >= 1.
        :param output_dim: The layer's output dimension; must be >= 1.
        :param num_data: Number of data points; must be >= 1.
        :param w_mu: Initial value of the variational mean (weights + bias), of shape
            ``[(input_dim + 1) * output_dim]``.
        :param w_sqrt: Initial value of the variational Cholesky (covering weights + bias),
            of shape ``[dim]`` in the mean-field case and ``[dim, dim]`` otherwise, where
            ``dim = (input_dim + 1) * output_dim``.
        :param activation: The activation function (``None`` means linear).
        :param is_mean_field: Whether the weight posterior uses a mean-field approximation.
        :param temperature: Positive factor for cooling or heating the posterior.
        :param returns_samples: Flag stored on the layer; intended to select between
            returning samples (``True``) and mean and variance (``False``) on call.
        """

        super().__init__(dtype=default_float())

        assert input_dim >= 1
        assert output_dim >= 1
        assert num_data >= 1

        # One extra input row accounts for the bias.
        num_weights = (input_dim + 1) * output_dim

        if w_mu is not None:
            assert w_mu.shape == (num_weights,)
        if w_sqrt is not None:
            if is_mean_field:
                expected_sqrt_shape = (num_weights,)
            else:
                expected_sqrt_shape = (num_weights, num_weights)
            assert w_sqrt.shape == expected_sqrt_shape
        assert temperature > 0.0

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_data = num_data

        # Keep the user-supplied initial values around; they are not applied here.
        self.w_mu_ini = w_mu
        self.w_sqrt_ini = w_sqrt

        self.activation = activation
        self.is_mean_field = is_mean_field
        self.temperature = temperature
        self.returns_samples = returns_samples

        self.dim = num_weights
        self.full_output_cov = False
        self.full_cov = False

        w_mu_start = np.zeros((self.dim,))
        self.w_mu = Parameter(w_mu_start, dtype=default_float(), name="w_mu")  # [dim]

        # Mean-field keeps a positive vector; otherwise a triangular matrix is used.
        if self.is_mean_field:
            w_sqrt_start = np.ones((self.dim,))
            w_sqrt_transform = positive()
        else:
            w_sqrt_start = np.zeros((self.dim, self.dim))
            w_sqrt_transform = triangular()

        self.w_sqrt = Parameter(
            w_sqrt_start,
            transform=w_sqrt_transform,
            dtype=default_float(),
            name="w_sqrt",
        )  # [dim, dim] or [dim]