def make_kernel_bias_posterior_mvn_diag(kernel_shape,
                                        bias_shape,
                                        dtype=tf.float32,
                                        kernel_initializer=None,
                                        bias_initializer=None):
  """Create learnable posterior for Variational layers with kernel and bias."""
  if kernel_initializer is None:
    kernel_initializer = tf.initializers.glorot_normal()
  if bias_initializer is None:
    bias_initializer = tf.initializers.glorot_normal()
  make_loc = lambda shape, init, name: tf.Variable(  # pylint: disable=g-long-lambda
      init(shape, dtype=dtype),
      name=name + '_loc')
  make_scale = lambda shape, name: TransformedVariable(  # pylint: disable=g-long-lambda
      tf.ones(shape, dtype=dtype),
      Chain([Shift(1e-5), Softplus()]),
      name=name + '_scale')
  return JointDistributionSequential([
      Independent(Normal(loc=make_loc(kernel_shape, kernel_initializer,
                                      'posterior_kernel'),
                         scale=make_scale(kernel_shape, 'posterior_kernel')),
                  reinterpreted_batch_ndims=prefer_static.size(kernel_shape),
                  name='posterior_kernel'),
      Independent(Normal(loc=make_loc(bias_shape, bias_initializer,
                                      'posterior_bias'),
                         scale=make_scale(bias_shape, 'posterior_bias')),
                  reinterpreted_batch_ndims=prefer_static.size(bias_shape),
                  name='posterior_bias'),
  ])
def make_kernel_bias_posterior_mvn_diag(
    kernel_shape,
    bias_shape,
    kernel_initializer=None,
    bias_initializer=None,
    kernel_batch_ndims=0,
    bias_batch_ndims=0,
    dtype=tf.float32,
    kernel_name='posterior_kernel',
    bias_name='posterior_bias'):
  """Create learnable posterior for Variational layers with kernel and bias.

  Args:
    kernel_shape: ...
    bias_shape: ...
    kernel_initializer: ...
      Default value: `None` (i.e., `nn_init_lib.glorot_uniform()`).
    bias_initializer: ...
      Default value: `None` (i.e., `tf.zeros`).
    kernel_batch_ndims: ...
      Default value: `0`.
    bias_batch_ndims: ...
      Default value: `0`.
    dtype: ...
      Default value: `tf.float32`.
    kernel_name: ...
      Default value: `"posterior_kernel"`.
    bias_name: ...
      Default value: `"posterior_bias"`.

  Returns:
    kernel_and_bias_distribution: ...
  """
  if kernel_initializer is None:
    kernel_initializer = nn_init_lib.glorot_uniform()
  if bias_initializer is None:
    bias_initializer = tf.zeros
  make_loc = lambda init_fn, shape, batch_ndims, name: tf.Variable(  # pylint: disable=g-long-lambda
      _try_call_init_fn(init_fn, shape, dtype, batch_ndims),
      name=name + '_loc')
  # Setting the initial scale to a relatively small value causes the `loc` to
  # quickly move toward a lower loss value.
  make_scale = lambda shape, name: TransformedVariable(  # pylint: disable=g-long-lambda
      tf.fill(shape, value=tf.constant(1e-3, dtype=dtype)),
      Chain([Shift(1e-5), Softplus()]),
      name=name + '_scale')
  return JointDistributionSequential([
      Independent(Normal(loc=make_loc(kernel_initializer,
                                      kernel_shape,
                                      kernel_batch_ndims,
                                      kernel_name),
                         scale=make_scale(kernel_shape, kernel_name)),
                  reinterpreted_batch_ndims=prefer_static.size(kernel_shape),
                  name=kernel_name),
      Independent(Normal(loc=make_loc(bias_initializer,
                                      bias_shape,
                                      bias_batch_ndims,
                                      bias_name),
                         scale=make_scale(bias_shape, bias_name)),
                  reinterpreted_batch_ndims=prefer_static.size(bias_shape),
                  name=bias_name),
  ])
def make_kernel_bias_posterior_mvn_diag(kernel_shape,
                                        bias_shape,
                                        dtype=tf.float32,
                                        kernel_initializer=None,
                                        bias_initializer=None,
                                        kernel_name='posterior_kernel',
                                        bias_name='posterior_bias'):
  """Create learnable posterior for Variational layers with kernel and bias.

  Args:
    kernel_shape: ...
    bias_shape: ...
    dtype: ...
      Default value: `tf.float32`.
    kernel_initializer: ...
      Default value: `None` (i.e., `tf.initializers.glorot_uniform()`).
    bias_initializer: ...
      Default value: `None` (i.e., `tf.zeros`).
    kernel_name: ...
      Default value: `"posterior_kernel"`.
    bias_name: ...
      Default value: `"posterior_bias"`.

  Returns:
    kernel_and_bias_distribution: ...
  """
  if kernel_initializer is None:
    kernel_initializer = tf.initializers.glorot_uniform()
  if bias_initializer is None:
    bias_initializer = tf.zeros
  make_loc = lambda shape, init, name: tf.Variable(  # pylint: disable=g-long-lambda
      init(shape, dtype=dtype),
      name=name + '_loc')
  make_scale = lambda shape, name: TransformedVariable(  # pylint: disable=g-long-lambda
      tf.ones(shape, dtype=dtype),
      Chain([Shift(1e-5), Softplus()]),
      name=name + '_scale')
  return JointDistributionSequential([
      Independent(Normal(loc=make_loc(kernel_shape, kernel_initializer,
                                      kernel_name),
                         scale=make_scale(kernel_shape, kernel_name)),
                  reinterpreted_batch_ndims=prefer_static.size(kernel_shape),
                  name=kernel_name),
      Independent(Normal(loc=make_loc(bias_shape, bias_initializer, bias_name),
                         scale=make_scale(bias_shape, bias_name)),
                  reinterpreted_batch_ndims=prefer_static.size(bias_shape),
                  name=bias_name),
  ])
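

# Example (sketch, not part of the original module): minimal usage of the
# posterior factory above, assuming TensorFlow and TensorFlow Probability are
# installed and this module's imports (`tf`, etc.) are in scope. The shapes
# below are made up for illustration.
def _example_posterior_usage():
  posterior = make_kernel_bias_posterior_mvn_diag(
      kernel_shape=[3, 5], bias_shape=[5])
  kernel, bias = posterior.sample()          # one draw of the kernel and bias
  print(kernel.shape, bias.shape)            # (3, 5) (5,)
  print(posterior.log_prob([kernel, bias]))  # scalar joint log-density
  # The `loc` and (pre-transformed) `scale` variables are trainable, so the
  # posterior can be fit, e.g., by minimizing a negative ELBO.
  print(len(posterior.trainable_variables))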
def make_kernel_bias_prior_spike_and_slab(
    kernel_shape,
    bias_shape,
    kernel_initializer=None,  # pylint: disable=unused-argument
    bias_initializer=None,  # pylint: disable=unused-argument
    kernel_batch_ndims=0,  # pylint: disable=unused-argument
    bias_batch_ndims=0,  # pylint: disable=unused-argument
    dtype=tf.float32,
    kernel_name='prior_kernel',
    bias_name='prior_bias'):
  """Create prior for Variational layers with kernel and bias.

  Note: Distribution scale is inversely related to regularization strength.
  Consider a "Normal" prior; bigger scale corresponds to less L2
  regularization. I.e.,

  ```python
  scale    = (2. * l2weight)**-0.5
  l2weight = scale**-2. / 2.
  ```

  relate a `Normal` prior's scale to an L2 regularization weight with a
  similar regularizing effect.

  The std. deviation of each of the component distributions returned by this
  function is approximately `1415` (or approximately `l2weight = 25e-8`). In
  other words this prior is extremely "weak".

  Args:
    kernel_shape: ...
    bias_shape: ...
    kernel_initializer: Ignored.
      Default value: `None` (i.e., `tf.initializers.glorot_uniform()`).
    bias_initializer: Ignored.
      Default value: `None` (i.e., `tf.zeros`).
    kernel_batch_ndims: ...
      Default value: `0`.
    bias_batch_ndims: ...
      Default value: `0`.
    dtype: ...
      Default value: `tf.float32`.
    kernel_name: ...
      Default value: `"prior_kernel"`.
    bias_name: ...
      Default value: `"prior_bias"`.

  Returns:
    kernel_and_bias_distribution: ...
  """
  w = MixtureSameFamily(
      mixture_distribution=Categorical(probs=[0.5, 0.5]),
      components_distribution=Normal(
          loc=0.,
          scale=tf.constant([1., 2000.], dtype=dtype)))
  return JointDistributionSequential([
      Sample(w, kernel_shape, name=kernel_name),
      Sample(w, bias_shape, name=bias_name),
  ])
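

# Sketch (illustrative only): a quick numeric check of the docstring's claim
# about the spread of the spike-and-slab prior above. Each marginal is an
# equal-weight mixture of Normal(0, 1) and Normal(0, 2000), so its variance is
# the average of the component second moments.
def _example_prior_spread_check():
  variance = 0.5 * 1.**2 + 0.5 * 2000.**2  # = 2000000.5
  std = variance ** 0.5                    # ~1414.2
  l2weight = std ** -2. / 2.               # ~2.5e-7, i.e. roughly 25e-8
  print(std, l2weight)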
def make_kernel_bias_prior_spike_and_slab(kernel_shape,
                                          bias_shape,
                                          dtype=tf.float32,
                                          kernel_initializer=None,
                                          bias_initializer=None):
  """Create prior for Variational layers with kernel and bias."""
  del kernel_initializer, bias_initializer
  w = MixtureSameFamily(
      mixture_distribution=Categorical(probs=[0.5, 0.5]),
      components_distribution=Normal(
          loc=0.,
          scale=tf.constant([1., 2000.], dtype=dtype)))
  return JointDistributionSequential([
      Sample(w, kernel_shape, name='prior_kernel'),
      Sample(w, bias_shape, name='prior_bias'),
  ])
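

# Example (sketch, not part of the original module): pairing the spike-and-slab
# prior with the mean-field posterior defined earlier to form a crude,
# single-sample Monte Carlo estimate of the KL term used by variational
# layers. The shapes are made up for illustration.
def _example_prior_posterior_kl():
  prior = make_kernel_bias_prior_spike_and_slab(
      kernel_shape=[3, 5], bias_shape=[5])
  posterior = make_kernel_bias_posterior_mvn_diag(
      kernel_shape=[3, 5], bias_shape=[5])
  draw = posterior.sample()
  kl_estimate = posterior.log_prob(draw) - prior.log_prob(draw)
  print(kl_estimate)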
def random_walk_mvnorm_fn(covariance, pu=0.95, fixed_variance=0.01,
                          is_adaptive=1, name=None):
  """Returns callable that adds Multivariate Normal (MVN) noise to the input.

  Args:
    covariance: Python `list` of `Tensor`s representing each covariance
      matrix, of size d x d, of the Multivariate Normal proposal. The number
      of parameters is d.
    pu: Python floating point number representing the bounded convergence
      parameter. If equal to 1, all proposals are drawn from the
      MVN(0, `covariance`) distribution; if less than 1, proposals are drawn
      from MVN(0, `covariance`) with probability `pu`, and from
      MVN(0, `fixed_variance`/d) otherwise.
      Default value: 0.95.
    fixed_variance: Python floating point number representing the variance of
      the fixed proposal distribution of the form MVN(0, `fixed_variance`/d).
      Default value: 0.01.
    is_adaptive: Python `list` of `Tensor`s representing the type of proposal,
      where for each batch 0 selects the fixed proposal and 1 the adaptive
      proposal.
      Default value: 1.
    name: Python `str` name. Given the default value of `None` the name is
      set to `random_walk_mvnorm_fn`.

  Returns:
    random_walk_mvnorm_fn: A callable accepting a Python `list` of `Tensor`s
      representing the state parts of the `current_state` and an `int`
      representing the random seed used to generate the proposal. The callable
      returns a `list` of `Tensor`s, with the same type as the input state
      parts, representing the proposal for the Metropolis-Hastings algorithm.
  """
  dtype = dtype_util.base_dtype(covariance[0].dtype)
  shape = tf.stack(covariance, axis=0).shape
  # For numerical stability, jitter the diagonal so the covariance matrices
  # are positive definite and the Cholesky factorization succeeds.
  covariance = covariance + 1.0e-9 * tf.eye(
      shape[1], batch_shape=[shape[0]], dtype=dtype)
  scale_tril = tf.linalg.cholesky(covariance)
  rv_adaptive = MultivariateNormalTriL(
      loc=tf.zeros([shape[0], shape[1]], dtype=dtype),
      scale_tril=scale_tril)
  rv_fixed = Normal(
      loc=tf.zeros([shape[0], shape[1]], dtype=dtype),
      scale=tf.constant(fixed_variance, dtype=dtype) / shape[2])

  def _fn(state_parts, seed):
    with tf.name_scope(name or "random_walk_mvnorm_fn"):

      def proposal():
        # For parallel computation it is quicker to sample from both
        # distributions and then select the result.
        rv = tf.stack(
            [rv_fixed.sample(seed=seed), rv_adaptive.sample(seed=seed)],
            axis=1)
        return tf.squeeze(
            tf.gather(rv, is_adaptive, axis=1, batch_dims=1), axis=1)

      proposal_parts = tf.unstack(proposal())
      new_state_parts = [
          proposal_part + state_part
          for proposal_part, state_part in zip(proposal_parts, state_parts)
      ]
      return new_state_parts

  return _fn
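

# Example (sketch, not part of the original module): building the proposal and
# applying it directly to a list of state parts. The covariances, state parts,
# and `is_adaptive` values are placeholders made up for illustration; the
# returned callable has the signature expected by the `new_state_fn` argument
# of `tfp.mcmc.RandomWalkMetropolis`.
def _example_random_walk_mvnorm_fn():
  d = 2
  covariance = [tf.eye(d), 0.5 * tf.eye(d)]      # one d x d matrix per state part
  is_adaptive = tf.ones([2, 1], dtype=tf.int32)  # always pick the adaptive draw
  proposal_fn = random_walk_mvnorm_fn(covariance, is_adaptive=is_adaptive)

  state_parts = [tf.zeros([d]), tf.zeros([d])]
  new_state_parts = proposal_fn(state_parts, seed=42)
  print([part.shape for part in new_state_parts])  # two parts, each of shape (d,)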